R/ledger-lib.R

Defines functions maxWithoutNA

#' Max without NA
#'
#' Function that returns NA if all elements are NA, and the max value not NA, if not.
#'
#' @param x vector parameter
#' @noRd
maxWithoutNA <- function(x) ifelse(!all(is.na(x)), max(x, na.rm = TRUE), NA)


#' Scraper ledger
#'
#' @description
#' Ledger of scraping status of each objects. Allows different type of states:
#' queued, scraping, scraped, failed, exception, skipped
#'
#' For flexible and reproducible configuration for package development
#'
#' @section Methods:
#' \describe{
#'   \item{\code{getCRCPolyhedronName(source, polyheron.name)}}{Returns CRC of the polyhedron.name for storing in db folder}
#'   \item{\code{updateStatus(source, source.filename, status, status.field = 'status', scraped.polyhedron = NA, obs ='')}}{Updates status of source and filenames parameters in Ledger }
#'   \item{\code{savePreloadedData()}}{Internal method which saves a file with an estimation of time required time to scrape each filename}
#'   \item{\code{loadPreloadedData()}}{Load a file with an estimation of time required time to scrape each filename}
#'   \item{\code{getSizeToTimeScrape(sources, time2scrape = 60)}}{Estimates how much filenames could be scraped in a time frame, considering data retrieved with loadPreloadedData}
#'   \item{\code{resetStatesMetrics()}}{Reset metrics of application of different status values}
#'   \item{\code{countStatusUse(status.field,status)}}{Add an use to the metrics of status.field and status parameters}
#'   \item{\code{getFilenamesStatusMode(mode,sources = sort(unique(self$df$source)),max.quant = 0,order.by.vertices.faces = FALSE)}}{Get a list of the filenames in the ledger with a defined mode (status agrupation)}
#'   \item{\code{getFilenamesStatus(status,sources = sort(unique(self$df$source)),max.quant = 0,order.by.vertices.faces = FALSE)}}{Get a list of the filenames in the ledger with specified status}
#' }
#'
#' @format \code{\link{R6Class}} object.
#' @import lgr
#' @importFrom utils read.csv
#' @importFrom digest digest
#' @importFrom R6 R6Class
#' @noRd
ScraperLedger <- R6::R6Class("ScraperLedger",
  public = list(
    states = NULL,
    df = NA,
    dirty = FALSE,
    preloaded.data.filename = NA,
    preloaded.data = NA,
    #' @field logger class logger
    logger = NA,
    #' @description
    #' initializes the object
    initialize = function() {
      self$df <- data.frame(
        id = character(),
        source = character(),
        file.id = numeric(),
        source.filename = character(),
        start.scrape = as.POSIXct(character()),
        end.scrape = as.POSIXct(character()),
        status = character(),
        scraped.name = character(),
        symbol = character(),
        scraped.vertices = numeric(),
        scraped.faces = numeric(),
        status.test = character(),
        obs = character(),
        git.commit = character(),
        crc.filename = character(),
        time.scraped = numeric(),
        preloaded.name = character(),
        preloaded.vertices = numeric(),
        preloaded.faces = numeric(),
        preloaded.time2scrape = numeric(),
        stringsAsFactors = FALSE
      )
      self$resetStatesMetrics()
      self$loadPreloadedData()
      self$logger <- genLogger(self)
      self
    },
    #' @description
    #' Gets the available sources
    getAvailableSources = function() {
      sort(unique(self$df$source))
    },
    #' @description
    #' Adds filename to the ledger
    #' @param source the source to add the filename to
    #' @param source.filename the filename to add to the source
    addFilename = function(source, source.filename) {
      logger <- getLogger(self)
      r <- NULL
      default.status <- "queued"
      if (is.null(self$getIdFilename(source, source.filename))) {
        logger$debug(
          "Adding Filename to ledger ",
          source = source,
          source.filename = source.filename
        )
        r <- nrow(self$df) + 1
        status.field <- "status"
        self$countStatusUse(
          status.field = status.field,
          status = default.status
        )
        states.row <- which(self$states$status.field %in% status.field &
          self$states$status %in% default.status)
        file.id <- self$states[states.row, "count"]

        # obtain preloaded.time2scrape
        row.preloaded.t2s <- which(self$preloaded.data$source == source &
          self$preloaded.data$filename == source.filename)
        if (length(row.preloaded.t2s) == 1) {
          preloaded.data <- self$preloaded.data[row.preloaded.t2s, ]
          self$df[r, c("preloaded.name")] <-
            as.character(preloaded.data[, 3])
          self$df[r, c(
            "preloaded.vertices",
            "preloaded.faces",
            "preloaded.time2scrape"
          )] <-
            c(preloaded.data[, 4:6])
        } else {
          preloaded.time2scrape <- NA
        }
        self$df[r, c(
          "id",
          "file.id",
          "source",
          "source.filename",
          "status"
        )] <- c(
          r,
          file.id,
          source,
          source.filename,
          default.status
        )

        self$dirty <- TRUE
      }
      r
    },
    #' @description
    #' Returns id/row of source and filenames parameters in the ledger
    #'  @param source the source
    #' @param source.filename the filename
    #' @returns the filename id
    getIdFilename = function(source, source.filename) {
      r <- which(self$df$source == source &
        self$df$source.filename == source.filename)
      if (length(r) > 0) {
        self$df[r, "id"]
      } else {
        r <- NULL
      }
      r
    },
    #' @description
    #' Returns CRC of the polyhedron.name for storing in db folder
    #' @param source the source to add the filename to
    #' @param source.filename the filename to add to the source
    getCRCPolyhedronName = function(source, polyhedron.name) {
      r <- which(self$df$source == source & tolower(self$df$scraped.name)
      == tolower(polyhedron.name))
      ret <- NULL
      if (length(r) > 0) {
        ret <- self$df[r, "crc.filename"]
        if (is.na(ret)) {
          ret <- NULL
        }
      }
      if (is.null(ret)) {
        ret <- digest::digest(tolower(polyhedron.name),
          algo = "crc32"
        )
      }
      ret
    },
    updateStatus = function(source, source.filename,
                            status,
                            status.field = "status",
                            scraped.polyhedron = NA,
                            obs = "") {
      logger <- getLogger(self)
      scraped.name.lower <- ""
      if (is.null(obs)) {
        obs <- ""
      }
      if (!status.field %in% c("status", "status.test")) {
        stop(paste(
          "Cannot update invalid status field",
          status.field
        ))
      }
      ret <- NULL
      retrieved.id <- self$getIdFilename(
        source = source,
        source.filename = source.filename
      )
      if (length(retrieved.id) != 1) {
        stop(paste(
          "There must be a unique row for",
          source,
          source.filename,
          "and have",
          length(retrieved.id)
        ))
      }
      if (status.field == "status") {
        end.scrape <- Sys.time()
        if (status == "scraping") {
          fields.update <- c("start.scrape")
          values.update <- as.character(end.scrape)
        }
        if (status %in% c("scraped", "failed", "skipped", "exception")) {
          start.scrape <- self$df[retrieved.id, "start.scrape"]
          time.scraped <- round(as.numeric(end.scrape - start.scrape), 3)

          fields.update <- c("end.scrape", "git.commit")
          values.update <- c(as.character(end.scrape), getGitCommit())
          fields.numeric.update <- c("time.scraped")
          values.numeric.update <- c(time.scraped)
          # in a different commands for not converting it to character
          if (status %in% "scraped") {
            scraped.name <- scraped.polyhedron$getName()
            symbol <- scraped.polyhedron$getState()$getSymbol()
            scraped.vertices <- nrow(scraped.polyhedron$getState()$
              getVertices(solid = TRUE))
            scraped.faces <- length(scraped.polyhedron$getState()$getSolid())
            crc.filename <- self$getCRCPolyhedronName(
              source = source,
              polyhedron.name = scraped.name
            )
            # Check scraped.name = preloaded name
            error <- ""
            preloaded.name <- self$df[retrieved.id, "preloaded.name"]
            scraped.name.lower <- tolower(scraped.name)
            existing.polyhedron.name.rows <- which(self$df$source == source &
              self$df$scraped.name == scraped.name.lower)
            if (length(existing.polyhedron.name.rows) > 0) {
              error <- paste(
                error, "Scraped name ", scraped.name.lower,
                "must be unique for source and exists in rows",
                paste(existing.polyhedron.name.rows,
                  collapse = ","
                )
              )
            }

            if (!is.na(preloaded.name)) {
              if (scraped.name.lower != tolower(preloaded.name)) {
                error <- paste(
                  error,
                  "Scraped name is",
                  scraped.name,
                  "and preloaded name is",
                  preloaded.name
                )
              }
            }
            if (nchar(error) > 0) {
              stop(error)
            }

            fields.update <- c(
              fields.update,
              "scraped.name",
              "symbol",
              "crc.filename"
            )
            values.update <- c(
              values.update,
              scraped.name.lower,
              symbol,
              crc.filename
            )
            fields.numeric.update <- c(
              fields.numeric.update,
              "scraped.vertices",
              "scraped.faces"
            )
            values.numeric.update <- c(
              values.numeric.update,
              scraped.vertices,
              scraped.faces
            )
          }
          self$df[retrieved.id, fields.numeric.update] <-
            values.numeric.update
        }
      }
      if (status.field == "status.test") {
        # status.test only possible value
        if (status %in% c("tested", "testing", "failed")) {
          fields.update <- NULL
          values.update <- NULL
        }
      }
      fields.update <- c(fields.update, status.field, "obs")
      values.update <- c(values.update, status, obs)

      logger$debug(
        "Updating ledger for",
        source.filename = source.filename,
        scraped.name.lower = scraped.name.lower,
        polyhedron.values = paste(fields.update, values.update,
          sep = "=",
          collapse = "|"
        )
      )
      self$df[retrieved.id, fields.update] <- values.update
      ret <- self$df[retrieved.id, ]
      # count status uses
      self$countStatusUse(status.field, status)
      self$dirty <- TRUE
      ret
    },
    updateCalculatedFields = function() {
      resolveScrapedPreloaded <- function(x, field) {
        maxWithoutNA(c(
          x[paste("scraped", field, sep = ".")],
          x[paste("preloaded", field, sep = ".")]
        ))
      }
      if (!"vertices" %in% names(self$df)) {
        self$dirty <- TRUE
      }
      if (self$dirty) {
        self$df$vertices <- as.numeric(apply(self$df,
          MARGIN = 1,
          FUN = function(x) {
            resolveScrapedPreloaded(
              x = x,
              field = "vertices"
            )
          }
        ))
        self$df$faces <- as.numeric(apply(self$df,
          MARGIN = 1,
          FUN = function(x) {
            resolveScrapedPreloaded(
              x = x,
              field = "faces"
            )
          }
        ))
        self$df$faces <- as.numeric(apply(self$df,
          MARGIN = 1,
          FUN = function(x) {
            resolveScrapedPreloaded(
              x = x,
              field = "faces"
            )
          }
        ))

        self$dirty <- FALSE
      }
    },
    getAvailablePolyhedra = function(sources = names(
                                       getUserEnvir(".available.sources")
                                     ),
                                     search.string = "",
                                     ret.fields = c(
                                       "source", "scraped.name", "symbol", "vertices",
                                       "faces", "status"
                                     ),
                                     ignore.case = TRUE) {
      self$updateCalculatedFields()
      if (is.null(ret.fields)) {
        ret.fields <- seq_len(ncol(self$df))
      }
      ret <- self$df[
        !is.na(self$df$scraped.name) & self$df$source %in% sources,
        ret.fields
      ]
      if (!is.null(search.string)) {
        ret <- ret[grepl(search.string,
          ret$scraped.name,
          ignore.case = ignore.case
        ), ]
      }
      ret <- ret[order(ret$vertices, ret$faces, ret$source), ]
      ret
    },
    savePreloadedData = function() {
      preloaded.data <- self$df[
        !is.na(self$df$time.scraped),
        c(
          "source",
          "source.filename",
          "scraped.name",
          "scraped.vertices",
          "scraped.faces",
          "time.scraped"
        )
      ]
      preloaded.data <- preloaded.data[order(
        preloaded.data$time.scraped,
        preloaded.data$source,
        preloaded.data$filename
      ), ]
      names(preloaded.data)[3:6] <- c(
        "name", "vertices",
        "faces", "time2scrape"
      )
      write.csv(preloaded.data, self$preloaded.data.filename,
        row.names = FALSE
      )
      preloaded.data
    },
    loadPreloadedData = function() {
      self$preloaded.data.filename <- getPreloadedDataFilename()
      self$preloaded.data <- utils::read.csv(self$preloaded.data.filename)
      self$preloaded.data
    },
    getSizeToTimeScrape = function(sources, time2scrape = 60) {
      pre.comp.source <- self$preloaded.data[self$preloaded.data$source %in%
        sources, ]
      pre.comp.source <- pre.comp.source[order(
        pre.comp.source$vertices,
        pre.comp.source$faces,
        pre.comp.source$source,
        pre.comp.source$filename
      ), ]

      pre.comp.source$cummsum <- cumsum(pre.comp.source$time2scrape)
      length(which(pre.comp.source$cummsum < time2scrape))
    },
    resetStatesMetrics = function() {
      self$states <- data.frame(
        status.field = character(),
        status = character(),
        count = numeric(),
        stringsAsFactors = FALSE
      )
      self$states
    },
    countStatusUse = function(status.field, status) {
      status.row <- which(self$states$status.field %in% status.field &
        self$states$status %in% status)
      if (length(status.row) == 0) {
        status.row <- nrow(self$states) + 1
        count <- 1
      } else {
        count <- self$states[status.row, "count"] + 1
      }
      self$states[status.row, c("status.field", "status")] <-
        c(status.field, status)
      self$states[status.row, "count"] <- count
      self$states
    },
    getFilenamesStatusMode = function(mode,
                                      sources = sort(unique(self$df$source)),
                                      max.quant = 0,
                                      order.by.vertices.faces = FALSE) {
      # status in queued, scraped, exception, retry, skipped
      allowed.status <- NULL
      if (mode == "scrape.retry") {
        allowed.status <- c("queued", "exception", "failed")
      }
      if (mode == "scrape.queued") {
        allowed.status <- c("queued", "scraping")
      }
      if (mode == "test") {
        allowed.status <- "scraped"
      }
      if (mode == "skipped") {
        # special mode skipped for intentionally avoid scraping
        allowed.status <- c("skipped", "scraping")
      }
      self$getFilenamesStatus(
        status = allowed.status,
        sources = sources,
        max.quant = max.quant,
        order.by.vertices.faces = order.by.vertices.faces
      )
    },
    getFilenamesStatus = function(status,
                                  sources = sort(unique(self$df$source)),
                                  max.quant = 0,
                                  order.by.vertices.faces = FALSE) {
      self$updateCalculatedFields()
      filtred.rows <- which(self$df$source %in% sources &
        self$df$status %in% status)
      ret <- NULL
      if (length(filtred.rows) > 0) {
        if (max.quant > 0) {
          filtred.rows <- filtred.rows[1:min(
            max.quant,
            length(filtred.rows)
          )]
        }
        ret <- self$df[filtred.rows, ]
        if (order.by.vertices.faces) {
          ret <- ret[order(
            ret$vertices, ret$faces,
            ret$source,
            ret$scraped.name
          ), ]
        }
      }
      ret
    }
  )
)
qbotics/Rpolyhedra documentation built on Jan. 26, 2024, 1 a.m.