R/class-plotdata-scatter.R

Defines functions scattergl scattergl.dt validateScatterPD newScatterPD

Documented in scattergl scattergl.dt

newScatterPD <- function(.dt = data.table::data.table(),
                         variables = veupathUtils::VariableMetadataList(),
                         value = character(),
                         useGradientColorscale = FALSE,
                         overlayValues = veupathUtils::BinList(),
                         idColumn = character(),
                         returnPointIds = logical(),
                         correlationMethod = character(),
                         sampleSizes = logical(),
                         completeCases = logical(),
                         evilMode = character(),
                         verbose = logical(),
                         ...,
                         class = character()) {

  .pd <- newPlotdata(.dt = .dt,
                     variables = variables,
                     useGradientColorscale = useGradientColorscale,
                     overlayValues = overlayValues,
                     idColumn = idColumn,
                     returnPointIds = returnPointIds,
                     sampleSizes = sampleSizes,
                     completeCases = completeCases,
                     evilMode = evilMode,
                     verbose = verbose,
                     class = "scatterplot")

  attr <- attributes(.pd)
  variables <- attr$variables

  x <- veupathUtils::findColNamesFromPlotRef(variables, 'xAxis')
  xType <- veupathUtils::findDataTypesFromPlotRef(variables, 'xAxis')
  y <- veupathUtils::findColNamesFromPlotRef(variables, 'yAxis')
  yType <- veupathUtils::findDataTypesFromPlotRef(variables, 'yAxis')
  group <- veupathUtils::findColNamesFromPlotRef(variables, 'overlay')
  panel <- findPanelColName(veupathUtils::findVariableSpecFromPlotRef(variables, 'facet1'), 
                            veupathUtils::findVariableSpecFromPlotRef(variables, 'facet2'))
  # If we ask for the point ids, ensure the column is present. Otherwise set to null. 
  if (returnPointIds) {
    if (!is.null(idColumn) && idColumn %in% names(.dt)) {
      idCol <- 'pointIds' # Will become the name of the id column in the output.
    } else {
      stop("idColumn not found or not supplied. Supply proper idColumn if returnPointIds is TRUE.")
    }
  } else {
    idCol <- NULL
  }
  
  dtForCorr <- data.table::as.data.table(.pd)

  if (useGradientColorscale) {
    #series data w gradient
    .pd$overlayMissingData <- is.na(.pd[[group]])
    series <- collapseByGroup(.pd, group = 'overlayMissingData', panel)
    .pd$overlayMissingData <- NULL
    series$overlayMissingData <- NULL
    data.table::setnames(series, c(panel, 'seriesX', 'seriesY', 'seriesGradientColorscale', idCol))

    # corr results w gradient, same as w/o groups so set group to NULL
    dtForCorr[[group]] <- NULL
    if (correlationMethod != 'none') {
      corrResult <- groupCorrelation(dtForCorr, x, y, NULL, panel, correlationMethod = correlationMethod)
    }
  } else {
    #series data w/o gradient
    series <- collapseByGroup(.pd, group, panel)
    data.table::setnames(series, c(group, panel, 'seriesX', 'seriesY', idCol))

    # corr results w/o gradient
    if (correlationMethod != 'none') {
      corrResult <- groupCorrelation(dtForCorr, x, y, group, panel, correlationMethod = correlationMethod)
    }
  }
  veupathUtils::logWithTime('Calculated correlation results per group.', verbose)

  if (xType == 'DATE') {
    series$seriesX <- lapply(series$seriesX, format, '%Y-%m-%d')
  } else {
    series$seriesX <- lapply(series$seriesX, as.character)
  }
  if (class(series$seriesX) != 'list') series$seriesX <- list(list(series$seriesX))

  if (yType == 'DATE') {
    series$seriesY <- lapply(series$seriesY, format, '%Y-%m-%d')
  } else {
    series$seriesY <- lapply(series$seriesY, as.character)
  }
  if (class(series$seriesY) != 'list') series$seriesY <- list(list(series$seriesY))


  if (useGradientColorscale) {
    if (identical(veupathUtils::findDataTypesFromPlotRef(variables, 'overlay'),'DATE')) {
      series$seriesGradientColorscale <- lapply(series$seriesGradientColorscale, format, '%Y-%m-%d')
    } else {
      series$seriesGradientColorscale <- lapply(series$seriesGradientColorscale, as.character)
    }
    if (class(series$seriesGradientColorscale) != 'list') series$seriesGradientColorscale <- list(list(series$seriesGradientColorscale))
  }
  
  veupathUtils::logWithTime('Collected raw scatter plot data.', verbose)

  if (value == 'smoothedMean') {
    
    smoothedMean <- groupSmoothedMean(.pd, x, y, group, panel)
    .pd <- smoothedMean
    veupathUtils::logWithTime('Calculated smoothed means.', verbose)

  } else if (value == 'smoothedMeanWithRaw') {
    
    smoothedMean <- groupSmoothedMean(.pd, x, y, group, panel)
    if (!is.null(key(series))) {
      .pd <- merge(series, smoothedMean)
    } else {
      .pd <- cbind(series, smoothedMean)
    }
    veupathUtils::logWithTime('Calculated smoothed means.', verbose)

  } else if (value == 'bestFitLineWithRaw') {

    bestFitLine <- groupBestFitLine(.pd, x, y, group, panel)
    if (!is.null(key(series))) {
      .pd <- merge(series, bestFitLine)
    } else {
      .pd <- cbind(series, bestFitLine)
    }
    veupathUtils::logWithTime('Calculated best fit line.', verbose)

  } else if (value == 'density') {
    
    # Note, density is not implemented in production code.
    density <- groupDensity(.pd, NULL, x, group, panel)
    .pd <- density
    veupathUtils::logWithTime('Kernel density estimate calculated from raw data.', verbose)

  } else {
    .pd <- series
  }

  if (correlationMethod != 'none') {
    if (!is.null(key(.pd))) {
      .pd <- merge(.pd, corrResult)
    } else {
      .pd <- cbind(.pd, corrResult)
    }

    attr$correlationMethod <- jsonlite::unbox(correlationMethod)
  }

  attr$names <- names(.pd)
  if (useGradientColorscale) attr$useGradientColorscale <- useGradientColorscale

  veupathUtils::setAttrFromList(.pd, attr)

  return(.pd)
}

validateScatterPD <- function(.scatter, verbose) {
  variables <- attr(.scatter, 'variables')
  xShape <- veupathUtils::findDataShapesFromPlotRef(variables, 'xAxis')
  if (!xShape %in% c('CONTINUOUS')) {
    stop('The independent axis must be continuous for scatterplot.')
  }
  yShape <- veupathUtils::findDataShapesFromPlotRef(variables, 'yAxis')
  if (!yShape %in% c('CONTINUOUS')) {
    stop('The dependent axis must be continuous for scatterplot.')
  }
  veupathUtils::logWithTime('Scatter plot request has been validated!', verbose)

  return(.scatter)
}

#' Scatter Plot as data.table
#'
#' This function returns a data.table of  
#' plot-ready data with one row per group (per panel). Columns 
#' 'seriesX' and 'seriesY' contain the raw data for the 
#' scatter plot. Column 'group' and 'panel' specify the group the 
#' series data belongs to. Optionally, columns 'smoothedMeanX', 
#' 'smoothedMeanY' and 'smoothedMeanSE' specify the x, y and 
#' standard error respectively of the smoothed conditional mean 
#' for the group. Columns 'densityX' and 'densityY' contain the 
#' calculated kernel density estimates. Column 
#' 'seriesGradientColorscale' contains values to be used with a 
#' gradient colorscale when plotting.
#' 
#' @section Evil Mode:
#' An `evilMode` exists. It will do the following: \cr
#' - when `strataVariables` it will return 'no data' as a regular value for strata vars but will discard such cases for the axes vars. \cr
#' - when `allVariables` it will return 'no data' as a regular value for all variables. \cr
#' - when `noVariables` it will do the sensible thing and return complete cases only. \cr
#' - not return statsTables \cr
#' - allow smoothed means and agg values etc over axes values where we have no data 
#' for the strata vars \cr
#' - return a total count of plotted incomplete cases \cr
#' - represent missingness poorly, conflate the stories of completeness and missingness, 
#' mislead you and steal your soul \cr
#' @param data data.frame to make plot-ready data for
#' @param variables veupathUtil::VariableMetadataList
#' sourceId and its position in the plot. Recognized plotRef values are 'xAxisVariable', 
#' 'yAxisVariable', 'overlayVariable', 'facetVariable1' and 'facetVariable2'
#' @param value character indicating whether to calculate 'smoothedMean', 'bestFitLineWithRaw'
#'  or 'density' estimates (no raw data returned), alternatively 'smoothedMeanWithRaw' 
#' to include raw data with smoothed mean. Note only 'raw' is compatible with a continuous 
#' overlay variable.
#' @param overlayValues veupathUtils::BinList providing overlay values of interest
#' @param correlationMethod character indicating which correlation method to use. One of 'pearson', 
#' 'spearman', 'sparcc' or 'none'. Default is 'none'.
#' @param sampleSizes boolean indicating if sample sizes should be computed
#' @param completeCases boolean indicating if complete cases should be computed
#' @param evilMode String indicating how evil this plot is ('strataVariables', 'allVariables', 'noVariables') 
#' @param idColumn character indicating the column name of the id variable in data
#' @param returnPointIds boolean indicating if any point ids should be returned with the scatterplot data.
#' This value will only be used when idColumn is present.
#' @param verbose boolean indicating if timed logging is desired
#' @return data.table plot-ready data
#' @examples
#' # Construct example data
#' df <- data.table('entity.xvar' = rnorm(100),
#'                  'entity.yvar' = rnorm(100),
#'                  'entity.overlay' = sample(c('red','green','blue'), 100, replace=T), stringsAsFactors = F)
#' 
#' # Create VariableMetadataList that specifies variable role in the plot and supplies variable metadata
#' variables <- veupathUtils::VariableMetadataList(
#'   veupathUtils::VariableMetadata(
#'     variableClass = veupathUtils::VariableClass(value = 'native'),
#'     variableSpec = veupathUtils::VariableSpec(variableId = 'xvar', entityId = 'entity'),
#'     plotReference = veupathUtils::PlotReference(value = 'xAxis'),
#'     dataType = veupathUtils::DataType(value = 'NUMBER'),
#'     dataShape = veupathUtils::DataShape(value = 'CONTINUOUS')
#'   ),
#'   veupathUtils::VariableMetadata(
#'     variableClass = veupathUtils::VariableClass(value = 'native'),
#'     variableSpec = veupathUtils::VariableSpec(variableId = 'overlay', entityId = 'entity'),
#'     plotReference = veupathUtils::PlotReference(value = 'overlay'),
#'     dataType = veupathUtils::DataType(value = 'STRING'),
#'     dataShape = veupathUtils::DataShape(value = 'CATEGORICAL')
#'   ),
#'   veupathUtils::VariableMetadata(
#'     variableClass = veupathUtils::VariableClass(value = 'native'),
#'     variableSpec = veupathUtils::VariableSpec(variableId = 'yvar', entityId = 'entity'),
#'     plotReference = veupathUtils::PlotReference(value = 'yAxis'),
#'     dataType = veupathUtils::DataType(value = 'NUMBER'),
#'     dataShape = veupathUtils::DataShape(value = 'CONTINUOUS')
#'   )
#' )
#'  
#' # Returns a data table with plot-ready data
#' dt <- scattergl.dt(df, map, value = 'bestFitLineWithRaw')
#' @export
scattergl.dt <- function(data, 
                         variables, 
                         value = c('smoothedMean', 
                                   'smoothedMeanWithRaw', 
                                   'bestFitLineWithRaw', 
                                   'density', 
                                   'raw'),
                         overlayValues = NULL,
                         correlationMethod = c('none','pearson', 'sparcc', 'spearman'),
                         sampleSizes = c(TRUE, FALSE),
                         completeCases = c(TRUE, FALSE),
                         evilMode = c('noVariables', 'allVariables', 'strataVariables'),
                         collectionVariablePlotRef = NULL,
                         computedVariableMetadata = NULL,
                         idColumn = NULL,
                         returnPointIds = c(FALSE, TRUE),
                         verbose = c(TRUE, FALSE)) {
  
  if (!inherits(variables, 'VariableMetadataList')) stop("The `variables` argument must be a VariableMetadataList object.")
  value <- veupathUtils::matchArg(value)
  correlationMethod <- veupathUtils::matchArg(correlationMethod)
  sampleSizes <- veupathUtils::matchArg(sampleSizes)
  completeCases <- veupathUtils::matchArg(completeCases)
  evilMode <- veupathUtils::matchArg(evilMode)
  verbose <- veupathUtils::matchArg(verbose)
  returnPointIds <- veupathUtils::matchArg(returnPointIds)

  if (!'data.table' %in% class(data)) {
    data.table::setDT(data)
  }

  xVM <- veupathUtils::findVariableMetadataFromPlotRef(variables, 'xAxis')
  if (is.null(xVM)) {
    stop("Must provide x-axis variable for plot type scatter.")
  } else {
    if (!xVM@dataType@value %in% c('NUMBER','INTEGER') & value == 'density') {
      stop('Density curves can only be provided for numeric independent axes.')
    }
  }

  yVM <- veupathUtils::findVariableMetadataFromPlotRef(variables, 'yAxis')
  collectionVM <- veupathUtils::findCollectionVariableMetadata(variables)
  if (is.null(yVM)) {
    if (is.null(collectionVM)) {
      stop("Must provide y axis variable for plot type scatter when no collection variable is provided.")
    }
  } else {
    if (!yVM@dataType@value %in% c('NUMBER', 'INTEGER') & value != 'raw') {
      stop('Trend lines can only be provided for numeric dependent axes.')
    }
  }

  # If returnPointIds is TRUE, require that the idColumn is present in the data.
  if (returnPointIds) {
    if (is.null(idColumn) || !(idColumn %in% names(data))) {
      stop("idColumn not found or not supplied. Supply proper idColumn if returnPointIds is TRUE.")
    }
  }

  groupVM <- veupathUtils::findVariableMetadataFromPlotRef(variables, 'overlay')
  # Decide if we should use a gradient colorscale
  # For now the decision is handled internally. Eventually we may allow for this logic to be overridden and it can be a function arg.
  useGradientColorscale <- FALSE
  if (!is.null(groupVM) && !groupVM@isCollection) {
    groupColName <- veupathUtils::getColName(groupVM@variableSpec)
    if (identical(groupVM@dataShape@value, 'CONTINUOUS') && data.table::uniqueN(data[[groupColName]]) > 8) useGradientColorscale <- TRUE
  }

  if (useGradientColorscale && value != 'raw') {
    stop('Gradient colorscales cannot be used with trend lines.')
  }

  # Handle collectionVars
  if (!is.null(collectionVM)) {
    if (!collectionVM@plotReference@value %in% c('overlay', 'facet1', 'facet2')) stop('Collection variable PlotReference must be either overlayVariable, facetVariable1, or facetVariable2 for scatter.')
  }

  .scatter <- newScatterPD(.dt = data,
                            variables = variables,
                            value = value,
                            useGradientColorscale = useGradientColorscale,
                            overlayValues = overlayValues,
                            correlationMethod = correlationMethod,
                            idColumn = idColumn,
                            returnPointIds = returnPointIds,
                            sampleSizes = sampleSizes,
                            completeCases = completeCases,
                            inferredVarAxis = 'y',
                            evilMode = evilMode,
                            verbose = verbose)

  .scatter <- validateScatterPD(.scatter, verbose)
  veupathUtils::logWithTime(paste('New scatter plot object created with parameters value =', value,
                                                                                ', correlationMethod =', correlationMethod,
                                                                                ', sampleSizes = ', sampleSizes,
                                                                                ', completeCases = ', completeCases,
                                                                                ', evilMode =', evilMode,
                                                                                ', verbose =', verbose), verbose)

  return(.scatter)
}

#' Scatter Plot data file
#'
#' This function returns the name of a json file containing 
#' plot-ready data with one row per group (per panel). Columns 
#' 'seriesX' and 'seriesY' contain the raw data for the 
#' scatter plot. Column 'group' and 'panel' specify the group the 
#' series data belongs to. Optionally, columns 'smoothedMeanX', 
#' 'smoothedMeanY' and 'smoothedMeanSE' specify the x, y and 
#' standard error respectively of the smoothed conditional mean 
#' for the group. Columns 'densityX' and 'densityY' contain the 
#' calculated kernel density estimates. Column 
#' 'seriesGradientColorscale' contains values to be used with a 
#' gradient colorscale when plotting.
#' 
#' @section Evil Mode:
#' An `evilMode` exists. It will do the following: \cr
#' - when `strataVariables` it will return 'no data' as a regular value for strata vars but will discard such cases for the axes vars. \cr
#' - when `allVariables` it will return 'no data' as a regular value for all variables. \cr
#' - when `noVariables` it will do the sensible thing and return complete cases only. \cr
#' - not return statsTables \cr
#' - allow smoothed means and agg values etc over axes values where we have no data for 
#' the strata vars \cr
#' - return a total count of plotted incomplete cases \cr
#' - represent missingness poorly, conflate the stories of completeness and missingness, 
#' mislead you and steal your soul \cr
#' @param data data.frame to make plot-ready data for
#' @param variables veupathUtil::VariableMetadataList 
#' and its position in the plot. Recognized plotRef values are 'xAxisVariable', 'yAxisVariable', 
#' 'overlayVariable', 'facetVariable1' and 'facetVariable2'
#' @param value character indicating whether to calculate 'smoothedMean', 'bestFitLineWithRaw' or 
#' 'density' estimates (no raw data returned), alternatively 'smoothedMeanWithRaw' to include raw 
#' data with smoothed mean. Note only 'raw' is compatible with a continuous overlay variable.
#' @param overlayValues veupathUtils::BinList providing overlay values of interest
#' @param correlationMethod character indicating which correlation method to use. One of 'pearson', 
#' 'spearman','sparcc' or 'none'. Default is 'none'.
#' @param sampleSizes boolean indicating if sample sizes should be computed
#' @param completeCases boolean indicating if complete cases should be computed
#' @param evilMode String indicating how evil this plot is ('strataVariables', 'allVariables', 'noVariables') 
#' @param idColumn character indicating the column name of the id variable in data
#' @param returnPointIds boolean indicating if any point ids should be returned with the scatterplot data.
#' @param verbose boolean indicating if timed logging is desired
#' @return character name of json file containing plot-ready data
#' @examples
#' # Construct example data
#' df <- data.table('entity.xvar' = rnorm(100),
#'                  'entity.yvar' = rnorm(100),
#'                  'entity.overlay' = sample(c('red','green','blue'), 100, replace=T), stringsAsFactors = F)
#' 
#' # Create VariableMetadataList that specifies variable role in the plot and supplies variable metadata
#' variables <- veupathUtils::VariableMetadataList(
#'   veupathUtils::VariableMetadata(
#'     variableClass = veupathUtils::VariableClass(value = 'native'),
#'     variableSpec = veupathUtils::VariableSpec(variableId = 'xvar', entityId = 'entity'),
#'     plotReference = veupathUtils::PlotReference(value = 'xAxis'),
#'     dataType = veupathUtils::DataType(value = 'NUMBER'),
#'     dataShape = veupathUtils::DataShape(value = 'CONTINUOUS')
#'   ),
#'   veupathUtils::VariableMetadata(
#'     variableClass = veupathUtils::VariableClass(value = 'native'),
#'     variableSpec = veupathUtils::VariableSpec(variableId = 'overlay', entityId = 'entity'),
#'     plotReference = veupathUtils::PlotReference(value = 'overlay'),
#'     dataType = veupathUtils::DataType(value = 'STRING'),
#'     dataShape = veupathUtils::DataShape(value = 'CATEGORICAL')
#'   ),
#'   veupathUtils::VariableMetadata(
#'     variableClass = veupathUtils::VariableClass(value = 'native'),
#'     variableSpec = veupathUtils::VariableSpec(variableId = 'yvar', entityId = 'entity'),
#'     plotReference = veupathUtils::PlotReference(value = 'yAxis'),
#'     dataType = veupathUtils::DataType(value = 'NUMBER'),
#'     dataShape = veupathUtils::DataShape(value = 'CONTINUOUS')
#'   )
#' )
#'  
#' # Returns the name of a json file
#' scattergl(df, map, value = 'bestFitLineWithRaw')
#' @export
scattergl <- function(data,
                      variables,
                      value = c('smoothedMean', 
                                'smoothedMeanWithRaw', 
                                'bestFitLineWithRaw', 
                                'density', 
                                'raw'),
                      overlayValues = NULL,
                      correlationMethod = c('none','pearson', 'sparcc', 'spearman'),
                      sampleSizes = c(TRUE, FALSE),
                      completeCases = c(TRUE, FALSE),
                      evilMode = c('noVariables', 'allVariables', 'strataVariables'),
                      idColumn = NULL,
                      returnPointIds = c(FALSE, TRUE),
                      verbose = c(TRUE, FALSE)) {

  verbose <- veupathUtils::matchArg(verbose)

  .scatter <- scattergl.dt(data,
                           variables,
                           value = value,
                           overlayValues = overlayValues,
                           correlationMethod = correlationMethod,
                           sampleSizes = sampleSizes,
                           completeCases = completeCases,
                           evilMode = evilMode,
                           idColumn = idColumn,
                           returnPointIds = returnPointIds,
                           verbose = verbose)
                           
  outFileName <- writeJSON(.scatter, evilMode, 'scattergl', verbose)

  return(outFileName)
}
VEuPathDB/plot.data documentation built on Feb. 20, 2025, 6:33 p.m.