R/xrfDBquery.R

Defines functions xrfDBquery

Documented in xrfDBquery

#' Function to get XRF specific data from the database.
#' 
#' This function will perform a query on the database to get XRF analysis 
#' results. The query can be filtered using the arguments of the function to 
#' reduce the size of the data obtained from the database. Further filtering of 
#' the returned data frame can be done in R. The function uses an open database 
#' connection to perform a query and stores the result of the query in a data 
#' frame.
#' 
#' @param conn connection object linked to the IMPROVE or CSN database
#'   
#' @param network character string for the network, \code{IMPROVE} or \code{CSN}
#'   
#' @param minDate character string of the minimum XRF analysis date (inclusive)
#'   
#' @param maxDate character string of the maximum XRF analysis date (inclusive)
#'   
#' @param site character string of the site name to query. For IMPROVE a 
#'   character vector can be used to query multiple sites.
#'   
#' @param sampleDate character string in the format \code{YYYY-mm-dd}. This will
#'   query the database for a filter with the specified sample date. For CSN the
#'   sample date will be the 'IntendedUseDate'. Supercedes \code{minSampleDate} 
#'   and \code{maxSampleDate}.
#'   
#' @param minSampleDate character string of the minimum sample date to query. 
#'   For CSN the sample date is the 'IntendedUseDate'. If a sampleDate is 
#'   entered this argument will be ignored.
#'   
#' @param maxSampleDate character string of the maximum sample date to query. 
#'   For CSN the sample date is the 'IntendedUseDate'. If a sampleDate is 
#'   entered this argument will be ignored.
#'   
#' @param filterId numeric vector of Filter IDs to query.
#'   
#' @param barcodeId character vector of barcodes (ContractorFilterAnalysisId in 
#'   the database). This field doesn't exist in the IMPROVE database, so this 
#'   argument will be ignored if the network = IMPROVE.
#'   
#' @param trayFileSampleId character vector of TrayFileSampleIdent's to query.
#'   
#' @param includeQC character string. \code{yes} indicates QC samples will be 
#'   included with network samples in the query. \code{no} will exclude QC 
#'   samples from the query results. \code{only} will only include QC samples in
#'   the query results.
#'   
#' @param valid boolean. \code{TRUE} will only include valid results in the 
#'   query results. \code{FALSE} will return both valid and invalid results.
#'   
#' @param elements character vector of elements to include in the results (e.g. 
#'   c("Al", "Si", "Ca", "Fe")). \code{all} will include all elements in the 
#'   results.
#'   
#' @param showQuery boolean. If \code{FALSE} (the default), the data frame
#'   created from the database query is returned. If \code{TRUE}, then the SQL
#'   query text generated by dbplyr is returned. This is mostly for debugging
#'   information.
#'   
#' @return data frame of the query results from the database.
#'   
#' @details Use of this function requires some knowledge of the database 
#'   structure for the AQRC databases. The queries produced by this function are
#'   specfically for obtaining results from the XRF instruments (i.e. no HIPS, 
#'   qCarbon, or ions data can be queried using this function). The most common 
#'   search criteria were included in the list of function arguments, however, 
#'   if further filtering of the database results is required, this can easily 
#'   be done in the R environment with the returned data frame. The elements 
#'   over which you can filter is created by the available elements using the 
#'   other filtering criteria in your search, therefore, it could change based 
#'   on the dates you are searching over.
#'   
#' @importFrom dbplyr in_schema
#' 
#' @importFrom bit64 is.integer64
#' 
#' @import dplyr
#' 
#' @examples
#' \dontrun{
#' # A query of the crustal element results from samples analyzed in the month
#' # of May, 2019 from the IMPROVE database.
#' df <- xrfDBquery(poolConn, network="IMPROVE", minDate="2019-05-01",
#'                  maxDate="2019-05-31", elements=c("Al","Si","Ca","Fe"))
#'                  
#' # Another query of all valid sample results (no QC samples) sampled in
#' # September, 2017 in the CSN network.
#' df <- xrfDBquery(poolConn, network="CSN", minSampleDate="2017-09-01",
#'                  maxSampleDate="2017-09-30", includeQC="no")
#'                  
#' # To include both valid and invalid results add \code{valid=FALSE}
#' df <- xrfDBquery(poolConn, network="CSN", minSampleDate="2017-09-01",
#'                  maxSampleDate="2017-09-30", includeQC="no", valid=FALSE)
#'                  
#' # For the CSN network specific filter barcodes can be searched.
#' df <- xrfDBquery(poolConn, network="CSN",
#'                  barcodeId=c("F164145","F162513","F163396"))
#'                  
#' # Both networks can be searched by the TrayFileSampleIdent.
#' df <- xrfDBquery(poolConn, network="IMPROVE",
#'                  trayFileSampleId=c("BIBE1", "SIME1"),
#'                  sampleDate="2018-11-19")
#'                  
#' # Additional filtering can be done via normal R procedures. For instance, to
#' # get the IMPROVE results for May, 2018 from just Froya add filtering after
#' # the function call.
#' df <- xrfDBquery(poolConn, network="IMPROVE", minSampleDate="2018-05-01",
#'                  maxSampleDate="2018-05-31", includeQC="no")
#' froya_results <- df[df$DeviceName=="Froya",]
#' }
#' 
#' @export
#' 

xrfDBquery <- function(conn,
                       network=c("IMPROVE","CSN"),
                       minDate = "2019-01-01",
                       maxDate = as.character(Sys.Date()),
                       site = NULL,
                       sampleDate = NULL,
                       minSampleDate = NULL,
                       maxSampleDate = NULL,
                       filterId = NULL,
                       barcodeId = NULL,
                       trayFileSampleId = NULL,
                       includeQC = c("yes","no","only"),
                       valid = TRUE,
                       elements = "all",
                       showQuery = FALSE) {
    
    # Restrict inputs
    network <- match.arg(network)
    includeQC <- match.arg(includeQC)
    
    # Setup the links to the database tables
    dc <- tbl(conn,dbplyr::in_schema("xrf","DeviceCounts"))
    sa <- tbl(conn,dbplyr::in_schema("xrf","SampleAnalysis")) %>%
        rename(SampleAnalysisId = Id)
    d <- tbl(conn,dbplyr::in_schema("xrf","Device")) %>%
        rename(DeviceId = Id, DeviceName = Name)
    f <- tbl(conn,dbplyr::in_schema("filter","Filters")) %>%
        rename(FilterId = Id)
    sam <- tbl(conn,dbplyr::in_schema("sampler","Samplers")) %>%
        rename(SamplerId = Id)
    
    # Network specific tables and joins
    if(network=="IMPROVE"){
        mod <- tbl(conn,dbplyr::in_schema("module","Modules")) %>%
            rename(SamplerModuleId = Id)
        
        df <- dc %>%
            left_join(sa,by="SampleAnalysisId") %>%
            left_join(d,by="DeviceId") %>%
            left_join(f,by="FilterId") %>%
            left_join(mod,by="SamplerModuleId") %>%
            left_join(sam,by="SamplerId") %>%
            filter(XRFDate >= minDate, XRFDate <= maxDate) %>%
            rename(Site = Name)
    }
    
    if(network=="CSN"){
        site <- tbl(conn,dbplyr::in_schema("sampler","Sites")) %>%
            rename(SiteId = Id)
        
        df <- dc %>%
            left_join(sa,by="SampleAnalysisId") %>%
            left_join(d,by="DeviceId") %>%
            left_join(f,by="FilterId") %>%
            left_join(sam,by="SamplerId") %>%
            left_join(site,by="SiteId") %>%
            filter(XRFDate >= minDate, XRFDate <= maxDate) %>%
            rename(SampleDate = IntendedUseDate,
                   Site = SiteName)
    }
    
    # Restrict element input to just those in the data
    elemList <- df %>% select(Parameter) %>% collect() %>% unique() %>% pull()
    elements <- match.arg(elements,choices=c("all",elemList))
    
    ### Filter the query based on user provided parameter values
    
    # Filter by site
    if(!is.null(site)) {
        df <- df %>% filter(Site==site)
    }
    
    # Filter by specific SampleDate
    if(!is.null(sampleDate)) {
        testDate <- tryCatch(as.Date(sampleDate),
                             error=function(e) {
                                 paste("The sample date entered was incorrect.",
                                       "Please enter a date in the format,",
                                       "YYYY-MM-DD (e.g. 2019-03-14)")
                             })
        # Sanity check the date
        if(as.Date(sampleDate)<'1975-01-01' | as.Date(sampleDate)>Sys.Date()){
            stop(paste("The sample date entered isn't a likely date.",
                       "Please enter a true sampling date."))
        }
        
        df <- df %>% filter(SampleDate == as.Date(sampleDate))
    }
    
    # Filter by SampleDate range
    if(is.null(sampleDate) & !is.null(minSampleDate)) {
        if(is.null(maxSampleDate)) {
            maxSampleDate = Sys.Date()
        } else {
            testDate <- tryCatch(as.Date(maxSampleDate),
                                 error=function(e){
                                     paste("The maximum sample date entered",
                                           "is incorrect. Please enter a date",
                                           "in the format YYYY-MM-DD")
                                 })
        }
        testDate <- tryCatch(as.Date(minSampleDate),
                             error=function(e) {
                                 paste("The minimum sample date entered",
                                       "is incorrect. Please enter a date in",
                                       "the format YYYY-MM-DD")
                             })
        
        df <- df %>% filter(SampleDate >= minSampleDate,
                            SampleDate <= maxSampleDate)
    }
    
    # Filter by filter ID
    if(!is.null(filterId)) {
        df <- df %>% filter(FilterId %in% filterId)
    }
    
    # Filter by Barcode ID for CSN only
    if(network=="CSN" & !is.null(barcodeId)) {
        df <- df %>% filter(ContractorFilterAnalysisId %in% barcodeId)
    }
    
    # Filter by TrayFileSampleIdent
    if(!is.null(trayFileSampleId)) {
        df <- df %>% filter(TrayFileSampleIdent %in% trayFileSampleId)
    }
    
    # Filter by QC samples
    if(includeQC=="no") {
        df <- df %>% filter(!(TrayFileSampleIdent %like% 'QC%'))
    } else if(includeQC=="only") {
        df <- df %>% filter(TrayFileSampleIdent %like% 'QC%')
    }
    
    # Filter only valid results
    if(valid) {
        df <- df %>% filter(AnalysisQcCode == 1)
    }
    
    # Filter results by element (Parameter)
    if(elements != "all") {
        df <- df %>% filter(Parameter %in% elements)
    }
    
    if(showQuery) {
        return(show_query(df))
    }
    
    # Collect results so they are not lazy evaluated.
    df <- df %>% collect()
    
    # Convert integer64 class to integer
    df_mut <- df %>%
        dplyr::mutate_if(bit64::is.integer64, as.integer)
    
    return(df_mut)
}
jgiacomo/AQRCxray documentation built on Nov. 19, 2022, 9:15 p.m.