In leppott/ContDataSumViz: Create Visualizations of Continuous Monitoring Data

# Options
knitr::opts_chunk$set(results='asis', echo=FALSE, warning=FALSE)
# Packages
# library(knitr)
# library(DT)
# library(plotly)
# library(ggplot2)
# library(dplyr)
# Data files
strFile <- file.path(dir_data, file_data)
data.import <- read.csv(strFile)

DATA FILE INFORMATION

 # Report Info
  myReportDate <- format(Sys.Date(),ContData.env$myFormat.Date)
  cat(paste("**Report Date:** ",myReportDate,"\n\n",sep=""))
  myUser <- Sys.getenv("USERNAME")
  cat(paste("**Generated By:** ",myUser,"\n\n",sep=""))

  #filename
  cat("**Filename:** ",file_data,"\n\n",sep="")
  mySiteID <- data.import[1,ContData.env$myName.SiteID]

  #cat(paste("**SiteID:** ",mySiteID,"\n\n",sep=""))

  myNumRecords <- nrow(data.import) 
  # 20170228, mod from records 10 and 11 to half way point

  # myTimeDiff <- difftime(data.import[10,ContData.env$myName.DateTime]
  #,data.import[11,ContData.env$myName.DateTime],units="mins")
  #x <- data.import[,ContData.env$myName.DateTime]
  myT <- strptime(data.import[,ContData.env$myName.DateTime]
                  ,format=ContData.env$myFormat.DateTime)
  myTimeDiff.all <- difftime(myT[-1],myT[-length(myT)],units="mins")
  myTimeDiff <- median(as.vector(myTimeDiff.all),na.rm=TRUE)
  cat(paste("\n\n**Period of Record, Actual:** "
            ,min(data.import[,ContData.env$myName.Date])
            ," to "
            ,max(data.import[,ContData.env$myName.Date])
            ,"\n\n"
            ,sep=""))

  cat(paste("**Recording Interval (estimated):** "
            ,myTimeDiff[1]
            ," minutes\n\n"
            ,sep=""))

   myParameters.ALL     <- ContData.env$myNames.DataFields[ContData.env$myNames.DataFields 
                                                           %in% names(data.import)==TRUE] 
   myParameters.Lab.ALL <- ContData.env$myNames.DataFields.Lab[ContData.env$myNames.DataFields 
                                                               %in% names(data.import)==TRUE]
   # Filter out Discrete
   myParameters     <- myParameters.ALL[!grepl(ContData.env$myPrefix.Discrete,myParameters.ALL)]
   myParameters.Lab <- myParameters.Lab.ALL[
                                           !grepl(ContData.env$myPrefix.Discrete
                                                   ,myParameters.Lab.ALL)]


 #cat("**Parameters:** ",paste(myParameters.Lab,", ",sep=""),"\n\n",sep="")
  # above line not working, preventing pandoc conversion to WORD
  #cat("**Included Parameters:** \n\n")
  cat("**Parameters Included:** ",paste(myParameters.ALL
                                        ,", "
                                        ,sep="")
      ,"\n\n"
      ,sep="")

DATA SUMMARY, OVERALL

# Plot of range of data
df_plot <- unique(data.import[, ContData.env$myName.Date, FALSE])
names(df_plot) <- "Date"
df_plot[, "Date"] <- as.Date(df_plot[, "Date"])
# julian dates
df_plot$Year <- format(df_plot[, "Date"], "%Y")
df_plot$j <- as.numeric(format(df_plot[, "Date"], "%j"))
df_plot$Date_Plot <- as.Date(as.numeric(format(df_plot[, "Date"]
                                                        , "%j")) - 1
                                      , origin = as.Date("2004-01-01"))
# plot
p_years <- ggplot2::ggplot(data = df_plot
                           , ggplot2::aes(y = Year)) +
  ggplot2::scale_x_date(date_labels = "%b%d"
                        , limits = as.Date(c("2004-01-01", "2004-12-31"))
                        , date_breaks = "1 month"
                        , expand = c(0, 0)) + 
  ggplot2::labs(x = "Date"
                , y = "Year"
                , title = "Data by Year and Dates") + 
  ggplot2::geom_line(ggplot2::aes(x = Date_Plot
                                  , y = Year)
                     , size = 2
                     , na.rm = TRUE) +
  ggplot2::theme_bw()
p_years
cat("\n\n")


      # Convert time interval (minutes) to number per day
      records.expected <- round(24*60/as.numeric(myTimeDiff[1]),1)

      cat(paste("Estimated number of records per day is ",records.expected,".",sep=""))
        cat("\n\n")

    cat("Number of records by year and month (with totals).")
    cat("\n")
    myTable <- addmargins(table(data.import[,ContData.env$myName.Yr]
                                ,data.import[,ContData.env$myName.Mo]
                                , useNA = "ifany"))
        print(knitr::kable(myTable, format="markdown", row.names=TRUE))
    cat("\n\n")

  # revert to longer table (20170228)
    cat("Number of records by day and month (with totals).")
    cat("\n")
    myTable <- addmargins(table(data.import[,ContData.env$myName.Day]
                                ,data.import[,ContData.env$myName.Mo]
                                , useNA = "ifany"))
        print(knitr::kable(myTable, format="markdown", row.names=TRUE))
    cat("\n\n") 
#     
#     pandoc.table(myTable.month.day.rec.GT15,style="rmarkdown")
#    
#     
    myTable.month.day.rec <- table(data.import[,ContData.env$myName.Mo]
                                   ,data.import[,ContData.env$myName.Day]
                                   , useNA = "ifany")
#     use apply function to count number of records not equal to the expected value
    cat("\n\n")

    #
    myTable.DateTime.N <- aggregate(data.import[,ContData.env$myName.SiteID]  
                                    ~ data.import[,ContData.env$myName.DateTime]
                                    , data=data.import,FUN=length)
    # myTable <- myTable.DateTime.N[myTable.DateTime.N[,ContData.env$myName.SiteID]!=1,]
    # cat(paste0("Date and time records with more than one entry; N=",nrow(myTable),". Duplicate date.time records are shown below."))
    # cat("\n")
    #   print(knitr::kable(myTable, format="markdown", row.names=TRUE))
    # mask error, 20170307

    # Missing Dates

    myDateRange.Data <- seq(as.Date(min(data.import[,ContData.env$myName.Date]))
                            ,as.Date(max(data.import[,ContData.env$myName.Date]))
                            ,by="day")

    if(exists("fun.myData.DateRange.Start")==TRUE){
      myDateRange.Target <- seq(as.Date(fun.myData.DateRange.Start)
                              ,as.Date(fun.myData.DateRange.End)
                              ,by="day")
    } else {
      myDateRange.Target <- myDateRange.Data
    }

    myDateRange.Target.Missing <- myDateRange.Target[!myDateRange.Target %in% myDateRange.Data]
    cat(paste0("There are "
               , length(myDateRange.Target.Missing)
               , "dates with missing data between the min ("
               , min(myDateRange.Data)
               , ") and max("
               , max(myDateRange.Data)
               , ") for the provided data. \n\n"
               , "Below are the dates:"))
    cat("\n\n")
    print(ifelse(length(myDateRange.Target.Missing)==0,"",myDateRange.Target.Missing))

FLAGS

cat(paste("* **",ContData.env$myFlagVal.Pass,"** = Pass,",sep=""))
cat("\n")
cat(paste("* **",ContData.env$myFlagVal.Suspect,"** = Suspect,",sep=""))  
cat("\n")
cat(paste("* **",ContData.env$myFlagVal.Fail,"** = Fail,",sep=""))
cat("\n")
cat(paste("* **",ContData.env$myFlagVal.NoData,"** = No Data or Not Applicable (NA).",sep=""))

Overall flags are assigned by examining the results of the four tests below.

cat(paste("* **",ContData.env$myFlagVal.Pass,"** = no Fail or Suspect and at least one Pass,",sep=""))
cat("\n")
cat(paste("* **",ContData.env$myFlagVal.Suspect,"** = no Fail and at least one Suspect,",sep="")) 
cat("\n") 
cat(paste("* **",ContData.env$myFlagVal.Fail,"** = at least one Fail,",sep=""))
cat("\n")
cat(paste("* **",ContData.env$myFlagVal.NoData,"** = all tests were Missing Data.",sep=""))

# Overall flags by paramter
# (maybe combine into a single table)
# skip for now
# duplicated in by parameter section for missing data
# 
# 
#      for (j in 1:length(myParameters)) {##FOR.j.START
# #       #
#        #j.num <- match(j,myParameters)
#  #   cat(paste(myParameters.Lab[j.num],"/n/n",sep=""))
#        
#        # ISSUE WITH knitr::kable INSIDE OF LOOPS
#        
#     cat(paste("## ",myParameters[j],"\n\n",sep=""))
#        colnum <- match(paste("Flag",myParameters[j],sep="."),names(data.import))
#        
#        #print(myTable.Flags.Overall <- ftable(data.import[,colnum]))
#        myTable <- addmargins(table(data.import[,colnum]
#                                    , useNA = "ifany")
#                              ,1)
#        print(knitr::kable(t(as.matrix(myTable)), format = "markdown"))
#        # future mod, 1,3,4,9 to Pass, Suspect, Fail, NA
#        # exclude=NA to get all
#        # future mod, 2nd line in table to include Percent of row total
#        # future mod, sort levels
#        
#        cat("\n\n")
#        
#      }##FOR.j.END

MISSING DATA, BY PARAMETER {.tabset}

Number of expected values is estimated but this assumes equal sampling effort in all cases. This may not be the case for many data files.

# 3.1. Number of Records (revised 20170228)
 # Convert time interval (minutes) to number per day
records.expected <- round(24*60/as.numeric(myTimeDiff[1]),1)
cat(paste("The estimated number of records per day is ",records.expected,".",sep=""))
cat("\n\n")

The number of existing values will be shown in a heat map and the number of dates with not the expected number of records is shown in a table.

# DT::datatable and plotly objects can only have one per chunk
# this is why they don't render when generate them in a loop

# 3. Data Info, by Parameter
    # Individual Data Type Report
    # QC
    i <- myParameters[1]
     for (i in myParameters) {##FOR.i.START
#       #
       i.num <- match(i,myParameters)
       myTitle.Sub <- myParameters.Lab[i.num]

       i.flag <- paste(ContData.env$myName.Flag, i, sep = ".")

      cat("## ",i,"\n\n",sep="")

       #cat(paste("**QC TESTS,",myTitle.Sub,sep=""))

       # 3.1. Flags, overall
       #cat(myTitle.Sub <- "### Flags")
             cat("\n\n")
      cat("#### Flag summary, overall number of records.")
      cat("\n\n")
       myTable <- addmargins(table(data.import[, i.flag]
                                   , useNA = "ifany")
                             ,1)
       print(knitr::kable(t(as.matrix(myTable)), format = "markdown"))

      cat("\n\n")

      #

      #
      cat("Number of records by year and month (with totals)")
      cat("\n")
      myTable <- addmargins(table(data.import[,ContData.env$myName.Yr]
                                  ,data.import[,ContData.env$myName.Mo]
                                  , useNA = "ifany"))
      print(knitr::kable(myTable, format="markdown", row.names=TRUE))
      cat("\n\n")
      #

      df_base <- data.import %>%
        dplyr::filter(!!as.name(i.flag) == "P") %>%
        dplyr::group_by(!!as.name(ContData.env$myName.MoDa)
                 , !!as.name(ContData.env$myName.Yr)) %>%
        dplyr::summarise(n = dplyr::n(), .groups = "drop")
      # munge
      df_base$MonthDay <- sprintf("%04d", df_base$MonthDay)
      # change class
      df_base$MonthDay <- as.character(df_base$MonthDay)
      df_base$Year <- as.character(df_base$Year)

      # Plot, heat map
      str_title_hm <- paste0("Records with 'P'assing flags, ", mySiteID, ", ", i)
      p_hm <- ggplot2::ggplot(df_base
                              , ggplot2::aes(x = !!as.name(ContData.env$myName.Yr)
                                          , y = !!as.name(ContData.env$myName.MoDa)
                                          , fill = n)) +
        ggplot2::geom_tile() +
        ggplot2::scale_fill_gradient(low = "orange", high = "dark green") +
        ggplot2::labs(title = str_title_hm) +
        ggplot2::scale_y_discrete(limits = rev)
      # display as plotly
      print(p_hm)
      #plotly::ggplotly(p_hm)
     # print(p_hm_ly)

      # without plotly use
      #scale_y_discrete( guide = ggplot2::guide_axis(check.overlap = TRUE))


      cat("\n\n")

      # table of dates with missing data
      df_missing <- df_base %>%
        dplyr::filter(n != records.expected) %>%
        dplyr::select(Year, MonthDay, n) %>%
        dplyr::arrange(Year, MonthDay)
      cat <- paste0("Dates with Potential Missing Records (expected = "
                    , records.expected
                    , ").")
      # print(DT::datatable(df_missing
      #               , filter = "top"
      #               , caption = cat
      #               , options = list(scrollX = TRUE
      #                                , scrollY = TRUE
      #                                , pageLength = 30
      #                                , lengthMenu = c(15, 30, 60, 90, 120, 180, 366)
      #                                , autoWidth = TRUE
      #                                )))
      cat("\n\n")
      print(knitr::kable(df_missing, caption = cat))
      cat("\n\n")
      # cat(knitr::knit_print(DT::datatable(mtcars)))


      cat("Number of records by day and month (with totals)")
      cat("\n")
      myTable <- addmargins(table(data.import[,ContData.env$myName.Day]
                                  ,data.import[,ContData.env$myName.Mo]
                                  , useNA = "ifany"))
      #table, kable (static)
      #print(knitr::kable(myTable, format="markdown", row.names=TRUE))

      # table, DT (interactive)
      DT::datatable(as.data.frame.matrix(myTable)
                    , filter = "top"
                    , caption = "Records by day and month"
                    , options = list(scrollX = TRUE
                                     , scrollY = TRUE
                                     , lengthMenu = c(10, 20, 32)
                                     , autoWidth = TRUE
                                     ))



      cat("\n\n") 
      #
      # identify days/months where not the expected number of records
      # (expect first and last day)
  #    print("days where not the expected number of records")
      #

      #~~~~~~~~~~~~
      # New, 20220616
      # OTHER


      # Pivot
      df_plot <- tidyr::pivot_wider(df_base
                             , names_from = ContData.env$myName.Yr
                             , values_from = "n")

      cat("\n\n")
      #

      # table, DT (interactive)
      # add color code (https://rstudio.github.io/DT/010-style.html)
      brks <- quantile(df_plot[, -1], probs = seq(.05, .95, .05), na.rm = TRUE)
      clrs <- round(seq(255, 40, length.out = length(brks) + 1), 0) %>%
                      {paste0("rgb(255,", ., ",", ., ")")}


      cat <- "Records by month-day and year"
      print(knitr::kable(df_plot, caption = cat))

      DT::datatable(df_plot
                    , filter = "top"
                    , caption = cat
                    , options = list(scrollX = TRUE
                                     , scrollY = TRUE
                                     , pageLength = 30
                                     , lengthMenu = c(15, 30, 60, 90, 120, 180, 366)
                                     , autoWidth = TRUE
                                     )) 
      # coloring doesn't look right
      # %>%
      #   formatStyle(names(df_plot)[-1]
      #                , backgroundColor = styleInterval(brks, clrs))
      # 
      cat("\n\n")



     }##FOR.i.END