tests/testthat/test-performance.R

context("Datastep Performance Tests")

base_path <- "c:\\packages\\libr\\tests\\testthat\\data"

base_path <- "./data"

DEV <- FALSE


test_that("add_autos() function works as expected", {
  
  libname(dat, file.path(base_path, "SDTM"), "sas7bdat")

  tm <- Sys.time()
  
  dat3 <- add_autos(dat$lb, c("USUBJID", "LBCAT", "LBTESTCD"))
  
  
  tmdiff <- Sys.time() - tm
  tmdiff
  
  expect_equal("first." %in% names(dat3), TRUE)
  expect_equal("last." %in% names(dat3), TRUE)

})

test_that("sort check works as expected", {

  dat3 <- add_autos(mtcars, c("am"), sort_check = FALSE)
  

  expect_equal("first." %in% names(dat3), TRUE)
  expect_equal("last." %in% names(dat3), TRUE)
  expect_error( add_autos(mtcars, c("am"), sort_check = TRUE))
})


# Baseline of 10 sec on condition
test_that("datastep() performance is good", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")

    
    tm <- Sys.time()
    
    res <- datastep(dat$lb, 
                    {
                      if (is.na(LBBLFL))
                        blisna <- TRUE
                      else
                        blisna <- FALSE
                      
                      
                    })
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 10, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})

# Jumps to 14 seconds when retain added
test_that("datastep() performance with retain is good", {
  
  
  libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
  
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    
    tm <- Sys.time()
    
    res <- datastep(dat$lb, retain = list(rnum = 0),
                    {
                      if (is.na(LBBLFL))
                        subjstart <- TRUE
                      else
                        subjstart <- FALSE
                      
                      rnum <- 2 + 1
                      
                    })
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 14, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})

# Still less than < 11 seconds when group by added
test_that("datastep() performance with by group is good", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    
    tm <- Sys.time()
    
    res <- datastep(dat$lb, 
                    by = c("USUBJID"),
                    {
                      if (first.)
                        subjstart <- TRUE
                      else
                        subjstart <- FALSE
                      
                    #  rnum <- rnum + 1
                      
                    })
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 11, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})

# Jumps to 14 seconds when retain and group by added
test_that("datastep() performance with retain is good", {
  
  if (DEV) {
    
    libname(dat, file.path(base_path, "SDTM"), "sas7bdat")
    
    tm <- Sys.time()
  
    
    res <- datastep(dat$lb, retain = list(rnum = 0),
                    by = c("USUBJID", "LBCAT", "LBTESTCD"),
                    {
                      if (first.)
                        subjstart <- TRUE
                      else
                        subjstart <- FALSE
                      
                      rnum <- rnum + 1
                      
                      rnum2 <- n.
                      
                    }, sort_check = TRUE)
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    expect_equal(tmdiff < 16, TRUE)
    
  } else
    expect_equal(TRUE, TRUE)
  
})


test_that("datastep() with group_by performance is good", {
  
  if (DEV) {
    
    library(dplyr)
    
    scs <- specs(PE = import_spec(PESTAT = "character"))
    
    libname(dat, file.path(base_path, "SDTM"), "csv", import_specs = scs)
    
    tm <- Sys.time()
    
    prep <- dat$DM %>% 
      left_join(dat$VS, by = c("USUBJID" = "USUBJID")) %>% 
      select(USUBJID, VSTESTCD, VISIT, VISITNUM, VSSTRESN, ARM, VSBLFL) %>% 
      filter(VSTESTCD %in% c("PULSE", "RESP", "TEMP", "DIABP", "SYSBP"), 
             !(VISIT == "SCREENING" & VSBLFL != "Y")) %>% 
      arrange(USUBJID, VSTESTCD, VISITNUM) %>% 
      group_by(USUBJID, VSTESTCD) %>%
      #datastep(by = c("USUBJID", "VSTESTCD"), retain = list(BSTRESN = 0), {
      datastep(retain = list(BSTRESN = 0), {
        
        # Combine treatment groups
        # And distingish baseline time points
        if (ARM == "ARM A") {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "A_BASE"
          } else {
            GRP <- "A_TRT"
          }
          
        } else {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "O_BASE"
          } else {
            GRP <- "O_TRT"
          }
          
        }
        
        # Populate baseline value
        if (first.)
          BSTRESN = VSSTRESN
        
      })
    
    tmdiff <- Sys.time() - tm 
    tmdiff
    
    expect_equal(tmdiff < 3, TRUE)
    
  } else 
    expect_equal(TRUE, TRUE)
  
})


test_that("100,000 row datastep on data.frame is good.", {
  
  if (DEV) {
    
    l <- 100000
    
    df <- data.frame(C1 = seq_len(l), C2 = runif(l), 
                     C3 = runif(l), C4 = runif(l), stringsAsFactors = FALSE)
    
    tm <- Sys.time()
    
    res <- datastep(df, attrib = list(C5 = 0, C6 = 0),
                    {
                      C5 <- C2 + C3 + C4
                      C6 <- max(C2, C3, C4)
                      
                    })
    
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    res[1:10, ]
    
    expect_equal(tmdiff < 30, TRUE)
    
    # 100,000 rows is 20 seconds
    # 1,000,000 rows is 3.6 minutes
    
  } else
    expect_equal(TRUE, TRUE)
  
})


test_that("100,000 row datastep on tibble is good.", {
  
  if (DEV) {
    
    library(tibble)
    
    l <- 100000
    
    df <- tibble(C1 = seq_len(l), C2 = runif(l), 
                     C3 = runif(l), C4 = runif(l))
    
    tm <- Sys.time()
    
    res <- datastep(df, attrib = list(C5 = 0, C6 = 0),
                    {
                      C5 <- C2 + C3 + C4
                      C6 <- max(C2, C3, C4)
                      
                    })
    
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    res[1:10, ]
    
    expect_equal(tmdiff < 30, TRUE)
    
    # 100,000 rows is 21.2 seconds
    # 1,000,000 rows is 3.8 minutes
    
  } else
    expect_equal(TRUE, TRUE)
  
})



test_that("100,000 row datastep on data.table is good.", {
  
  if (DEV) {
    
    library(data.table)
    
    l <- 100000
    
    df <- data.table(C1 = seq_len(l), C2 = runif(l), 
                 C3 = runif(l), C4 = runif(l))
    
    tm <- Sys.time()
    
    res <- datastep(df, attrib = list(C5 = 0, C6 = 0),
                    {
                      C5 <- C2 + C3 + C4
                      C6 <- max(C2, C3, C4)
                      
                    })
    
    
    tmdiff <- Sys.time() - tm
    tmdiff
    
    res[1:10, ]
    
    expect_equal(tmdiff < 30, TRUE)
    
    # 100,000 rows is 1.3 minutes with out modification to datastep
    # 100,000 rows is 22.3 seconds after modification 
    # 1,000,000 rows not going to try
    
  } else
    expect_equal(TRUE, TRUE)
  
})


test_that("complex datastep() performance is good", {
  
  if (DEV) {
    
    library(dplyr)
    
    scs <- specs(PE = import_spec(PESTAT = "character"))
    
    libname(dat, file.path(base_path, "SDTM"), "csv", import_specs = scs, 
            quiet = TRUE)
    
    tm <- Sys.time()
    
    prep <- dat$DM %>% 
      left_join(dat$VS, by = c("USUBJID" = "USUBJID")) %>% 
      select(USUBJID, VSTESTCD, VISIT, VISITNUM, VSSTRESN, ARM, VSBLFL) %>% 
      filter(VSTESTCD %in% c("PULSE", "RESP", "TEMP", "DIABP", "SYSBP"), 
             !(VISIT == "SCREENING" & VSBLFL != "Y")) %>% 
      arrange(USUBJID, VSTESTCD, VISITNUM) %>% 
      group_by(USUBJID, VSTESTCD) %>%
      datastep(by = c("USUBJID", "VSTESTCD"), retain = list(BSTRESN = 0), {
        
        # Combine treatment groups
        # And distingish baseline time points
        if (ARM == "ARM A") {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "A_BASE"
          } else {
            GRP <- "A_TRT"
          }
          
        } else {
          
          if (VSBLFL %eq% "Y") {
            GRP <- "O_BASE"
          } else {
            GRP <- "O_TRT"
          }
          
        }
        
        # Populate baseline value
        if (first.)
          BSTRESN = VSSTRESN
        
      })
    
    tmdiff <- Sys.time() - tm 
    tmdiff
    
    expect_equal(tmdiff < 3, TRUE)
    
  } else 
    expect_equal(TRUE, TRUE)
  
})

Try the libr package in your browser

Any scripts or data that you put into this service are public.

libr documentation built on Nov. 18, 2023, 1:08 a.m.