testEndToEndTitanic.R
In dataCompareR: Compare Two Data Frames and Summarise the Difference

# SPDX-Copyright: Copyright (c) Capital One Services, LLC 
# SPDX-License-Identifier: Apache-2.0 
# Copyright 2017 Capital One Services, LLC 
#
# Licensed under the Apache License, Version 2.0 (the "License"); 
# you may not use this file except in compliance with the License. 
#
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
#
# Unless required by applicable law or agreed to in writing, software distributed 
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied.

#' @suggests tibble data.frame

#
# SYSTEM TEST: dataCompareR : Single Match Keys
#
# End-to-end system tests of the dataCompareR function that look at 
# different scenarios involving single-key data frame comparisons. 
#

context("Single Match Key Comparisons")

test_that("ComparisonOfEquals", {
  
  if(require(tibble) & require(data.table) & require(titanic)) {
    # Create a series of data we can use for testing with a single index
    source('createTitanicDatasets.R')
    
    # Part 1 - comparing a dataframe to itself should result in a very simple output
    #           and it shouldn't matter what the formats are 
    
    # compare to itself , with or without keys should be the same
    a1 <- rCompare(titanic2, titanic2, trimChars = TRUE) 
    a2 <- rCompare(titanic2, titanic2, keys = 'PassengerId', trimChars = TRUE)
    # or with a different order, and keys
    a3 <- rCompare(titanic2, titanic2shuffle, keys = 'PassengerId', trimChars = TRUE)
    # And the source data format should not matter
    a4 <- rCompare(titanic2, titanic2DataTable, keys = 'PassengerId', trimChars = TRUE)
    a5 <- rCompare(titanic2, titanic2Matrix, keys = 'PassengerId', trimChars = TRUE)
    a6 <- rCompare(titanic2, titanic2Tibble, keys = 'PassengerId', trimChars = TRUE)
  
    #  a1 is direct A:A comparison, so check that the results are expected
    
    expect_that(length(a1$colMatching$inboth), equals(ncol(titanic2)) )
    expect_that(length(a1$colMatching$inA), equals(0) )
    expect_that(length(a1$colMatching$inB), equals(0) )
    
    expect_that(length(a1$rowMatching$inboth), equals(nrow(titanic2)) )
    expect_that(a1$rowMatching$inA[[1]], equals(integer(0)) )
    expect_that(a1$rowMatching$inB[[1]], equals(integer(0)) )
    
    
    expect_that(length(a1$mismatches), equals(0) )
    expect_that(length(a1$matches), equals(ncol(titanic2) ))
    
    expect_that(all.equal(a1$meta$A,a1$meta$B), equals(TRUE))
    
    # a2 should match on matching details, but the object will differ a little due
    # to a2 using a match key
    
    expect_that(all.equal(a1$colMatching,a2$colMatching),equals(TRUE))
    # Cannot compare row matching, details are different
    expect_that(all.equal(a1$cleaninginfo,a2$cleaninginfo), equals(TRUE))
    expect_that(all.equal(a1$mismatches,a2$mismatches), equals(TRUE))
    # Matches will exclude the key column, so should be out by 1
    expect_that(length(a1$matches) - length(a2$matches), equals(1))
    
    # a3 should match a2 on important details
    expect_that(a2$meta$A, equals(a3$meta$A))
    #expect_that(a2$meta$B, equals(a3$meta$B)) NO! Names differ
    expect_that(a2$colMatching, equals(a3$colMatching))
    expect_that(a2$rowMatching, equals(a3$rowMatching))
    expect_that(a2$cleaninginfo, equals(a3$cleaninginfo))
    expect_that(a2$matches, equals(a3$matches))
    expect_that(a2$mismatches, equals(a3$mismatches))
   
    # a4 should all match a2 too
    expect_that(a2$meta$A, equals(a4$meta$A))
    expect_that(a2$colMatching, equals(a4$colMatching))
    expect_that(a2$rowMatching, equals(a4$rowMatching))
    expect_that(a2$cleaninginfo, equals(a4$cleaninginfo))
    expect_that(a2$matches, equals(a4$matches))
    expect_that(a2$mismatches, equals(a4$mismatches))
    
    # a5 should match a2 for row/col matching, but values are different due to 
    # factor > char conversion
    expect_that(a2$meta$A, equals(a5$meta$A))
    expect_that(a2$colMatching, equals(a5$colMatching))
    expect_that(a2$rowMatching, equals(a5$rowMatching))
    
    # a6 should match a2 too
    expect_that(a2$meta$A, equals(a6$meta$A))
    expect_that(a2$colMatching, equals(a6$colMatching))
    expect_that(a2$rowMatching, equals(a6$rowMatching))
    expect_that(a2$cleaninginfo, equals(a6$cleaninginfo))
    expect_that(a2$matches, equals(a6$matches))
    expect_that(a2$mismatches, equals(a6$mismatches))
  }
  
})


test_that("ComparisonOfUnequals", {

  if(require(tibble) & require(data.table) & require(titanic)) {
    # Create a series of data we can use for testing with a single index
    source('createTitanicDatasets.R')
    
    b1 <- rCompare(titanic,titanic2, trimChars = FALSE, keys = 'PassengerId')
    b1a <- rCompare(titanic,titanic2,trimChars = TRUE, keys = 'PassengerId')
    b1b <- rCompare(titanic,titanic2,trimChars = TRUE)
    
    b2 <- rCompare(titanic,titanic2, keys = 'PassengerId',trimChars = FALSE)
    b3 <- rCompare(titanic, titanic2shuffle, keys = 'PassengerId',trimChars = FALSE)
    b4 <- rCompare(titanic,titanic2DataTable, keys = 'PassengerId',trimChars = FALSE)
    b5 <- rCompare(titanic,titanic2Matrix, keys = 'PassengerId',trimChars = FALSE)
    b6 <- rCompare(titanic,titanic2Tibble, keys = 'PassengerId',trimChars = FALSE)
  
    # Part 1 - determine that b1 looks correct
    # Based on
    # 
    #* `Embarked` has different order of levels, this results in different rows.
    #* `Cabin` has additional levels, this however does not result in different rows (they are the same via `as.character`)
    #* `HasSurvived` has additional level: NaN. Line 5 has HasSurvived as NaN, but in the basel dataset it is NA.
    #
    #The rest of the differences are demonstrated on the `Name` column:
    #  
    # * Row 1: very long string, difference at the end 
    #* Row 2: additional spaces at the beginning
    #* Row 3: additional spaces at the end
    #* Row 4: additional spaces at the beginning and at the end
    #* Row 5: Na vs NaN
    #* Row 6: Inserted new line instead of spaces
    #* Row 7: name in lowercase
    #* Row 8: name in uppercase
    #* Row 9: special characters included
    
    # Check b1 matches all columns
    expect_that(length(b1$colMatching$inboth), equals(ncol(titanic2)) )
    expect_that(length(b1$colMatching$inA), equals(0) )
    expect_that(length(b1$colMatching$inB), equals(0) )
    
    # Check b1 matches all columns 
    expect_that(length(b1$rowMatching$inboth$PASSENGERID), equals(nrow(titanic2)) )
    expect_that(b1$rowMatching$inA[[1]], equals(logical(0)) )
    expect_that(b1$rowMatching$inB[[1]], equals(logical(0)) )
    
    # Check that b1 has mismatches - not trimming chars so expect 6 issues
    expect_true(length(b1$mismatches) >0)
    # 3 fields mismatch
    expect_true(length(b1$mismatches) == 3)
    expect_that(names(b1$mismatches)[1], equals("EMBARKED"))
    expect_that(names(b1$mismatches)[2], equals("HASSURVIVED"))
    expect_that(names(b1$mismatches)[3], equals("NAME"))
    
    # EMBARKED should mismatch
    # Should have 6 columns
    expect_true(length(b1$mismatches$EMBARKED) == 7)
    # I am not working this out - assume that 170 is correct and make sure it holds later
    expect_true(length(b1$mismatches$EMBARKED[[1]]) == 170)
    
    # CABIN should not have mismatches
    expect_that(which(names(b1$mismatches) == 'CABIN'),equals(integer(0)) )
    
    # HasSurvived should have 1 mismatch
    # but first, it should have 6 columns
    expect_true(length(b1$mismatches$HASSURVIVED) == 7)
    # and 1 row
    expect_true(length(b1$mismatches$HASSURVIVED[[1]]) == 1)
    # And they're NAN and NA
    expect_true(is.na(b1$mismatches$HASSURVIVED$valueA[1]))
    expect_true(as.character(b1$mismatches$HASSURVIVED$valueB[1]) == "NaN")
    
    # NAME should have 9 rows of mismatches
    # And of course, 6 cols
    expect_true(length(b1$mismatches$NAME) == 7)
    expect_true(length(b1$mismatches$NAME[[1]]) == 9)
    
    # b1a should be the same, apart from the mismatches, which should only reveal 6 in total
    expect_true(all.equal(b1$matches,b1a$matches))
    expect_true(all.equal(b1$rowMatching,b1a$rowMatching))
    expect_true(all.equal(b1$colMatching,b1a$colMatching))
    expect_true(all.equal(b1$cleaninginfo,b1a$cleaninginfo))
    
    expect_true(all.equal(b1$mismatches$EMBARKED,b1a$mismatches$EMBARKED))
    expect_true(all.equal(b1$mismatches$HASSURVIVED,b1a$mismatches$HASSURVIVED))
    expect_true(all.equal(b1$mismatches$CABIN,b1a$mismatches$CABIN))
    expect_false(length(b1$mismatches$NAME[[1]]) == length(b1a$mismatches$NAME[[1]]))
    expect_true(length(b1a$mismatches$NAME[[1]]) == 6)
    
    
    # b1b should match b1a in terms of content, but differs in format, so check it finds same number of name changes
    expect_true(all.equal(b1b$colMatching,b1a$colMatching))
    expect_true(all.equal(b1b$cleaninginfo,b1a$cleaninginfo))
    expect_true(length(b1b$mismatches$NAME[[1]]) == 6)
    
    
    # 1 diff, due to lack of a match key
    expect_true(length(b1b$matches)- length(b1a$matches) == 1)
    expect_true(length(b1b$rowMatching$inboth) == length(b1a$rowMatching$inboth$PASSENGERID))
    expect_true(length(b1b$mismatches) == length(b1a$mismatches))
    
      # Part 2 - determine that b2-b6 match b1 as far as expected
  
    # b2 should match b1
    expect_that(all.equal(b1$colMatching,b2$colMatching),equals(T))
    expect_that(all.equal(b1$rowMatching,b2$rowMatching),equals(T))
    expect_that(all.equal(b1$mismatches,b2$mismatches),equals(T))
    expect_that(all.equal(b1$matches,b2$matches),equals(T))
    expect_that(all.equal(b1$cleaninginfo,b2$cleaninginfo),equals(T))
    
    # b2 should match b1
    expect_that(all.equal(b1$colMatching,b3$colMatching),equals(T))
    expect_that(all.equal(b1$rowMatching,b3$rowMatching),equals(T))
    expect_that(all.equal(b1$mismatches,b3$mismatches),equals(T))
    expect_that(all.equal(b1$matches,b3$matches),equals(T))
    expect_that(all.equal(b1$cleaninginfo,b3$cleaninginfo),equals(T))
    
    
    # b4 should all match b1 too
    expect_that(all.equal(b1$colMatching,b4$colMatching),equals(T))
    expect_that(all.equal(b1$rowMatching,b4$rowMatching),equals(T))
    expect_that(all.equal(b1$mismatches,b4$mismatches),equals(T))
    expect_that(all.equal(b1$matches,b4$matches),equals(T))
    expect_that(all.equal(b1$cleaninginfo,b4$cleaninginfo),equals(T))
    
    # b5 should match b1 for row/col matching, but values are different due to 
    # factor > char conversion
    expect_that(all.equal(b1$colMatching,b5$colMatching),equals(T))
    # The rest is hard to work on, don't think its worth coding for
    
    # b6 should match b1 too
    expect_that(all.equal(b1$colMatching,b6$colMatching),equals(T))
    expect_that(all.equal(b1$rowMatching,b6$rowMatching),equals(T))
    expect_that(all.equal(b1$mismatches,b6$mismatches),equals(T))
    expect_that(all.equal(b1$matches,b6$matches),equals(T))
    expect_that(all.equal(b1$cleaninginfo,b6$cleaninginfo),equals(T))
  }
  
})


test_that("ComparisonWithMissingRows", {
  if(require(tibble) & require(data.table) & require(titanic)) {
    # Create a series of data we can use for testing with a single index
    source('createTitanicDatasets.R')
    
    # Create a subset of the dataset to test for missing rows
    titanic3 <- titanic2[1:800,]
    
    c1 <- rCompare(titanic,titanic3, trimChars = TRUE, keys = 'PassengerId')
    c2 <- rCompare(titanic2,titanic3, trimChars = TRUE, keys = 'PassengerId')
    c3 <- rCompare(titanic3,titanic2, trimChars = TRUE, keys = 'PassengerId')
    
    
    # Both should match on all columns
    expect_that(length(c1$colMatching$inboth), equals(ncol(titanic2)) )
    expect_that(length(c1$colMatching$inA), equals(0) )
    expect_that(length(c1$colMatching$inB), equals(0) )
    
    expect_that(length(c2$colMatching$inboth), equals(ncol(titanic2)) )
    expect_that(length(c2$colMatching$inA), equals(0) )
    expect_that(length(c2$colMatching$inB), equals(0) )
    
    expect_that(length(c3$colMatching$inboth), equals(ncol(titanic2)) )
    expect_that(length(c3$colMatching$inA), equals(0) )
    expect_that(length(c3$colMatching$inB), equals(0) )
    
    
    # Row matching - expect to miss out on 91 rows
    expect_true(length(c1$rowMatching$inboth$PASSENGERID) == 800)
    expect_true(length(c1$rowMatching$inA$PASSENGERID) == 91)
    expect_true(length(c1$rowMatching$inB$PASSENGERID) == 0)
    
    expect_true(length(c2$rowMatching$inboth$PASSENGERID) == 800)
    expect_true(length(c2$rowMatching$inA$PASSENGERID) == 91)
    expect_true(length(c2$rowMatching$inB$PASSENGERID) == 0)
    
    expect_true(length(c3$rowMatching$inboth$PASSENGERID) == 800)
    expect_true(length(c3$rowMatching$inA$PASSENGERID) == 0)
    expect_true(length(c3$rowMatching$inB$PASSENGERID) == 91)
    
    # Can't do much about if the mismatches work, but, we can check equality where we can
    # Col matching should be the same
    expect_that(all.equal(c1$colMatching,c2$colMatching),equals(TRUE))
    expect_that(all.equal(c3$colMatching,c2$colMatching),equals(TRUE))
  
    # Same # mismatches in 2/3 (which is to say, 0)
    expect_that(length(c3$mismatches) == length(c2$mismatches),equals(TRUE))
    
    # Could do more here, but won't
  }
})


test_that("ComparisonWithMissingCols", {
  
  if(require(tibble) & require(data.table) & require(titanic)) {
    # Create a series of data we can use for testing with a single index
    source('createTitanicDatasets.R')
    
    # Create a subset of the dataset to test for missing rows
    titanic3 <- titanic2[,1:10]
    
    d1 <- rCompare(titanic3,titanic2, trimChars = TRUE, keys = 'PassengerId')
    d2 <- rCompare(titanic2,titanic3, trimChars = TRUE, keys = 'PassengerId')
    
    # Again, we'll only check a few of the easier aspects
    
    # Check we match 10 cols, mismatch 3 in each case
    expect_that(length(d1$colMatching$inboth), equals(10) )
    expect_that(length(d1$colMatching$inA), equals(0) )
    expect_that(length(d1$colMatching$inB), equals(3) )
    
    expect_that(length(d2$colMatching$inboth), equals(10) )
    expect_that(length(d2$colMatching$inA), equals(3) )
    expect_that(length(d2$colMatching$inB), equals(0) )
    
    expect_that(d1$rowMatching, equals(d2$rowMatching) )
    expect_that(length(d1$rowMatching$inboth$PASSENGERID), equals(891) )
    expect_that(length(d1$rowMatching$inA$PASSENGERID), equals(0) )
    expect_that(length(d1$rowMatching$inB$PASSENGERID), equals(0) )
    
    expect_that(d1$matches, equals(d2$matches) )
  }
  })
  
  test_that("TestRoundingEndToEnd", {
    
    if(require(titanic)) {
      # Create a series of data we can use for testing with a single index
      source('createTitanicDatasets.R')
      
      titanicShort <- select(titanic, PassengerId, Age, Fare)
      
      # Expect no error
      expect_error(roundedCompare <- rCompare(titanicShort, titanicShort, keys = 'PassengerId', roundDigits = 0), NA)
      
      titanicShort2 <- titanicShort
      titanicShort2$Age <- titanicShort2$Age + 0.00001
      
      # Run a test where we get rounding errors
      
      # Expect no error
      expect_error(roundedCompare2 <- rCompare(titanicShort, titanicShort2, keys = 'PassengerId', roundDigits = NA), NA)
      
      # Expect all rows to error
      expect_true(nrow(roundedCompare2$mismatches$AGE) == nrow(filter(titanicShort, !is.na(Age))))
      # Expect only noise to remain
      expect_true(all(abs(roundedCompare2$mismatches$AGE$diffAB+1e-05) < 1E-14))
      
      
      # And another where we don't
      
      # Expect no error
      expect_error(roundedCompare3 <- rCompare(titanicShort, titanicShort2, keys = 'PassengerId', roundDigits = 1), NA)
      
      # Expect no mismatches
      expect_true(length(roundedCompare3$mismatches) == 0)
      expect_true(length(roundedCompare3$matches)==2)
    }
  })
Any scripts or data that you put into this service are public.
dataCompareR documentation built on Nov. 23, 2021, 9:06 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
dataCompareR
Compare Two Data Frames and Summarise the Difference

tests/testthat/testEndToEndTitanic.R
In dataCompareR: Compare Two Data Frames and Summarise the Difference

Try the dataCompareR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataCompareR Compare Two Data Frames and Summarise the Difference

tests/testthat/testEndToEndTitanic.R In dataCompareR: Compare Two Data Frames and Summarise the Difference

Try the dataCompareR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataCompareR
Compare Two Data Frames and Summarise the Difference

tests/testthat/testEndToEndTitanic.R
In dataCompareR: Compare Two Data Frames and Summarise the Difference