Introduction to diffdf

Craig Gower & Kieran Martin

2019-03-12

The purpose of diffdf is to provide proc compare like functionality to R for use in second line programming. In particular we focus on raising warnings if any differences are found whilst providing in-depth diagnostics to highlight where these differences have occurred.

Basic usage

Here we show the basic functionality of diffdf using a dummy data set.

library(diffdf)

LENGTH = 30

suppressWarnings(RNGversion("3.5.0"))
set.seed(12334)

test_data <- tibble::tibble( 
    ID          = 1:LENGTH,
    GROUP1      = rep( c(1,2) , each = LENGTH/2),
    GROUP2      = rep( c(1:(LENGTH/2)), 2 ),
    INTEGER     = rpois(LENGTH , 40),
    BINARY      = sample( c("M" , "F") , LENGTH , replace = T),
    DATE        = lubridate::ymd("2000-01-01") + rnorm(LENGTH, 0, 7000),
    DATETIME    = lubridate::ymd_hms("2000-01-01 00:00:00") + rnorm(LENGTH, 0, 200000000), 
    CONTINUOUS  = rnorm(LENGTH , 30 , 12),
    CATEGORICAL = factor(sample( c("A" , "B" , "C") , LENGTH , replace = T)),
    LOGICAL     = sample( c(TRUE , FALSE) , LENGTH , replace = T),
    CHARACTER   = stringi::stri_rand_strings(LENGTH,  rpois(LENGTH , 13),  pattern = "[ A-Za-z0-9]")
)

test_data
#> # A tibble: 30 x 11
#>       ID GROUP1 GROUP2 INTEGER BINARY DATE       DATETIME           
#>    <int>  <dbl>  <int>   <int> <chr>  <date>     <dttm>             
#>  1     1      1      1      41 M      2003-06-22 2000-11-28 20:40:53
#>  2     2      1      2      41 F      2016-12-03 1994-08-30 19:05:02
#>  3     3      1      3      41 M      2016-05-08 1992-09-11 11:30:18
#>  4     4      1      4      32 M      2015-06-02 2007-11-12 11:28:29
#>  5     5      1      5      55 F      1986-04-15 1998-08-04 01:27:49
#>  6     6      1      6      33 M      1994-05-25 2001-12-05 08:24:35
#>  7     7      1      7      40 F      2009-02-08 1986-11-02 18:13:03
#>  8     8      1      8      44 F      2020-07-21 1998-08-22 05:23:24
#>  9     9      1      9      51 F      1967-05-25 2003-01-03 22:09:29
#> 10    10      1     10      40 M      2044-03-11 1996-04-19 11:10:12
#> # … with 20 more rows, and 4 more variables: CONTINUOUS <dbl>,
#> #   CATEGORICAL <fct>, LOGICAL <lgl>, CHARACTER <chr>

diffdf( test_data , test_data)
#> No issues were found!

As you would expect no differences are found. We now look to introduce various types differences into the data in order to show how diffdf highlights them. Note that for the purposes of this vignette we have used the suppress_warnings argument to stop errors being raised; it is recommended however that this option is not used in production code as it may mask problems.

Missing Columns

test_data2 <- test_data 
test_data2 <- test_data2[,-6]
diffdf(test_data , test_data2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are columns in BASE that are not in COMPARE !!
#> All rows are shown in table below
#> 
#>   =========
#>    COLUMNS 
#>   ---------
#>     DATE   
#>   ---------

Missing Rows

test_data2 <- test_data
test_data2 <- test_data2[1:(nrow(test_data2) - 2),]
diffdf(test_data, test_data2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are rows in BASE that are not in COMPARE !!
#> All rows are shown in table below
#> 
#>   ===============
#>    ..ROWNUMBER.. 
#>   ---------------
#>         29       
#>         30       
#>   ---------------

Different Values

test_data2 <- test_data
test_data2[5,2] <- 6
diffdf(test_data , test_data2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> Not all Values Compared Equal
#> All rows are shown in table below
#> 
#>   =============================
#>    Variable  No of Differences 
#>   -----------------------------
#>     GROUP1           1         
#>   -----------------------------
#> 
#> 
#> All rows are shown in table below
#> 
#>   ========================================
#>    VARIABLE  ..ROWNUMBER..  BASE  COMPARE 
#>   ----------------------------------------
#>     GROUP1         5         1       6    
#>   ----------------------------------------

Different Types

test_data2 <- test_data
test_data2[5,2] <- "blah"
diffdf(test_data , test_data2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are columns in BASE and COMPARE with different modes !!
#> All rows are shown in table below
#> 
#>   ================================
#>    VARIABLE  MODE.BASE  MODE.COMP 
#>   --------------------------------
#>     GROUP1    numeric   character 
#>   --------------------------------
#> 
#> There are columns in BASE and COMPARE with different classes !!
#> All rows are shown in table below
#> 
#>   ==================================
#>    VARIABLE  CLASS.BASE  CLASS.COMP 
#>   ----------------------------------
#>     GROUP1    numeric    character  
#>   ----------------------------------

Different Labels

test_data2 <- test_data
attr(test_data$ID , "label") <- "This is a interesting label"
attr(test_data2$ID , "label") <- "what do I type here?"
diffdf(test_data , test_data2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are columns in BASE and COMPARE with differing attributes !!
#> All rows are shown in table below
#> 
#>   ========================================================================
#>    VARIABLE  ATTR_NAME          VALUES.BASE              VALUES.COMP      
#>   ------------------------------------------------------------------------
#>       ID       label    This is a interesting label  what do I type here? 
#>   ------------------------------------------------------------------------

Different Factor Levels

test_data2 <- test_data
levels(test_data2$CATEGORICAL) <- c(1,2,3)
diffdf(test_data , test_data2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are columns in BASE and COMPARE with differing attributes !!
#> All rows are shown in table below
#> 
#>   ============================================================
#>     VARIABLE    ATTR_NAME    VALUES.BASE       VALUES.COMP    
#>   ------------------------------------------------------------
#>    CATEGORICAL   levels    c("A", "B", "C")  c("1", "2", "3") 
#>   ------------------------------------------------------------
#> 
#> Not all Values Compared Equal
#> All rows are shown in table below
#> 
#>   ================================
#>     Variable    No of Differences 
#>   --------------------------------
#>    CATEGORICAL         30         
#>   --------------------------------
#> 
#> 
#> First 10 of 30 rows are shown in table below
#> 
#>   ===========================================
#>     VARIABLE    ..ROWNUMBER..  BASE  COMPARE 
#>   -------------------------------------------
#>    CATEGORICAL        1         C       3    
#>    CATEGORICAL        2         C       3    
#>    CATEGORICAL        3         A       1    
#>    CATEGORICAL        4         C       3    
#>    CATEGORICAL        5         A       1    
#>    CATEGORICAL        6         A       1    
#>    CATEGORICAL        7         A       1    
#>    CATEGORICAL        8         A       1    
#>    CATEGORICAL        9         C       3    
#>    CATEGORICAL       10         B       2    
#>   -------------------------------------------

Grouping Variables

A key feature of diffdf that enables easier diagnostics is the ability to specify which variables form a unique row i.e. which rows should be compared against each other based upon a key. By default if no key is specified diffdf will use the row numbers as the key however in general this isn’t recommended as it means two identical datasets simply sorted differently can lead to incomprehensible error messages as every observation is flagged as different. In diffdf keys can be specified as character vectors using the keys argument.

test_data2 <- test_data
test_data2$INTEGER[c(5,2,15)] <- 99L 
diffdf( test_data , test_data2 , keys = c("GROUP1" , "GROUP2") , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> Not all Values Compared Equal
#> All rows are shown in table below
#> 
#>   =============================
#>    Variable  No of Differences 
#>   -----------------------------
#>    INTEGER           3         
#>   -----------------------------
#> 
#> 
#> All rows are shown in table below
#> 
#>   =========================================
#>    VARIABLE  GROUP1  GROUP2  BASE  COMPARE 
#>   -----------------------------------------
#>    INTEGER     1        2     41     99    
#>    INTEGER     1        5     55     99    
#>    INTEGER     1       15     44     99    
#>   -----------------------------------------

Printing and Saving the Compare Log

When there are lots of mismatches the log can become unwieldly. In order to deal with this you can either use the print function to display a particular variable or the outfile option to save the log to a file.

result <- diffdf( 
    test_data , 
    test_data2 , 
    keys = c("GROUP1" , "GROUP2") , 
    outfile =  "reports/diffdf.log"
)

Misc

Accessing problem rows

As an additional utility diffdf comes with the function diffdf_issuerows() which can be used to subset your dataset against the issue object to return just the rows that are flagged as containing issues.

iris2 <- iris
for (i in 1:3) iris2[i,i] <- 99
diff <- diffdf( iris , iris2, suppress_warnings = TRUE)
diffdf_issuerows( iris , diff)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
diffdf_issuerows( iris2 , diff)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1         99.0         3.5          1.4         0.2  setosa
#> 2          4.9        99.0          1.4         0.2  setosa
#> 3          4.7         3.2         99.0         0.2  setosa

Bear in mind that the vars option can be used to just subset down to issues associated with particular variables.

diffdf_issuerows( iris2 , diff , vars = "Sepal.Length")
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1           99         3.5          1.4         0.2  setosa
diffdf_issuerows( iris2 , diff , vars = c("Sepal.Length" , "Sepal.Width"))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1         99.0         3.5          1.4         0.2  setosa
#> 2          4.9        99.0          1.4         0.2  setosa

Are there issues ?

Sometimes it can be useful to use the comparison result to fuel further checks or programming logic. To assist with this diffdf offers two pieces of functionality namely the suppress_warnings argument (which has already been shown) and the diffdf_has_issues() helper function which simply returns TRUE if differences have been found else FALSE.

iris2 <- iris
for (i in 1:3) iris2[i,i] <- 99
diff <- diffdf( iris , iris2, suppress_warnings = TRUE)

diffdf_has_issues(diff)
#> [1] TRUE
if ( diffdf_has_issues(diff)){
    #<Further programming steps / logic> 
}

Tolerance

You can use the tolerance argument of diffdf to define how sensitive the comparison should be to decimal place inaccuracies. This important as very often floating point numbers will not compare equal due to machine rounding as they cannot be perfectly represented in binary. By default tolerance is set to sqrt(.Machine$double.eps)

dsin1 <- data.frame(x = 1.1e-06)
dsin2 <- data.frame(x = 1.1e-07)

diffdf(dsin1  , dsin2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> Not all Values Compared Equal
#> All rows are shown in table below
#> 
#>   =============================
#>    Variable  No of Differences 
#>   -----------------------------
#>       x              1         
#>   -----------------------------
#> 
#> 
#> All rows are shown in table below
#> 
#>   ===========================================
#>    VARIABLE  ..ROWNUMBER..   BASE    COMPARE 
#>   -------------------------------------------
#>       x            1        1.1e-06  1.1e-07 
#>   -------------------------------------------

diffdf(dsin1  , dsin2 , tolerance = 0.001 , suppress_warnings = T)
#> No issues were found!

Strictness

By default, the function will note a difference between integer and double columns, and factor and character columns. It can be useful in some contexts to prevent this from occuring. We can do so with the strict_numeric = FALSE and strict_factor = FALSE arguments.

dsin1 <- data.frame(x = as.integer(c(1,2,3)))
dsin2 <- data.frame(x = as.numeric(c(1,2,3)))

diffdf(dsin1  , dsin2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are columns in BASE and COMPARE with different classes !!
#> All rows are shown in table below
#> 
#>   ==================================
#>    VARIABLE  CLASS.BASE  CLASS.COMP 
#>   ----------------------------------
#>       x       integer     numeric   
#>   ----------------------------------
diffdf(dsin1  , dsin2 , suppress_warnings = T, strict_numeric = FALSE)
#> NOTE: Variable x in base was casted to numeric
#> No issues were found!

dsin1 <- data.frame(x = as.character(c(1,2,3)), stringsAsFactors = FALSE)
dsin2 <- data.frame(x = as.factor(c(1,2,3)))

diffdf(dsin1  , dsin2 , suppress_warnings = T)
#> Differences found between the objects!
#> 
#> A summary is given below.
#> 
#> There are columns in BASE and COMPARE with different modes !!
#> All rows are shown in table below
#> 
#>   ================================
#>    VARIABLE  MODE.BASE  MODE.COMP 
#>   --------------------------------
#>       x      character   numeric  
#>   --------------------------------
#> 
#> There are columns in BASE and COMPARE with different classes !!
#> All rows are shown in table below
#> 
#>   ==================================
#>    VARIABLE  CLASS.BASE  CLASS.COMP 
#>   ----------------------------------
#>       x      character     factor   
#>   ----------------------------------
diffdf(dsin1  , dsin2 , suppress_warnings = T, strict_factor = FALSE)
#> NOTE: Variable x in compare was casted to character
#> No issues were found!