R/pd_matchRows.R
In dataCompareR: Compare Two Data Frames and Summarise the Difference

Documented in checkUniqueness createAntiSubset matchMultiIndex matchNoIndex matchRows matchSingleIndex

# SPDX-Copyright: Copyright (c) Capital One Services, LLC 
# SPDX-License-Identifier: Apache-2.0 
# Copyright 2017 Capital One Services, LLC 
#
# Licensed under the Apache License, Version 2.0 (the "License"); 
# you may not use this file except in compliance with the License. 
#
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
#
# Unless required by applicable law or agreed to in writing, software distributed 
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied.


#' Generate two dataframes and returns subsets of these dataframes that have shared rows.
#' 
#' @param df_a A dataframe
#' @param df_b A dataframe
#' @param indices The indices to match rows between \code{df_a} and \code{df_b}. Can be NA,
#' single character, or a vector of characters
#' @import stringi
#' @return A list containing the two dataframes, subsetted by shared indices, and a list which itself 
#'        contains dataframes for the dropped rows
matchRows <- function(df_a, df_b, indices = NA)
{ 
  if (length(indices) == 1) {
    if (is.na(indices)) {
      return(matchNoIndex(df_a, df_b))
    } else {
      return(matchSingleIndex(df_a, df_b, indices))
    }
  }

  if(length(indices) > 1){
    return(matchMultiIndex(df_a, df_b, indices))
  }
}


#' Checks that a list of indexes areunique
#' 
#' @param df_indices A vector of values
#' @return Boolean - true if all values in vector are unique, false if not
#' @examples
#' \dontrun{checkUniqueness(c('car','van','van'))}
#' \dontrun{checkUniqueness(c('car','van','bus'))}
checkUniqueness <- function(df_indices)
{
  if(length(unique(df_indices)) == length(df_indices)){
    return(TRUE)
  }
  return(FALSE)
}

#' Create a dataframe of the rows that don't match 
#' 
#' @param index_antisubset Vector of mismatching indices
#' @param original_keys A character array
#' @param index_key A character array
#' @param df A data frame
#' @return A dataframe containing the dropped rows
createAntiSubset <- function(index_antisubset,original_keys,index_key, df)
{
  if(length(index_antisubset)==0){
    #df_antisubset <- data.frame(integer())
    df_antisubset <- data.frame(matrix(data = NA, ncol = length(original_keys), nrow = 0))
    colnames(df_antisubset) <- original_keys
  } else if(length(original_keys)==1){
    df_antisubset <- data.frame(df[df[,index_key] %in% index_antisubset,original_keys])
    colnames(df_antisubset) <- original_keys
  } else {
    df_antisubset <- df[df[,index_key] %in% index_antisubset,original_keys]
  }
  return(df_antisubset)
}



#' Generate two dataframes that contain the same rows based on a single index
#' 
#' @param df_a A dataframe
#' @param df_b A dataframe
#' @param index_key A character vector
#' @param original_keys A character vector
#' @return A list containing the two dataframes, subsetted by shared indices, and a list which itself 
#'        contains the vectors for the dropped rows
#'        
matchSingleIndex <- function(df_a, df_b, index_key, original_keys)
{
  indexIsUnique <- checkUniqueness(df_a[,index_key]) & checkUniqueness(df_b[,index_key])
  
  if(!indexIsUnique){
    stop("The indices are not unique in the submitted dataframes. Please resubmit with unique indices.")
  }
  
  if(missing(original_keys)){
    original_keys <- c(index_key)
  }
  
  indices_a <- c(df_a[,index_key])
  indices_b <- c(df_b[,index_key])
  
  index_subset <- indices_a[indices_a %in% indices_b]
  index_a_antisubset <- setdiff(indices_a,index_subset)
  index_b_antisubset <- setdiff(indices_b,index_subset)

  # This is better if we have a character key
  
  
  if(is.character(df_a[,index_key]) & is.character(df_b[,index_key])) {
    index_subset_df <- data.frame(index_subset, stringsAsFactors = FALSE)
    names(index_subset_df) <- index_key
  
    df_a2 <- suppressMessages(inner_join(df_a, index_subset_df))
    df_b2 <- suppressMessages(inner_join(df_b, index_subset_df))
    
    df_a_subset <- df_a2[stri_order(df_a2[,index_key]),]
    df_b_subset <- df_b2[stri_order(df_b2[,index_key]),]  
  } else {
    # And this is better for numbers (and works in general)
    
    df_a_subset <- df_a[df_a[,index_key] %in% index_subset,] %>% arrange_(index_key)
    df_b_subset <- df_b[df_b[,index_key] %in% index_subset,] %>% arrange_(index_key)
  }
  

  df_a_antisubset <- createAntiSubset(index_a_antisubset, original_keys, index_key,df_a)  
  df_b_antisubset <- createAntiSubset(index_b_antisubset, original_keys, index_key,df_b)

  return(list(df_a_subset,df_b_subset,list(df_a_antisubset,df_b_antisubset)))
}


#' Generate two dataframes that contain the same rows based on a two-column index
#' 
#' @param df_a A dataframe
#' @param df_b A dataframe
#' @param indices A char vector
#' @return A list containing the two dataframes, subsetted by shared indices, and a list which itself 
#'        contains the vectors for the dropped rows
matchMultiIndex <- function(df_a, df_b, indices)
{
  df_a$dataCompareR_merged_indices <- do.call(paste,c(df_a[indices],sep = "_"))
  df_b$dataCompareR_merged_indices <- do.call(paste,c(df_b[indices],sep = "_"))
  
  mtchedData <- matchSingleIndex(df_a, df_b, 'dataCompareR_merged_indices', indices)
  
  # Lose these columns
  mtchedData[[1]]$dataCompareR_merged_indices <- NULL
  mtchedData[[2]]$dataCompareR_merged_indices <- NULL
  
  return(mtchedData)
}

#' Generate two dataframes that contain the same rows based on a two-column index
#' 
#' @param df_a A dataframe
#' @param df_b A dataframe
#' @return A list containing the two dataframes, subsetted to the size of the smaller one, and
#'          a list containing vectors of the rows dropped.
matchNoIndex <- function(df_a, df_b)
{
  if(nrow(df_a)>nrow(df_b)) {
    
    if(nrow(df_b)==0) {
      df_a_subset <- df_a[0, , drop=FALSE]
    } else {
      df_a_subset <- df_a[1:nrow(df_b), , drop=FALSE]
    }
    
    df_b_subset <- df_b
    rows_dropped_from_a <- data.frame(indices_removed=(nrow(df_a_subset)+1):nrow(df_a))
    rows_dropped_from_b <- data.frame(indices_removed=integer())
  }
  else if (nrow(df_b)>nrow(df_a)) {
    df_a_subset <- df_a
    
    if(nrow(df_a)==0) {
      df_b_subset <-df_b[0, , drop=FALSE]
    } else {
      df_b_subset <-df_b[1:nrow(df_a), , drop=FALSE]
    }
    
    rows_dropped_from_a <- data.frame(indices_removed=integer())
    rows_dropped_from_b <- data.frame(indices_removed=(nrow(df_b_subset)+1):nrow(df_b))
  }
  else {
    df_a_subset <- df_a
    df_b_subset <- df_b
    rows_dropped_from_a <- data.frame(indices_removed=integer())
    rows_dropped_from_b <- data.frame(indices_removed=integer())
  }
  

  return(list(df_a_subset, df_b_subset, list(rows_dropped_from_a,rows_dropped_from_b)))
}

Any scripts or data that you put into this service are public.

dataCompareR documentation built on Nov. 23, 2021, 9:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dataCompareR
Compare Two Data Frames and Summarise the Difference

R/pd_matchRows.R
In dataCompareR: Compare Two Data Frames and Summarise the Difference

Defines functions matchNoIndex matchMultiIndex matchSingleIndex createAntiSubset checkUniqueness matchRows

Documented in checkUniqueness createAntiSubset matchMultiIndex matchNoIndex matchRows matchSingleIndex

Try the dataCompareR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataCompareR Compare Two Data Frames and Summarise the Difference

R/pd_matchRows.R In dataCompareR: Compare Two Data Frames and Summarise the Difference

Defines functions matchNoIndex matchMultiIndex matchSingleIndex createAntiSubset checkUniqueness matchRows

Documented in checkUniqueness createAntiSubset matchMultiIndex matchNoIndex matchRows matchSingleIndex

Try the dataCompareR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dataCompareR
Compare Two Data Frames and Summarise the Difference

R/pd_matchRows.R
In dataCompareR: Compare Two Data Frames and Summarise the Difference