R/merge.R

Defines functions getSubmissionDirectories removeDuplicates mergeData mergeAll mergePlots mergeTrials mergeLocations mergeAccessions

Documented in mergeAccessions mergeAll mergeLocations mergePlots mergeTrials

#' Merge Accessions
#' 
#' Create a merged accession template from multiple submissions
#' 
#' @param dir The path to the parent directory that contains the submission directories (default=current directory)
#' @param useMostRecent When set to TRUE, only use the most recent submission directory for each trial
#' @param keepUniqueDuplicates When set to TRUE, the script will keep all unique versions of duplicate rows (and won't prompt the user to pick)
#' 
#' @import readxl WriteXLS tibble dplyr readr digest
#' @export
mergeAccessions <- function(dir=".", useMostRecent=TRUE, keepUniqueDuplicates=FALSE) {
    mergeData(
        dir, 
        "accessions.xls", 
        "accession_name", 
        useMostRecent,
        keepUniqueDuplicates
    )
}


#' Merge Locations
#' 
#' Create a merged location template from multiple submissions
#' 
#' @param dir The path to the parent directory that contains the submission directories (default=current directory)
#' @param useMostRecent When set to TRUE, only use the most recent submission directory for each trial
#' @param keepUniqueDuplicates When set to TRUE, the script will keep all unique versions of duplicate rows (and won't prompt the user to pick)
#' 
#' @import readxl WriteXLS tibble dplyr readr digest
#' @export
mergeLocations <- function(dir=".", useMostRecent=TRUE, keepUniqueDuplicates=FALSE) {
    mergeData(
        dir, 
        "locations.xls", 
        "Name", 
        useMostRecent,
        keepUniqueDuplicates
    )
}


#' Merge Trials
#' 
#' Create a merged trial layout template from multiple submissions
#' 
#' @param dir The path to the parent directory that contains the submission directories (default=current directory)
#' @param useMostRecent When set to TRUE, only use the most recent submission directory for each trial
#' @param keepUniqueDuplicates When set to TRUE, the script will keep all unique versions of duplicate rows (and won't prompt the user to pick)
#' 
#' @import readxl WriteXLS tibble dplyr readr digest
#' @export
mergeTrials <- function(dir=".", useMostRecent=TRUE, keepUniqueDuplicates=FALSE) {
    mergeData(
        dir, 
        "trial_layout.xls", 
        "plot_name", 
        useMostRecent,
        keepUniqueDuplicates
    )
}


#' Merge Plots
#' 
#' Create a merged plot data template from multiple submissions
#' 
#' @param dir The path to the parent directory that contains the submission directories (default=current directory)
#' @param useMostRecent When set to TRUE, only use the most recent submission directory for each trial
#' @param keepUniqueDuplicates When set to TRUE, the script will keep all unique versions of duplicate rows (and won't prompt the user to pick)
#' 
#' @import readxl WriteXLS tibble dplyr readr digest
#' @export
mergePlots <- function(dir=".", useMostRecent=TRUE, keepUniqueDuplicates=FALSE) {
    mergeData(
        dir, 
        "trial_observations.xls", 
        "observationunit_name", 
        useMostRecent,
        keepUniqueDuplicates
    )
}


#' Merge All
#' 
#' Merge all of the supported data types
#' 
#' @param dir The path to the parent directory that contains the submission directories (default=current directory)
#' @param useMostRecent When set to TRUE, only use the most recent submission directory for each trial
#' @param keepUniqueDuplicates When set to TRUE, the script will keep all unique versions of duplicate rows (and won't prompt the user to pick)
#' 
#' @import readxl WriteXLS tibble dplyr readr digest
#' @export
mergeAll <- function(dir=".", useMostRecent=TRUE, keepUniqueDuplicates=FALSE) {
    mergeAccessions(dir, useMostRecent, keepUniqueDuplicates)
    mergeLocations(dir, useMostRecent, keepUniqueDuplicates)
    mergeTrials(dir, useMostRecent, keepUniqueDuplicates)
    mergePlots(dir, useMostRecent, keepUniqueDuplicates)
}


# Merge Data
# 
# Perform the actual merging of the data with the provided properties
# 
# @param dir path to parent directory of the submissions
# @param filename name of the file to merge
# @param key the name of the key column for finding duplicates
# @param useMostRecent When set to TRUE, only use the most recent submission directory for each trial
# @param keepUniqueDuplicates When set to TRUE, the script will keep all unique versions of duplicate rows (and won't prompt the user to pick)
mergeData <- function(dir, filename, key, useMostRecent, keepUniqueDuplicates) {
    print(sprintf("Merging %s files from submissions in %s", filename, dir))

    # Subset plot data with these columns
    col_oun <- 23
    col_start_traits <- 31
    is_plot_data <- grepl("^trial_observations", filename)

    # Get directories to parse
    dirs <- getSubmissionDirectories(dir, useMostRecent)

    # Read the Excel files
    tables <- list()
    for ( sub_dir in dirs ) {
        skip <- ifelse(is_plot_data, 3, 0)
        table <- readxl::read_excel(paste(dir, sub_dir, filename, sep="/"), skip=skip, col_types="text")
        if ( is_plot_data ) {
            cols <- c(23, col_start_traits:ncol(table))
            table <- table[, cols]
        }
        tables[[sub_dir]] <- table
    }

    # Get all of the unique column names
    cols <- c()
    for ( sub_dir in names(tables) ) {
        table <- tables[[sub_dir]]
        fc <- colnames(table)
        for ( c in fc ) {
            if ( ! c %in% cols ) {
                cols <- c(cols, c)
            }
        }
    }

    # Create an empty to template to merge into
    merged <- as_tibble(data.frame(matrix(ncol = length(cols), nrow = 0)))
    colnames(merged) <- cols
    for ( c in cols ) {
        merged[,c] <- as.character(merged[,c])
    }

    # Parse each table
    for ( sub_dir in names(tables) ) {
        table <- tables[[sub_dir]]
        merged <- dplyr::bind_rows(merged, table)
    }

    # Fix plot data (rename observationunit and move notes to end)
    if ( is_plot_data ) {
        colnames(merged)[which(colnames(merged) == "observationUnitName")] <- "observationunit_name"
        merged <- relocate(merged, notes, .after = last_col())
    }

    # Filter out duplicates
    filtered <- removeDuplicates(merged, key, keepUniqueDuplicates)

    # Write the filtered table
    dir.create(file.path(dir, "merged"), showWarnings=FALSE)
    WriteXLS::WriteXLS(filtered, paste(dir, "merged", filename, sep="/"))
}



# Remove Duplicates
# 
# Remove duplicate rows that have the same value in the specified key column
# 
# @param merged The merged tibble that contains all of the rows
# @param key_column The name of the column that will be used to find unique values
# @param keepUniqueDuplicates When TRUE, the script will not prompt to pick a unique duplicate row, but will keep all of them
# 
# @return a tibble with the duplicate rows removed
removeDuplicates <- function(merged, key_column, keepUniqueDuplicates) {
    
    # Get unique keys of the key column
    keys <- sort(unique(merged[[key_column]]))
    filtered <- readr::read_csv("\n", col_names=colnames(merged), show_col_types=FALSE)
    
    # Parse each key
    for ( key in keys ) {
        
        # Subset the merged data for the key and get the number of rows
        s <- dplyr::filter(merged, !!as.name(key_column) == key)
        count <- nrow(s)

        # Key is only found once, add it to the filtered table
        if ( count == 1 ) {
            filtered <- rbind(filtered, s)
        }

        # Key is found more than once, compare the rows
        else {

            # list of unique rows (hash is the key, row is the value)
            unique_rows <- list()
            
            # Parse each row
            for ( i in c(1:count) ) {
                row <- s[i,]
                row_string <- ""
                
                # Create and store a hash of the row's values
                for ( j in colnames(row) ) {
                    row_string <- paste(row_string, row[[j]], sep="|")
                }
                hash <- digest::digest(row_string, algo="md5")
                unique_rows[[hash]] <- row
            }

            # There is only one unique row, add it to the filtered table
            if ( length(names(unique_rows)) == 1 ) {
                filtered <- rbind(filtered, unique_rows[[names(unique_rows)[[1]]]])
            }

            # There were multiple unique rows, the user needs to choose one or keep all
            else {
                print(sprintf("WARNING: There were multiple rows for %s that are different!", key))

                # Display the unique rows and prompt user to pick one (or all) to keep
                if ( !keepUniqueDuplicates ) {
                    for ( i in c(1:length(names(unique_rows))) ) {
                        print(sprintf("ROW #%i", i))
                        print(unique_rows[[names(unique_rows)[[i]]]])
                    }
                    print("all: Keep all rows in the merged file")
                    keep <- readline(prompt="Enter row to keep (#/all): ")
                }

                # Keep all unique duplicates
                else {
                    keep <- "all"
                }
                
                # Keep all of the rows
                if ( keep == "all" ) {
                    for ( hash in names(unique_rows) ) {
                        filtered <- rbind(filtered, unique_rows[[hash]])
                    }
                }
                # Keep the selected row
                else {
                    filtered <- rbind(filtered, unique_rows[[names(unique_rows)[[as.integer(keep)]]]])
                }

            }
        }
    }

    # Return the filtered subset of the merged table
    return(filtered)

}



# Get Submission Directories
# 
# Get the list of submission directories from the parent directory
# 
# @param dir The path to the parent directory that contains the submission directores
# @param useMostRecent When set to TRUE, only include the most recent submission directory for each trial
# 
# @return vector of submission directories to parse
getSubmissionDirectories <- function(dir, useMostRecent=TRUE) {
    
    # Get the submission directories from the provided directory
    all_dirs <- list.files(dir, pattern="[0-9]{8}_[0-9]{6}_[0-9]+")
    rtn_dirs <- c()
    
    # Get the most recent submission of each trial
    if ( useMostRecent ) {
        trial_ts <- list()
        rtn_dirs_list <- list()
        for ( dir in all_dirs ) {
            parts <- strsplit(dir, "_")[[1]]
            trial <- parts[3]
            ts <- paste0(parts[1], parts[2])
            prev_ts <- trial_ts[[trial]]
            if ( is.null(prev_ts) || ts > prev_ts ) {
                trial_ts[[trial]] <- ts
                rtn_dirs_list[[trial]] <- dir
            }
        }
        for ( trial in names(rtn_dirs_list) ) {
            rtn_dirs <- c(rtn_dirs, rtn_dirs_list[[trial]])
        }
    }

    # Return all submissions, may include duplicate trials
    else {
        rtn_dirs <- all_dirs
    }

    # Return the filtered directories
    return(rtn_dirs)

}
TriticeaeToolbox/breedbase.R documentation built on March 23, 2024, 10:29 a.m.