R/IMPUTATION_functions.R

Defines functions get_gen_cv_sets impute_mean calc_impute_acc test_mean_impute_cv

# This script includes all custom functions for the IMPUTATION_demo.R

# This function uses a cvFolds assignment to set matrix values to missing and retains the original values
# folds = a cvFolds object that split all genotype indices into different groups
# genotypes = a numeric genotype matrix (rows = taxa, columns = markers)
# fold_ct = the number of folds to use for genotype subsetting (if NULL, use all folds)
get_gen_cv_sets <- function(folds, genotypes, fold_ct = NULL)
{
  cv_genotype_list <- list()
  
  if (!is.null(fold_ct))
  {
    iterator <- 1:fold_ct
  } else
  {
    iterator <- unique(folds$which)
  }
  
  for (i in iterator)
  {
    fold_genotypes <- unlist(genotypes) # Decompose the matrix to simplify cross-validation index reference
    fold_genotypes[folds$subsets[folds$which == i]] <- NA # Set the values to missing
    fold_genotypes <- data.frame(matrix(fold_genotypes, ncol = ncol(genotypes), nrow = nrow(genotypes))) # Recompose the genotype matrix
    names(fold_genotypes) <- names(genotypes)
    row.names(fold_genotypes) <- row.names(genotypes)
    
    fold_list <- list(fold_genotypes) 
    names(fold_list) <- c('fold_genotypes')
    
    cv_genotype_list[[i]] <- fold_list
    names(cv_genotype_list)[i] <- paste('fold_', i)
  }
  
  return(cv_genotype_list)
}

# Perform mean imputation for a given genotype table
# genotypes
impute_mean <- function(genotypes)
{
  for (i in 1:ncol(genotypes))
  {
    na_indices <- is.na(genotypes[,i])
    genotypes[na_indices,i] <- mean(genotypes[,i], na.rm = T)
  }
  return(genotypes)
}

# Calculate imptuation accuracy
calc_impute_acc <- function(genotypes, genotypes_na, genotypes_imp)
{
  na_indices <- which(is.na(genotypes_na), arr.ind = T) # Get the indices of missing values
  
  orig_values <- genotypes[na_indices]
  imp_values <- genotypes_imp[na_indices]
  
  accuracy <- cor(orig_values, imp_values)
  
  return(accuracy)
}

# This function will test the accuracy of mean imputation across a set of folds
test_mean_impute_cv <- function(genotypes, genotypes_na_folds)
{
  accuracy_vect <- vector(mode = 'numeric', length = length(genotypes_na_folds))
  
  for (i in 1:length(accuracy_vect))
  {
    print(paste("Testing fold: ", i, sep = ''))
    genotypes_na <- genotypes_na_folds[[i]][[1]] # Extract the genotypes with missing values
    genotypes_imp <- impute_mean(genotypes_na) # Perform imputation
    accuracy_vect[i] <- calc_impute_acc(genotypes, genotypes_na, genotypes_imp) # Calculate an accuracy
  }
  
  return(accuracy_vect)
}
mtmcgowan/CropSci545 documentation built on Feb. 27, 2020, 12:43 a.m.