R/dataload_supp.R

Defines functions missing.imputation mctools.matching extraction

Documented in extraction mctools.matching missing.imputation

# Supplemental functions not-exported 

#' Matching between unq_ids and mblid for metabolomics to match with 16S data
#' @description Matching unq_ids and mblids for data. First it searches for missing mblids in the
#'     data set and capture their unq_ids. Then it subsets the reference according to the unq_ids
#'     found. Then it selects the time period based upon the timelabel. For every row of data we check
#'     if their unq_ids is in the subsetted reference's unq_id, then we grab the indices of those ones.
#'     If there are more than one unq_id, we select the one with the highest read count. If not, we
#'     grab the mblid and inputted it in.
#' @param reference The data frame containing references
#' @param data The data frame of the metabolomic data frame of interest
#' @param timelabel The label of time point of interest

missing.imputation <- function(reference,data,timelabel){
  message("Imputing ids based on reference...")
  # what indexes where missing
  missing.index <- which(data$mblid == "")
  no.mblid.index <- data$unq_id[missing.index]
  
  #find entries with unq_ids identified prior, with the appropriate time period label.
  subsetted.reference <- subset(reference, rlang::.data$unq_id %in% no.mblid.index)
  subsetted.reference <- subset(subsetted.reference, rlang::.data$TimePeriod == timelabel)
  subsetted.reference <- subset(subsetted.reference, rlang::.data$specimen_type == "Stool")
  
  # extracting appropriate mblid and assign it to the data
  for (i in 1:nrow(data)){
    if (data$unq_id[i] %in% subsetted.reference$unq_id){
      index <- which(subsetted.reference$unq_id == data$unq_id[i])
      if (length(index) > 1){
        index <- index[which.max(subsetted.reference$SequenceCount[index])]
      }
      data$mblid[i] <- subsetted.reference$mblid[index]
    }
  }
  
  message("Check if there are remaining samples with missing ids...")
  missing <- data$unq_id[which(data$mblid == "")]
  message(paste("Number of samples removed due to inability to find corresponding mblids", length(missing)))
  data <- data[-which(data$unq_id %in% missing),]
  return(data)
}

#' @title Matching functions between two data sets
#' @description A function to match between two data sets (either metabolomics and taxonomic or
#'    between metabolomic data sets). For matching between metabolomics and taxonomic data, translated
#'    mblids are used. For matching between complimentary metabolomics data sets, the unq_ids are used.
#' @param data.1 The first data frame to be inputted, what it is depends on the variable `type`
#' @param data.2 The second data frame to be inputted, what it is depends on the variable `type`
#' @param type Which matching to be done. If `type == "cross"` then it is matching between metabolomics
#'    (`data.1`)and taxonomic (`data.2`) data sets. If `type == "same"` it's between two metabolomics data
#'    sets.

mctools.matching <- function(data.1, data.2, type){
  if (type == "cross"){
    metabo.data <- data.1
    tax.data <- data.2
    mblid <- metabo.data$mblid
    index <- match(mblid, rownames(tax.data))
    unmatch <- which(is.na(index))
    message(paste("The number of unmatched samples is", length(unmatch)))
    tax.index <- as.vector(stats::na.omit(index))
    list <- list(tax.index, unmatch)
    names(list) <- c("tax.index","metab.unmatch")
  } else if (type == "same"){
    metabo.data.1 <- data.1
    metabo.data.2 <- data.2
    id.1 <- unique(metabo.data.1$unq_id)
    id.2 <- unique(metabo.data.2$unq_id)
    
    # getting matches by unq_id
    match_unq_id <- match(id.1, id.2)
    match_unq_1 <- as.numeric(stats::na.omit(match_unq_id))
    match_unq_2 <- which(!is.na(match_unq_id))
    
    # extracting appropriate metabolomic profiles
    list <- list(match_unq_1, match_unq_2)
    names(list) <- c("match.index.1", "match.index.2")
  }
  return(list)
}

#' @title Extracting data sets from index after matching  
#' @description A function to simplify the extraction process for multiple time points after
#'      getting the extraction indices from `mctools.matching`.   
#' @param metabo Metabolic data set  
#' @param tax Taxonomic data set
extraction <- function(metabo, tax){
  index <- mctools.matching(data.1 = metabo, data.2 = tax, type = "cross")
  tax.index <- index$tax.index
  metab.index <- index$metab.unmatch
  
  metabo <- metabo[-metab.index,]
  tax <- tax[tax.index,]
  list <- list(metabo, tax)
  return(list)
}
quangnguyen1995/MCTools documentation built on May 23, 2019, 8:56 a.m.