R/mixing_utilities.R

#' @section Mixing Utilities:
#'
#' This section is devoted to functions used in the study of mixing processes from hydrogeochenmical information.
#'
#' This section includes the definition of five functions:
#' m3_mixing_model,select_end_members,uncertainty_m3_mixing_model,plot_m3_mixing_results,constrained_lm
#' @docType package
#' @name GQAnalyzer
NULL
#' @title
#' m3_mixing_model
#' @description
#' Function that implements the M3 mixing model. This model is based on the projection of
#' the chemical information in a low-dimensional space defined using Principal Component
#' Analysis (PCA). The compositions of pre-defined end-members are also projected and from
#' these coordinates, the mixing ratios of each end-member are calculated using a constrained
#' least-squares procedure (mixing ratios add up to 1).
#' @param gdata A geochemical_dataset object with the major ions only.
#' @param end.members If provided, this is a numeric vector with the indices of the end members
#' to be used in the mixing calculations. If it is not specified then the end members are
#' defined using the convex hull of the first two principal components.
#' @return
#' This function returns a list with the following entries:
#' \itemize{
#' \item mixing.ratios: Matrix with the mixing ratios estimated using constrained
#' least-squares.
#' \item res.pca: Object returned by the princomp function. This object contains the
#' eigenvectors, eigenvalues, scores and other results of the PCA.
#' \item res.lm: List with the results of the multiple linear regression applied on each
#' ion included in the gdata geochemical_dataset object. Each entry of this list is a lm
#' object for each ion.
#' \item names.lm: Character vector with the names of the ions used in the mixing analysis.
#' \item X:
#' \item dataset:
#' }
#' @author
#' Oscar Garcia-Cabrejo \email{khaors@gmail.com}
#' @family mixing functions
#' @importFrom stats princomp lm residuals predict
#' @importFrom graphics plot
#' @importFrom MASS stepAIC
#' @importFrom grDevices chull
#' @references
#' Laaksoharju, M., Skarrman, C., \& Skarrman, E. (1999). Multivariate mixing and mass
#' balance (M3) calculations, a new tool for decoding hydrogeochemical information.
#' Applied Geochemistry, 14(7), 861–871. http://doi.org/10.1016/S0883-2927(99)00024-4
#' @export
m3_mixing_model <- function(gdata, end.members = NULL){
  p <- NULL
  conc_ions <- colnames(gdata$dataset)
  meql_ions <- c("Ca", "Mg", "Na", "K", "HCO3", "CO3", "Cl", "SO4")
  if(class(gdata) != "geochemical_dataset"){
    stop('ERROR: A geochemical_dataset is required as input')
  }
  # Step 1. Multivariate Analysis
  # Scaling dataset
  dataset <- scale(gdata$dataset[meql_ions], center = TRUE, scale = TRUE)
  nsamples <- nrow(dataset)
  res.pca <- princomp(dataset, cor = TRUE, scores = TRUE)
  res.pca.def <- res.pca$scores[,1:2]
  n.endmembers <- 0
  if(is.null(end.members)){
    pos.end.members <- chull(res.pca.def[,1], res.pca.def[,2])
    n.endmembers <- length(pos.end.members)
  }
  else{
    pos.end.members <- end.members
    n.endmembers <- length(end.members)
  }
  # Step 2. Calculate mixing ratios using end members
  end.members.mat <- t(res.pca.def[pos.end.members,])
  FFt <- t(end.members.mat)%*%end.members.mat
  Ie <- matrix(1.0, nrow = n.endmembers, ncol = 1)
  tmp1 <- cbind(FFt,Ie)
  A <- rbind(tmp1, c(t(Ie), 0))
  res.mixing.ratios <- matrix(0.0, nrow = nsamples, ncol = n.endmembers)
  for(i in 1:nsamples){
    b <- rbind(t(end.members.mat)%*%res.pca.def[i,], 1)
    mixing.ratios <- solve(A+1e-6*diag(n.endmembers+1),b)
    res.mixing.ratios[i,] <- mixing.ratios[1:n.endmembers]
  }
  # Correct mixing ratios for end members
  for(i in 1:n.endmembers){
    current.ratio <- rep(0.0, n.endmembers)
    current.ratio[i] <- 1
    res.mixing.ratios[pos.end.members[i],] <- current.ratio
  }
  #
  # Step 3. Mass Balance Calculations
  ions <- c("Ca", "Mg", "Na", "K", "HCO3", "CO3", "Cl","SO4")
  X <- matrix(0.0, nrow = nsamples, ncol = 2)
  residuals <- matrix(0.0, nrow = nsamples, ncol = length(ions))
  predicted <- matrix(0.0, nrow = nsamples, ncol = length(ions))
  lm.models <- list()
  for(i in 1:length(ions)){
    current.ion <- ions[i]
    y <- dataset[,i]
    for(isample in 1:nsamples){
      X[isample,] <- res.mixing.ratios[isample,]%*%t(end.members.mat)
    }
    current.df <- data.frame(y = y, PC1 = X[,1], PC2 = X[,2])
    res.lm <- lm(y ~ PC1 + PC2, data = current.df)
    lm.models[[i]] <- res.lm
    predicted[,i] <- predict(res.lm)
    residuals[,i] <- residuals(res.lm)
  }
  #
  res <- list(std.dataset = dataset,
              mixing.ratios = res.mixing.ratios,
              res.pca = res.pca.def,
              end.members = t(end.members.mat),
              predicted = predicted,
              residuals = residuals,
              lm.models = lm.models)
  return(res)
}
#
#' @title
#' select_end_members
#' @description
#' Function to estimate the end members of a solution
#' @param gdata A geochemical_dataset object
#' @return
#' This function returns
#' @author
#' Oscar Garcia-Cabrejo, \email{khaors@gmail.com}
#' @family mixing functions
#' @export
select_end_members <- function(gdata){
  if(class(gdata) != "geochemical_dataset"){
    stop('ERROR: A geochemical_dataset is required as input')
  }
  dataset <- scale(gdata$dataset, center = TRUE, scale = TRUE)
  res.pca <- princomp(dataset, cor = TRUE, scores = TRUE)
  plot(res.pca)
}
#' @title
#' uncertainty_m3_mixing_model
#' @description
#' Function to quantify the uncertainty in the mixing ratios due to the variability in
#' the concentrations of the end.members using Monte Carlo simulation.
#' @param gdata A geochemical_dataset object
#' @param end.members If provided, this is a numeric vector with the indices of the end members
#' to be used in the mixing calculations. If it is not specified then the end members are
#' defined using the convex hull of the first two principal components.
#' @param lower A matrix with the lower limits of the concentrations of the end.members.
#' @param upper A matrix with the upper limits of the concentrations of the end.members.
#' @param nreal The number of realizations.
#' @param seed Random seed.
#' @return
#' This function returns a list with the following entries:
#' \itemize{
#' \item end.members: realizations of the end.members 
#' \item mixing.ratios: estimated mixing ratios
#' }
#' @details 
#' This function implements a procedure designed to assess the impact of the 
#' compositional variability of the end members on the estimated mixing ratios. 
#' This procedure is based on the random generation of a predefined number of 
#' end member realizations where the concentration of each species is assumed 
#' to follow a log-normal distribution. These distributions are defined in 
#' terms of the upper and lower limits which corresponds to their  0.05 and 0.95 
#' quantiles. Then a large number of end member compostions are randomly 
#' generated and from each one the mixing ratios are estimated.   
#' @author
#' Oscar Garcia-Cabrejo, \email{khaors@gmail.com}
#' @family mixing functions
#' @importFrom stats rlnorm
#' @export
#' @references
#' Laaksoharju, M., Skarrman, C., \& Skarrman, E. (1999). Multivariate mixing and mass
#' balance (M3) calculations, a new tool for decoding hydrogeochemical information.
#' Applied Geochemistry, 14(7), 861–871. http://doi.org/10.1016/S0883-2927(99)00024-4
uncertainty_m3_mixing_model <- function(gdata, end.members = NULL, 
                                        lower, upper, nreal = 100, 
                                        seed = 12345){
  set.seed(seed)
  if(class(gdata) != "geochemical_dataset"){
    stop('ERROR: A geochemical_dataset is required as input')
  }
  #
  if(class(lower) != "matrix" | class(upper) != "matrix"){
    stop("ERROR: the lower and upper input variables must be numeric vectors")
  }
  #
  conc_ions <- colnames(gdata$dataset)
  meql_ions <- c("Ca", "Mg", "Na", "K", "HCO3", "CO3", "Cl", "SO4")
  #
  dataset <- gdata$dataset[meql_ions]
  #
  n.endmember <- nrow(lower)
  nvar <- ncol(lower)
  nsamples <- nrow(dataset)
  # Define alpha and beta parameters
  alpha <- (log(upper)+log(lower))/2
  beta <- (log(upper)-log(lower))/(2*2.576)
  # Define mu and sigma
  mu <- exp(alpha + (beta^2)/2)
  sigma <- mu*sqrt(exp(beta^2)-1)
  # Generate random end.members
  current.end.members <- array(0.0, dim = c(n.endmember, nvar, nreal))
  for(ireal in 1:nreal){
    for(iend in 1:n.endmember){
      for(ivar in 1:nvar){
        current.end.members[iend,ivar,] <- rlnorm(nreal, 
                                                  meanlog = alpha[iend, ivar],
                                                  sdlog = beta[iend, ivar])
      }
    }
  }
  # Estimate the mixing ratios
  current.mixing.ratios <- array(-1.0, dim = c(nsamples, n.endmember, nreal))
  pos.end.member <- seq((nsamples+1),(nsamples+n.endmember), by = 1)
  end.members.array <- array(-999.99, dim = c(n.endmember, 2, nreal))
  for(ireal in 1:nreal){
    # Define current end member
    currentEM <- as.data.frame(current.end.members[,,ireal])
    names(currentEM) <- names(dataset)
    # Add currentEM to original dataset
    #current.dataset <- rbind(dataset, currentEM)
    current.dataset <- dataset
    current.dataset[end.members,] <- currentEM
    current.dataset1 <- scale(current.dataset, center = TRUE, scale = TRUE)
    # PCA
    res.pca <- princomp(current.dataset1, cor = TRUE, scores = TRUE)
    res.pca.def <- res.pca$scores[,1:2]
    end.members.mat <- t(res.pca.def[end.members,])
    end.members.array[,,ireal] <- t(end.members.mat)
    FFt <- t(end.members.mat)%*%end.members.mat
    Ie <- matrix(1.0, nrow = n.endmember, ncol = 1)
    tmp1 <- cbind(FFt,Ie)
    A <- rbind(tmp1, c(t(Ie), 0))
    res.mixing.ratios <- matrix(0.0, nrow = nsamples, ncol = n.endmember)
    # Estimate mixing ratios
    for(i in 1:nsamples){
      b <- rbind(t(end.members.mat)%*%res.pca.def[i,], 1)
      mixing.ratios <- solve(A+1e-6*diag(n.endmember+1),b)
      res.mixing.ratios[i,] <- mixing.ratios[1:n.endmember]
    }
    #Correct mixing ratios for end members
    for(i in 1:n.endmember){
      current.ratio <- rep(0.0, n.endmember)
      current.ratio[i] <- 1
      res.mixing.ratios[end.members[i],] <- current.ratio
    }
    # Save estimated mixing ratios
    current.mixing.ratios[,,ireal] <- res.mixing.ratios
  }
  #
  res <- list(end.members = current.end.members, 
              mixing.ratios = current.mixing.ratios, 
              end.members.pca = end.members.array)
  return(res)
}
#' @title
#' plot_m3_mixing_results
#' @description
#' Function to plot different results obtained from the application of the M3 mixing model
#' @param gdata A geochemical_dataset object
#' @param mixing.res A list with ther results of the m3_mixing_model function
#' @param type A character string specifying the plot type to be created. Currently the
#' supported options are:
#' \itemize{
#' \item concentration
#' \item mixing.ratio
#' }
#' @param element A character string with the name of the ion
#' @return
#' This function returns a ggplot2 object with the requested plot.
#' @author
#' Oscar Garcia-Cabrejo \email{khaors@gmail.com}
#' @family mixing functions
#' @importFrom ggplot2 ggplot geom_point geom_polygon ggtitle
plot_m3_mixing_results <- function(gdata, mixing.res,
                                   type = c("concentration",
                                            "mixing.ratio",
                                            "residual"),
                                   element){
  if(class(gdata) != "geochemical_dataset"){
    stop('ERROR: A geochemical_dataset is required as input')
  }
  x <- NULL
  y <- NULL
  p <- NULL
  res.pca.df <- NULL
  current.title <- NULL
  ions <- list("Ca" = 1, "Mg" = 2, "Na" = 3, "K" = 4, "HCO3" = 5,
               "CO3" = 6, "Cl" = 7, "SO4" = 8)
  end.members.df <- data.frame(x = mixing.res$end.members[,1],
                               y = mixing.res$end.members[,2])
  #
  if(type == "concentration"){
    res.pca.df <- data.frame(PC1 = mixing.res$res.pca[,1],
                             PC2 = mixing.res$res.pca[,2],
                             concentration = mixing.res$residuals)
    #
    current.title <- paste0("Concentration: ", element)
    p <- ggplot() + geom_point(aes_string(x = "PC1", y = "PC2",
                                          color = paste0("residuals.",
                                                         ions[[element]])),
                               data = res.pca.df, size = 3) +
      scale_color_gradientn(colors=rainbow(10)) +
      geom_polygon(aes(x = x, y = y), data = end.members.df,
                   fill = NA, colour = "black") +
      theme_bw() +
      ggtitle(current.title)
  }
  else if(type == "mixing.ratio"){
    if(class(element) != "numeric"){
      stop("ERROR: the requested end member must be a number")
    }
    res.pca.df <- data.frame(PC1 = mixing.res$res.pca[,1],
                             PC2 = mixing.res$res.pca[,2],
                             mixing.ratio = mixing.res$residuals)
    #
    current.title <- paste0("Mixing Ratio: End Member", as.character(element))
    p <- ggplot() + geom_point(aes_string(x = "PC1", y = "PC2",
                                          color = paste0("mixing.ratio.",
                                                         as.character(element))),
                               data = res.pca.df, size = 3) +
      scale_color_gradientn(colors=rainbow(10)) +
      geom_polygon(aes(x = x, y = y), data = end.members.df,
                   fill = NA, colour = "black") +
      theme_bw() +
      ggtitle(current.title)
  }
  else if(type == "residual"){
    res.pca.df <- data.frame(PC1 = mixing.res$res.pca[,1],
                             PC2 = mixing.res$res.pca[,2],
                             residuals = mixing.res$residuals)
    #
    current.title <-
    p <- ggplot() + geom_point(aes_string(x = "PC1", y = "PC2",
                                   color = paste0("residuals.",
                                                  ions[[element]])),
                               data = res.pca.df, size = 3) +
      scale_color_gradientn(colors=rainbow(10)) +
      geom_polygon(aes(x = x, y = y), data = end.members.df,
                   fill = NA, colour = "black") +
      theme_bw() +
      ggtitle(element)
  }
  return(p)
}
#' @title
#' constrained_lm
#' @description
#' Constrained least-squares function. The constraint included ensures that the sum of the
#' estimated coefficients is equal to 1 and the coefficients are positive.
#' @param y A column matrix with the y values
#' @param X The design matrix of the problem
#' @param tol A numeric value with the tolerance specified to avoid numerical instability
#' during the solution of the least-squares problem.
#' @return
#' This function returns a numeric vector with the values of the estimated coeffcicients.
#' @author
#' Oscar Garcia-Cabrejo, \email{khaors@gmail.com}
#' @family mixing functions
#' @export
constrained_lm <- function(y, X, tol = 1e-12){
  nx <- nrow(X)
  FFt <- t(X)%*%X
  Ie <- matrix(1.0, nrow = nx, ncol = 1)
  FFtconstrained <- cbind(FFt, Ie)
  Xmod <- rbind(FFtconstrained, c(t(Ie), 0))
  b <- rbind(t(X)%*%y, 1)
  res <- solve(Xmod+tol*diag(nx+1),b)
  return(res)
}
#' @title
#' mix_model
#' @description
#' Function to estimate mixing ratios with uncertain end.members
#' @param gdata.gd A geochemical_dataset
#' @param end.members.gd A geochemical_dataset
#' @return
#' A numeric vector
#' @author
#' Oscar Garcia-Cabrejo \email{khaors@gmail.com}
#' @family mixing functions
#' @importFrom pracma inv
mix_model <- function(gdata.gd, end.members.gd){
  if(class(gdata.gd) != "geochemical_dataset" |
     class(end.members.gd) != "geochemical_dataset"){
    stop('ERROR: A geochemical_dataset is required as input')
  }
  dataset.mat <- as.matrix(gdata.gd$dataset)
  ndat <- nrow(dataset.mat)
  nvar <- ncol(dataset.mat)
  end.members.mat <- as.matrix(end.members.gd$dataset)
  nend <- nrow(end.members.mat)
  #1. Estimate initial mixing ratios using constrained lm
  mixing.ratios.mat <- matrix(0.0, nrow = ndat, ncol = nend)
  for(idat in 1:ndat){
    y <- matrix(dataset.mat[idat,], nrow = nvar, ncol = 1)
    mixing.ratios.mat[idat,] <- constrained_lm(y, end.members.mat, tol = 1e-12)
  }
  #2. Estimation of mu_s assuming mixing ratios known
  Gamma <- cbind(mixing.ratios.mat, -1*diag(ndat))
  Cs <- inv(Gamma%*%t(Gamma) + 1e-12*diag(ndat))
  zs <- dataset.mat
  lambdas <- -Cs%*%Gamma%*%zs
  sample.conc <-  zs - Gamma%*%lambdas
  #3. Estimate mixing ratios (Delta) assuming mu_s known

}
khaors/GQAnalyzer documentation built on May 29, 2019, 8:35 a.m.