bigutilsr: Utility Functions for Large-scale Data

################################################################################

#n=No. of Samples, p=No. of features, m=No. of distant spikes

#d-estimator for the spikes
d.eval<-function(samp.eval,m,p,n)
{
  gamma=p/n
  spike.est<-rep(0,m)
  for(i in 1:m)
  {
    temp=0
    for(j in (m+1):length(samp.eval))
      temp=temp+samp.eval[j]/(samp.eval[i]-samp.eval[j])
    spike.est[i]<-samp.eval[i]/(1+(gamma/(p-m))*temp)
  }
  return(spike.est)
}

#d-estimator for the angle between eigenvectors
d.angle<-function(spike.est,samp.eval,m,p,n)
{
  gamma=p/n
  angle.est<-rep(0,m)
  for(i in 1:m)
  {
    temp=0
    for(j in (m+1):length(samp.eval))
      temp=temp+samp.eval[j]/(samp.eval[i]-samp.eval[j])^2
    angle.est[i]<-sqrt(1/(1+(gamma*spike.est[i]/(p-m))*temp))
  }
  return(angle.est)
}

#psi function minus the sample eigenvalue
psi<-function(lambda,samp.eval,pop.nonsp,m,p,n)
{
  gamma=p/n
  temp<-0
  for(i in (m+1):length(which(pop.nonsp>0)))
  {
    temp<-temp+pop.nonsp[i]/(lambda-pop.nonsp[i])
  }
  temp<-1+temp*gamma/(p-m)
  return(lambda*temp-samp.eval)
}

#First derivative of the psi function
psiprime<-function(lambda,pop.nonsp,m,p,n)
{
  gamma=p/n
  temp<-0
  for(i in (m+1):length(which(pop.nonsp>0)))
  {
    temp<-temp+(pop.nonsp[i]/(lambda-pop.nonsp[i]))^2
  }
  return(1-temp*gamma/(p-m))
}


#lambda estimator for the spikes
l.eval<-function(samp.eval,pop.nonsp,m,p,n)
{
  spike.est<-rep(-100,m)	#A negative value in the final output would mean this is not a distant spike
  alpha<-stats::uniroot(psiprime,interval=c(pop.nonsp[1]+1/p,samp.eval[1]-1/p),pop.nonsp,m,p,n)$root	#Cutoff for distant spikes
  for(i in 1:m)
  {
    psi.alpha<-psi(alpha,samp.eval[i],pop.nonsp,m,p,n)	#This is actually psi(alpha)-d
    if(psi.alpha<0)
      spike.est[i]<-stats::uniroot(psi,interval=c(alpha,samp.eval[1]-1/p),samp.eval[i],pop.nonsp,m,p,n)$root
  }
  return(spike.est)
}

#lambda estimator for the angle between eigenvectors
l.angle<-function(spike.est,samp.eval,pop.nonsp,m,p,n)
{
  gamma=p/n
  angle.est<-rep(0,m)
  for(i in 1:m)
  {
    r2.est<-psiprime(spike.est[i],pop.nonsp,m,p,n)
    angle.est[i]<-sqrt(spike.est[i]*r2.est/samp.eval[i])
  }
  return(angle.est)
}


#OSP based estimator for the spikes
osp.eval<-function(samp.eval,m,p,n)
{
  gamma<-p/n
  a1<-0
  d<-p*samp.eval/sum(samp.eval)
  repeat{
    temp<-(d[1:m]+1-gamma)^2-4*d[1:m]
    m<-max(which(temp>=0))
    lambda<-(d[1:m]+1-gamma+sqrt((d[1:m]+1-gamma)^2-4*d[1:m]))/2
    a<-sum(lambda)+p-m
    d<-a*samp.eval/sum(samp.eval)
    if(abs(a-a1)<0.001) break
    a1<-a
  }
  nonspikes.est<-sum(samp.eval)/a
  spike.est<-lambda*nonspikes.est
  return(list(spikes=spike.est,nonspikes=nonspikes.est,n.spikes=m))
}


#OSP based estimator for the angle between eigenvectors
osp.angle<-function(spike.est,p,n)
{
  gamma=p/n
  temp1=1-gamma/(spike.est-1)^2
  temp2=1+gamma/(spike.est-1)
  return(sqrt(temp1/temp2))
}


#All functions used in Karoui's algorithm
#Inverse Steiltjes (Inverse w.r.t. z)
invsteiltjes<-function(v,z,samp.scaled,m,n)
{
  rep<-1
  repeat{
    vs=sum(1/(samp.scaled-z))/(n-m)
    vs<-c(Re(vs),Im(vs))
    v1<-Re(v)
    v2<-Im(v)
    J<-matrix(0,2,2)
    J[1,1]<-sum(((samp.scaled-Re(z))^2-Im(z)^2)/((samp.scaled-Re(z))^2+Im(z)^2)^2)/(n-m)
    J[2,2]<-J[1,1]
    J[1,2]<--sum(2*Im(z)*(samp.scaled-Re(z))/((samp.scaled-Re(z))^2+Im(z)^2)^2)/(n-m)
    J[2,1]<--J[1,2]
    znew<-c(Re(z),Im(z))-solve(J)%*%(vs-c(v1,v2))
    znew<-complex(real=znew[1],imaginary=znew[2])
    if(norm(as.matrix(vs-c(v1,v2)),type='f')<0.00001 || rep>1000) break
    z<-znew
    rep=rep+1
  }
  if(rep>1000)
  {
    return(1)
  } else {
    return(znew)
  }
}

#Produce quantiles as non-spikes from the estimated LSD
quant<-function(q,t,cuml)
{
  if (q>1 || q<0)
  {
    return(NULL)
  } else {
    for(i in 1:length(t))
    {
      if (q<=cuml[i]) break
    }
    if (q==cuml[i])
    {
      for(j in (i+1):length(t))
      {
        if(cuml[j]>cuml[i]) break
      }
      if(i==j-1)
      {
        return(t[i])
      } else {
        return(stats::runif(1,t[i],t[j-1]))
      }
    } else {
      return(t[i])
    }
  }
}

################################################################################

globalVariables("bw")

# Smoothing the LSD
popsmooth2 <- function(pop, t1, z, v, gamma, eimax, ncores) {

  # Loss function
  lossf <- function(pop, eimax, z, v, gamma) {
    gamma2 <- gamma / length(pop)
    tm1 <- eimax / pop[pop > 0]  # 1 / t
    term1 <- sapply(v, function(v_i) sum(1 / (tm1 + v_i)))
    e <- 1 / v + z - gamma2 * term1
    max(abs(Re(e)), abs(Im(e)))
  }

  seq_bw <- seq_log(length(pop) / 2, sqrt(length(pop)), 30)

  registerDoParallel(cl <- makeCluster(ncores))
  on.exit(stopCluster(cl), add = TRUE)
  pop_loss <- foreach(bw = seq_bw, .combine = 'rbind') %dopar% {
    pop <- stats::ksmooth(seq_along(pop), pop, bandwidth = bw)$y
    loss <- lossf(pop, eimax, z, v, gamma)
    list(pop, loss)
  }

  pop_loss[[which.min(pop_loss[, 2]), 1]]
}

################################################################################

karoui.nonsp <- function(samp.eval, m, p, n, ncores) {

  gamma <- p / n
  eimax <- samp.eval[m + 1]              # Largest sample eigenvalue
  eimin <- `if`(n > p, samp.eval[n], 0)  # Smallest sample eigenvalue

  samp.scaled <- samp.eval[(m + 1):n] / eimax
  rev <- seq(0, 1, 0.01)
  imv <- seq(0.001, 0.01, length.out = 5)
  v <- matrix(0, length(rev), length(imv))
  z <- matrix(mean(samp.scaled) + 1i, length(rev), length(imv))
  for (i in seq_along(rev)) {
    for (j in seq_along(imv)) {
      v[i, j] <- complex(real = rev[i], imaginary = imv[j])
      if (i == 1) {
        z[i, j] <- complex(real = mean(samp.scaled),
                           imaginary = 1 / Im(v[i, j]))
      }
      if (i == 2 && j > 1)
        z[i, j] <- z[i, (j - 1)]
      if (i > 2)
        z[i, j] <- z[(i - 1), j]

      flag.err <- 1
      try({
        z[i, j] <- invsteiltjes(v[i, j], z[i, j], samp.scaled, m, n)
        flag.err <- 0
      }, silent = TRUE)
      if (flag.err == 1) {
        try({
          z[i, j] <- complex(real = mean(samp.scaled), imaginary = 1 / Im(v[i, j]))
          z[i, j] <- invsteiltjes(v[i, j], z[i, j], samp.scaled, m, n)
          flag.err <- 0
        }, silent = TRUE)
      }
      if (Im(z[i, j]) == 0 || flag.err == 1)
        stop("The inverse stieltjes transform does not converge in Karoui's algorithm")
    }
  }
  z <- as.vector(z)

  # Recalculating v from z as the Steiljes transform
  v <- sapply(z, function(z_i) {
    sum(1 / (samp.scaled - z_i)) / (n - m)  # These are the Steiljes transforms
  })

  # Linear Programming problem
  eps <- 2^(-7)
  t <- seq(eimin / eimax, 1, 5e-3)
  ai <- seq(eimin / eimax, 1 - eps, eps)
  bi <- seq(eimin / eimax + eps, 1, eps)
  a <- c(rep(0, (length(t) + length(ai))), 1)
  A1 <- NULL
  A2 <- NULL
  A3 <- c(rep(1, (length(t) + length(ai))), 0)
  b3 <- 1
  b1 <- NULL
  b2 <- NULL
  for (i in seq_along(z)) {

    z1 <- z[i]
    v2 <- v[i] # This is the Steiljes transform for z1

    coef <- p / sum(n)
    coefs1 <- t / (1 + t * v2)
    coefs2 <- 1 / v2 - log((1 + v2 * bi) / (1 + v2 * ai)) / ((bi - ai) * v2^2)
    a11 <-  Re(coef * coefs1)
    a12 <-  Re(coef * coefs2)
    a21 <- -Im(coef * coefs1)
    a22 <- -Im(coef * coefs2)
    A1 <- rbind(A1, c(a11, a12, -1))
    A1 <- rbind(A1, c(a21, a22, -1))
    A2 <- rbind(A2, c(a11, a12,  1))
    A2 <- rbind(A2, c(a21, a22,  1))

    term1 <-  Re(v2) / (Mod(v2)^2) + Re(z1)
    term2 <- -Im(v2) / (Mod(v2)^2) + Im(z1)
    b1 <- c(b1, term1, -term2)
    b2 <- c(b2, term1, -term2)

    if (term1 < 0 || term2 > 0) break
  }
  sign <- c(rep("<=", nrow(A1)), rep(">=", nrow(A2)), rep("=", 1))

  ERROR_MSG <- "The solution for the linear programming problem is not available in Karoui's algorithm"

  flag.err <- 0
  try({
    simp <- lpSolve::lp(direction = "min", a, rbind(A1, A2, A3), sign, c(b1, b2, b3))
    flag.err <- 1
  })
  if (flag.err == 0) stop(ERROR_MSG)

  if (simp$status != 0) {
    flag.err <- 0
    try({
      simp <- boot::simplex(a, A1, b1, A2, b2, A3, b3)
      flag.err <- 2
    })
    if (flag.err == 0) stop(ERROR_MSG)
    flag.err <- `if`(simp$solved == 1, 2, 0)
  }
  if (flag.err == 0) stop(ERROR_MSG)

  t <- seq(eimin / eimax, 1, 5e-3)
  t1 <- sort(c(t, ai))
  if (flag.err == 1) {
    w1 <- simp$solution[seq_along(t)]
    w2 <- simp$solution[length(t) + seq_along(ai)]
  } else {
    w1 <- simp$soln[seq_along(t)]
    w2 <- simp$soln[length(t) + seq_along(ai)]
  }
  # Distribution function (Sample)
  cuml <- sapply(t1, function(t1_j) {
    a <- sum(w1[which(t <= t1_j)])
    ind <- which(bi <= t1_j)
    b <- sum(w2[ind])
    if ( (length(ind) == 0) || (max(ind) == length(bi)) ) {
      d <- 0
    } else {
      c <- max(ind) + 1
      d <- w2[c] * (t1_j - ai[c]) / (bi[c] - ai[c])
    }
    a + b + d
  })

  # Obtain Quantiles
  est <- sapply((p - m):1, function(i) {
    quant((i - 1) / (p - m), t1, cuml)
  })

  # Smoothing the non-spikes
  pop <- popsmooth2(est * eimax, t1, z, v, gamma, eimax, ncores = ncores)

  c(rep(pop[1], m), pop)
}

################################################################################

#' @inherit hdpca::select.nspike title description params return references
#' @param ncores Number of cores to be used. You can e.g. use [nb_cores()].
#'
#' @export
#'
pca_nspike <- function(samp.eval, p, n, n.spikes.max, ncores = 1) {

  if (length(samp.eval) == (n - 1)) samp.eval <- c(samp.eval, 0)
  if (length(samp.eval) != n) stop("'samp.eval' must have length n or (n-1).")

  m <- n.spikes.max
  repeat {
    pop.nonsp <- karoui.nonsp(samp.eval, m, p, n, ncores = ncores)
    eval.l <- l.eval(samp.eval, pop.nonsp, m, p, n)
    if (min(eval.l) > 0) {
      break
    } else {
      m <- min(which(eval.l == min(eval.l))) - 1
      if (m == 0) stop("No distant spike was found.")
    }
  }

  list(n.spikes = m,
       spikes = eval.l,
       nonspikes = pop.nonsp[(m + 1):p])
}

################################################################################

hdpc_est<-function(samp.eval,p,n,method=c("d.gsp","l.gsp","osp"),
                   n.spikes,n.spikes.max,n.spikes.out,nonspikes.out=FALSE, ncores)
{
  method<-match.arg(method)

  if(length(samp.eval)<n-1 || length(samp.eval)>n)
  {
    stop("samp.eval must have length n or (n-1)")
  } else {
    if(length(samp.eval)==n-1)	samp.eval<-c(samp.eval,0)
  }

  flag.sp.lambda<-0
  if(missing(n.spikes))
  {
    if(missing(n.spikes.max))
    {
      stop("Both n.spikes and n.spikes.max cannot be missing")
    } else {
      spike.select<-pca_nspike(samp.eval,p,n,n.spikes.max, ncores = ncores)
      n.spikes<-spike.select$n.spikes
      eval.l<-spike.select$spikes
      pop.nonsp<-spike.select$nonspikes
      pop.nonsp<-c(rep(pop.nonsp[1],n.spikes),pop.nonsp)
      angle.l<-l.angle(eval.l,samp.eval,pop.nonsp,n.spikes,p,n)
      corr.l<-angle.l*sqrt(samp.eval[1:n.spikes]/eval.l)
      shrink.l<-eval.l/samp.eval[1:n.spikes]
      flag.sp.lambda<-1
    }
  }
  if(method=="l.gsp")
  {
    if(missing(n.spikes.out))	n.spikes.out<-n.spikes
    if(n.spikes.out>n.spikes)	stop("n.spikes.out must be smaller than n.spikes")
    if(flag.sp.lambda==0)
    {
      pop.nonsp<-karoui.nonsp(samp.eval,n.spikes,p,n, ncores = ncores)
      eval.l<-l.eval(samp.eval,pop.nonsp,n.spikes,p,n)
      if(min(eval.l)<0)	stop("n.spikes is too large, the spectrum does not have that many distant spikes")
      angle.l<-l.angle(eval.l,samp.eval,pop.nonsp,n.spikes,p,n)
      corr.l<-angle.l*sqrt(samp.eval[1:n.spikes]/eval.l)
      shrink.l<-eval.l/samp.eval[1:n.spikes]
    }
    if(nonspikes.out)
    {
      return(list(spikes=eval.l[1:n.spikes.out],n.spikes=n.spikes,
                  angles=angle.l[1:n.spikes.out],correlations=corr.l[1:n.spikes.out],
                  shrinkage=shrink.l[1:n.spikes.out],nonspikes=pop.nonsp[(n.spikes+1):p]))
    } else {
      return(list(spikes=eval.l[1:n.spikes.out],n.spikes=n.spikes,
                  angles=angle.l[1:n.spikes.out],correlations=corr.l[1:n.spikes.out],
                  shrinkage=shrink.l[1:n.spikes.out]))
    }
  } else if(method=="d.gsp") {
    if(missing(n.spikes.out))	n.spikes.out<-n.spikes
    if(n.spikes.out>n.spikes)	stop("n.spikes.out must be smaller than n.spikes")
    eval.d<-d.eval(samp.eval,n.spikes,p,n)
    angle.d<-d.angle(eval.d,samp.eval,n.spikes,p,n)
    corr.d<-angle.d*sqrt(samp.eval[1:n.spikes]/eval.d)
    shrink.d<-eval.d/samp.eval[1:n.spikes]
    return(list(spikes=eval.d[1:n.spikes.out],n.spikes=n.spikes,
                angles=angle.d[1:n.spikes.out],correlations=corr.d[1:n.spikes.out],
                shrinkage=shrink.d[1:n.spikes.out]))
  } else if(method=="osp") {
    osp.out<-osp.eval(samp.eval,n.spikes,p,n)
    eval.osp<-osp.out$spikes
    pop.nonsp<-osp.out$nonspikes
    n.spikes<-osp.out$n.spikes
    osp.scaled<-eval.osp/pop.nonsp
    if(missing(n.spikes.out))	n.spikes.out<-n.spikes
    if(n.spikes.out>n.spikes)	stop("n.spikes.out must be smaller than n.spikes")
    angle.osp<-osp.angle(osp.scaled,p,n)
    corr.osp<-angle.osp*sqrt(samp.eval[1:n.spikes]/eval.osp)
    shrink.osp<-(osp.scaled-1)/(osp.scaled+p/n-1)
    if(nonspikes.out)
    {
      return(list(spikes=eval.osp[1:n.spikes.out],n.spikes=n.spikes,
                  angles=angle.osp[1:n.spikes.out],correlations=corr.osp[1:n.spikes.out],
                  shrinkage=shrink.osp[1:n.spikes.out],nonspikes=pop.nonsp))
    } else {
      return(list(spikes=eval.osp[1:n.spikes.out],n.spikes=n.spikes,
                  angles=angle.osp[1:n.spikes.out],correlations=corr.osp[1:n.spikes.out],
                  shrinkage=shrink.osp[1:n.spikes.out]))
    }
  }
}

################################################################################

#' @inherit hdpca::pc_adjust title description params references
#' @param ncores Number of cores to be used. You can e.g. use [nb_cores()].
#'
#' @return
#' A matrix containing the bias-adjusted PC scores.
#' The dimension of the matrix is the same as the dimension of `test.scores`.
#'
#' Also, an attribute `attr(*, "shrinkage")` containing the shrinkage factors.
#' Note that the number of shrinkage factors can be smaller than the number of
#' columns of `test.scores`; it corresponds to the estimated number of spikes.
#'
#' @export
#'
pca_adjust <- function(train.eval, p, n, test.scores,
                       method = c("d.gsp", "l.gsp", "osp"),
                       n.spikes.max,
                       ncores = 1) {

  stopifnot(length(dim(test.scores)) == 2)

  shrinkage <- hdpc_est(train.eval, p, n, method, n.spikes.max = n.spikes.max,
                        ncores = ncores)$shrinkage

  m <- min(length(shrinkage), ncol(test.scores))
  for (k in seq_len(m)) test.scores[, k] <- test.scores[, k] / shrinkage[k]

  structure(test.scores, shrinkage = shrinkage[1:m])
}

################################################################################