R/evolafit.R
In evola: Evolutionary Algorithm

Documented in evolafit

evolafit <- function(formula, dt, 
                     constraintsUB, constraintsLB,constraintW=NULL, 
                     b, nCrosses=50, nProgeny=20,nGenerations=20, 
                     recombGens=1, nChr=1, mutRateAllele=0, mutRateAlpha=0,
                     nQtlStart=NULL, D=NULL, lambda=0,
                     propSelBetween=NULL,propSelWithin=NULL,
                     fitnessf=NULL, verbose=TRUE, dateWarning=TRUE,
                     selectTop=TRUE, tolVarG=1e-6, 
                     Ne=50, initPop=NULL, simParam = NULL, 
                     fixNumQtlPerInd=FALSE, traceDelta=TRUE, topN=10,
                     includeSet=NULL, excludeSet=NULL,
                     ...){
  
  my.date <- "2025-11-01"
  your.date <- Sys.Date()
  ## if your month is greater than my month you are outdated
  if(dateWarning & verbose){
    if (your.date > my.date) {
      warning("Version out of date. Please update evola to the newest version using:\ninstall.packages('evola') in a new session\n Use the 'dateWarning' argument to disable the warning message.", call. = FALSE)
    }
  }
  # if(propSelBetween==0 | propSelWithin==0){stop("Please ensure that parameters propSelWithin and propSelBetween are different than zero.", call. = FALSE)}
  if(missing(formula)){stop("Please provide the formula to know traits and classifiers.", call. = FALSE)}
  if(is.null(propSelBetween)){
    propSelBetween <- stan(logspace(seq(1,-1, -2/nGenerations), p=3), ub=0.8, lb=0.2)
  }else{
    propSelBetween <- rep(propSelBetween, nGenerations)
  }
  if(is.null(propSelWithin)){
    propSelWithin <- stan(logspace(seq(1,-1, -2/nGenerations), p=3), ub=0.2, lb=0.8)
  }else{
    propSelWithin <- rep(propSelWithin, nGenerations)
  }
  if(propSelBetween[1]>0 & nCrosses==0){stop("If you apply selection between families you need to set nCrosses to a value > 0.", call. = FALSE)}
  if(nGenerations < 1){stop("nGenerations cannot be smaller than 1.")}
  if(is.null(constraintW)){
    constraintW <- rep(1, nGenerations)
  } # different weight to constraints at each generation
  
  mc <- match.call() # create a call
  # add the fitness function (current options are qa=-1 to ln>=0 )
  if(is.null(fitnessf)){
    fitnessf <- ocsFun
  };
  # get the name of the traits
  elements <- strsplit(as.character(formula), split = "[+]")#[[1]]
  elements <- lapply(elements[-c(1)], function(x){all.vars(as.formula(paste("~",x)))})
  traits <- elements[[1]]
  if(!all(traits%in%colnames(dt))){stop("Specified traits are not traits in the dataset. Please correct.", call. = FALSE)}
  classifiers <- elements[[2]]
  checkNQtls <- table(dt[,classifiers])
  if(length(which(checkNQtls > 1)) > 0){stop("You cannot provide more than one alpha value per QTL. Make sure that your x variable has only one value.", call. = FALSE)}
  if(missing(constraintsUB)){constraintsUB <- rep(Inf,length(traits))}
  if(length(constraintsUB) != length(traits)){stop(paste0("Constraints need to have the same length than traits (",length(traits),")"), call. = FALSE)}
  if(missing(constraintsLB)){constraintsLB <- rep(-Inf,length(traits))}
  if(length(constraintsLB) != length(traits)){stop(paste0("Constraints need to have the same length than traits (",length(traits),")"), call. = FALSE)}
  if(missing(b)){b <- rep(1,length(traits))}
  if(length(b) != length(traits)){stop(paste0("Weights need to have the same length than traits (",length(traits),")"), call. = FALSE)}
  if(is.null(D)){D <- Matrix::Diagonal(nrow(dt)); useD=FALSE}else{useD=TRUE}
  if(is.null(nQtlStart)){nQtlStart <- ceiling(nrow(dt)/5)}
  # check that the user has provided a single value for each QTL
  
  
  if(is.null(initPop)){
    # 1) initialize the population with customized haplotypes to ensure a single QTL per individual
    av <- 1:nrow(dt)
    haplo = Matrix::Matrix(0, nrow= Ne*2, ncol = nrow(dt)) # rbind( diag(nrow(dt)), diag(nrow(dt)) )
    for (i in seq(1,nrow(haplo),2)) {
      haplo[i,sample(av,nQtlStart)] <- 1
      # haplo[i,] <- ifelse(runif(ncol(haplo))< (nQtlStart/ncol(haplo)) ,1,0)
      haplo[(i+1),] <- haplo[i,]
    }
    colnames(haplo) = dt[,classifiers]
    
    nQtlPerChr = rep( floor(length(colnames(haplo))/nChr), nChr)
    nQtlPerChr[length(nQtlPerChr)] = nQtlPerChr[length(nQtlPerChr)] + ( length(colnames(haplo)) - sum(nQtlPerChr) )
    
    chromosome = as.vector(unlist(apply( data.frame(nQtlPerChr,1:nChr), 1, function(x){rep(x[2],x[1])})))
    position = as.vector(unlist(apply( data.frame(nQtlPerChr,1:nChr), 1, function(x){1:x[1]})))
    genMap = data.frame(markerName=colnames(haplo),
                        chromosome=chromosome,
                        position=position)
    ped = data.frame(id=paste0("I", 1:nrow(dt)),
                     mother=0, father=0)
    founderPop = importHaploSparse(haplo=haplo, 
                                   genMap=genMap,
                                   ploidy=2L)
    # founderPop = quickHaplo(nInd=Ne,nChr=1,segSites=nrow(dt), inbred = TRUE)
    SP = SimParam$new(founderPop)
    # 2) add the traits (columns from user) to take the values (rows) as marker effects
    for(iTrait in 1:length(traits)){
      SP$importTrait(markerNames =unlist(lapply(SP$genMap,names)), addEff = dt[,traits[iTrait]]) # over 2 because QTL data is diplodized or Q/2
    }
    # 3) set the population
    pop = newPop(founderPop, simParam = SP)
    if(nCrosses > 0){
      pop = randCross(pop, nCrosses = nCrosses, nProgeny = nProgeny, simParam = SP)
      pop = makeDH(pop=pop, simParam = SP)
    }
    variances = diag(varG(pop))
    if(all(SP$varG>0)){
      pop = setPheno(pop,h2=rep(.98,length(which(variances>0))), simParam = SP, traits = which(variances > 0) )
    }else{
      pop@pheno <- apply(pop@pheno,2,function(xx){rnorm(length(xx))}) # rnorm(length(pop@pheno))
    }
  }else{
    pop=initPop
    SP=simParam
    variances = diag(varG(pop))
  }
  
  # ***) creating the frame for the plot
  indivPerformance <- list() # store results by generation
  averagePerformance <- matrix(0, nrow=nGenerations,ncol=4) # to store results
  colnames(averagePerformance) <- c("Average.fitness","Best.fitness","nQTL.mu", "deltaC.mu")
  # rownames(averagePerformance) <- paste("Generation",seq(nrow(averagePerformance)))
  # 4) Starting the Generational process
  ################################
  ################################
  ################################
  ## FOR EACH GENERATION
  j =0 # in 1:nGenerations
  nonStop=TRUE
  
  if(traceDelta){ # if user wants to trace inbreeding (default is TRUE but it can be a costly operation)
    m <- Matrix::Matrix(1,nrow=1,ncol=ncol(D))
    if(useD){
      mtDm <- as.vector((m%*%Matrix::tcrossprod(D,m))/(4*(ncol(D)^2)))
    }else{
      mtDm <- as.vector((Matrix::tcrossprod(m))/(4*(ncol(D)^2)))
    }
  }
  best <- list()
  console <-  data.frame(matrix(NA,nrow=nGenerations, ncol=8))
  nin <- nCrosses*nProgeny
  initVarG = round(sum(diag(varG(pop = pop))),3)
  while(nonStop) { # for each generation we breed # j=1
    j=j+1
    
    Q <- pullQtlGeno(pop, simParam = SP, trait = 1)/2 #?/2
    Q <- as(as(as( Q,  "dMatrix"), "generalMatrix"), "CsparseMatrix") # as(Q, Class = "dgCMatrix")
    rownames(Q) <- pop@id
    
    ## use mutation rate in genome
    if(mutRateAllele > 0){
      nMutations = ceiling(mutRateAllele * nrow(dt)) # number of mutations per individual per generation
      if(nMutations == 1){
        pointMut = t(as.matrix(apply(data.frame(1:nInd(pop)), 1, function(x){
          sample(1:nrow(dt), nMutations, replace = FALSE)
        }) ))
      }else{
        pointMut = as.matrix(apply(data.frame(1:nInd(pop)), 1, function(x){
          sample(1:nrow(dt), nMutations, replace = FALSE)
        }) )
      }
      # 
      DRIFT = drift(pop, simParam=SP)$Trait1
      for(iQtl in unique(as.vector(pointMut))){
        indsToModif=which(pointMut == iQtl, arr.ind = TRUE)[,"col"]
        allele = sample(0:1, 1)
        pop = editGenome(pop, ind=indsToModif,chr=as.numeric(DRIFT[iQtl,"chr"]), segSites=DRIFT[iQtl,"ss"], simParam=SP, allele = allele)
      }
    }else{pointMut=as.data.frame(matrix(NA, nrow=0, ncol=1))}
    ## use mutation rate in alphas
    if(mutRateAlpha > 0){
      nMutations = ceiling(mutRateAlpha * nrow(dt)) # define the number of mutations
      nTraits <- SP$nTraits
      for(iTrait in 1:nTraits){ # iTrait=1
        # extract the trait to mutate
        traitToMutate <- SP$traits[[iTrait]] 
        # mutate the average allelic effects
        traitToMutate@addEff[sample(1:nrow(dt), nMutations, replace = FALSE)] = rnorm(nMutations)
        # replace the trait back
        SP$switchTrait(traitPos=iTrait, lociMap=traitToMutate, varE = NA_real_, force = TRUE)
      }
    }
    ## enf of use mutation rate
    ###########################
    ## if user wants to fix the number of QTLs activated apply the following rules
    # 1) if more than nQtlStart we silence some
    # 2) if less than nQtlStart we activate some
    if(fixNumQtlPerInd){
      Qfq <- pullQtlGeno(pop, simParam = SP, trait = 1); Qfq <- Qfq/2
      DRIFT = drift(pop, simParam=SP)$Trait1
      for(iInd in 1:nInd(pop)){ #  iInd=1 for each individual
        iQfq <- Qfq[iInd,]; 
        areZeros <- which(iQfq == 0); areOnes <- setdiff(1:ncol(Qfq),areZeros)
        howMany <- sum(iQfq) # how many QTLs are activated, we're assuming is a 0/1 matrix
        totalToAddOrRem <- abs(howMany - nQtlStart) # deviation from expectation
        if( howMany > nQtlStart ){ # if exceeded silence some
          toRem <- sample(areOnes, totalToAddOrRem) # pick which ones will be silenced
          pop = editGenome(pop, ind=iInd,chr=as.numeric(DRIFT[toRem,"chr"]), segSites=DRIFT[toRem,"ss"], simParam=SP, allele = 0)
        }else if( howMany < nQtlStart){ # if lacked activate some
          toAdd <- sample(areZeros, totalToAddOrRem) # pick which ones will be activated
          pop = editGenome(pop, ind=iInd,chr=as.numeric(DRIFT[toAdd,"chr"]), segSites=DRIFT[toAdd,"ss"], simParam=SP, allele = 1)
        } # else do nothing
      }
    }

    if(!is.null(includeSet)){
      DRIFT = drift(pop, simParam=SP)$Trait1
      pop = editGenome(pop, ind=1:nInd(pop),
                       chr=as.numeric(DRIFT[which(includeSet>0),"chr"]), 
                       segSites=DRIFT[which(includeSet>0),"ss"], simParam=SP, allele = 1)
    }
    if(!is.null(excludeSet)){
      DRIFT = drift(pop, simParam=SP)$Trait1
      pop = editGenome(pop, ind=1:nInd(pop),
                       chr=as.numeric(DRIFT[which(excludeSet>0),"chr"]), 
                       segSites=DRIFT[which(excludeSet>0),"ss"], simParam=SP, allele = 0)
    }
    ## enf of fixqtl
    
    a <- do.call(cbind, lapply(SP$traits, function(x){x@addEff}))
    colnames(a) <- traits
    pop@gv <- as.matrix(Q%*% a)
    fitnessValuePop<- do.call("fitnessf", args=list(Y=pop@gv, b=b,  Q=Q[pop@id,], 
                                                    a=a, D=D, lambda=lambda,
                                                    ... ), quote = TRUE)
    if(!is.matrix(fitnessValuePop)){
      fitnessValuePop <- Matrix::Matrix(fitnessValuePop,ncol=1) 
    }
    rownames(fitnessValuePop) <- pop@id
    nanFound <- which(is.nan( fitnessValuePop[,1] )) # check if there are nans
    if(length(nanFound) > 0){ # if so assign the worst values to those individuals/solutions
      if(selectTop){
        fitnessValuePop[nanFound,1]= -Inf
      }else{
        fitnessValuePop[nanFound,1]= Inf
      }
    }
    pop@pheno[,1] <- fitnessValuePop[,1]
    ## apply selection between and within
    structure = table(paste(pop@mother, pop@father))
    nc = length(structure)
    np = floor(median(structure))
    # Although multiple traits are enabled it is assumed that same QTLs are behind all the traits, differing only in their average allelic effects.
    if( propSelBetween[j] < 1){ 
      suppressWarnings( popF <- selectFam(pop=pop,nFam = ceiling(nc*propSelBetween[j]), trait = 1, 
                                          use = "pheno", simParam = SP, 
                                          selectTop=selectTop,... #H=H,nCities=nCities
      )@id, classes = "warning")
    }else{popF = pop@id}
    if( propSelWithin[j] < 1 ){
      suppressWarnings( popW <- selectWithinFam(pop = pop, nInd = ceiling(np*propSelWithin[j]), 
                                                trait = 1, use = "pheno", simParam = SP, 
                                                selectTop=selectTop,...#H=H,nCities=nCities
      )@id, classes = "warning")
    }else{popW=pop@id}
    ################################
    ################################
    ## FOR EACH TRAIT WE APPLY CONSTRAINTS
    constCheckUB <- constCheckLB <- matrix(1, nrow=nrow(Q), ncol=length(traits))
    
    for(iTrait in 1:length(traits)){ # iTrait=1
      # check the contraints and trace them back
      constCheckUB[,iTrait] <- ifelse( (pop@gv[,iTrait] > constraintsUB[iTrait]*(1+(1-constraintW[j])) )  , 0 , 1) # ifelse(c1+c2 < 2, 0, 1)
      constCheckLB[,iTrait] <- ifelse( (pop@gv[,iTrait] < constraintsLB[iTrait]*constraintW[j] ) , 0 , 1)
      # constCheckUB[,iTrait] <- ifelse( (pop@gv[,iTrait] > constraintsUB[iTrait])  , 0 , 1) # ifelse(c1+c2 < 2, 0, 1)
      # constCheckLB[,iTrait] <- ifelse( (pop@gv[,iTrait] < constraintsLB[iTrait]) , 0 , 1)
      nan0 <- which(is.nan( pop@gv[,iTrait]))
      if(length(nan0) > 0){ constCheckUB[nan0,iTrait] = 0; constCheckLB[nan0,iTrait] = 0 }
    } # end of for each trait
    # sum of how many trait constraints are met
    
    metConstCheck <- apply(constCheckUB,1,sum) # sum how many traits we're good to go
    didntMetConst <- which(metConstCheck < length(traits))
    
    # remove individuals that break the constraints UB
    if(length(didntMetConst)>0){ # 
      popCU <- pop@id[setdiff(1:nInd(pop),didntMetConst)]
    }else{popCU<-pop@id}
    #  remove individuals that break the constraints LB
    metConstCheckL <- apply(constCheckLB,1,sum) 
    didntMetConstL <- which(metConstCheckL < length(traits))
    
    # impute with mean value the ones that do not met the constraints
    if(length(didntMetConstL)>0){
      popCL <- pop@id[setdiff(1:nInd(pop),didntMetConstL)]
    }else{popCL<-pop@id}
    ## END OF FOR EACH TRAIT WE APPLY CONSTRAINTS
    ################################
    ################################
    parentsForSelection <- list(popF,popW,popCL, popCU)
    selected <- Reduce(intersect, parentsForSelection )
    if(length(selected) == 0){
      selected <- intersect(popCL,popCU )
    }
    if(length(selected) < 2){
      message("No legal solutions found. Selecting all individuals meeting constraints.")
      selected <- Reduce(intersect, list(popCL, popCU))
      if(length(selected) < 2){
        message("Too many constraints. No legal solutions found. Random selection p=0.5 applied.")
        selected <- pop@id[sample(1:nInd(pop), ceiling(nInd(pop)*.5) )]
      }
    }
    pop <- pop[which(pop@id %in% selected)]
    
    ## calculate trace metrics
    if(traceDelta){
      if(useD){
        qtDq <- Matrix::diag(Q%*%Matrix::tcrossprod(D,Q))
      }else{
        qtDq <- Matrix::diag(Matrix::tcrossprod(Q))
      } # [inbreedingNew (biggerVal) - inbreedingOld(smallerVal=outbred)] # qDq gets smaller and smaller with generations
      # deltaC represents that with every generation we get further away from orinal outbreeding and the lower the more inbred
      deltaC <- ( (qtDq/(4*(apply(Q/2,1,sum)^2))) - mtDm)/(1-mtDm) # numerator is equivalent to mtDm 
    }else{deltaC=NA}
    
    #################################
    # solutions selected for tracing
    best[[j]] <- selectInd(pop=pop, nInd = min(c(nInd(pop),topN)), trait = 1, 
                           use = "pheno", simParam = SP, 
                           selectTop=selectTop,... #H=H,nCities=nCities
    )
   
    mfvp =  mean(as.vector(fitnessValuePop[best[[j]]@id,]))
    indivPerformance[[j]] <- data.frame(id=best[[j]]@id, fitness=as.vector(fitnessValuePop[best[[j]]@id,]), 
                                        generation=j, nQTL=as.vector(apply(Q[best[[j]]@id,,drop=FALSE]/2,1,sum)),
                                        deltaC= as.vector(deltaC[best[[j]]@id]) ) # save individual solution performance
    
    averagePerformance[j,] <- c( mfvp , max(fitnessValuePop,na.rm=TRUE) ,  mean(apply(Q/2,1,sum),na.rm=TRUE), mean(deltaC,na.rm=TRUE) ) # save summaries of performance
    ## create new progeny
    for(k in 1:recombGens){
      if(nCrosses > 0){
        pop <- randCross(pop=pop, nCrosses = nCrosses, nProgeny = nProgeny, simParam = SP)
      }
    }
    pop <- makeDH(pop=pop, nDH = 1, simParam = SP)
    
    #############################################
    ## compute phenotypes
    if(all(SP$varG>0)){
      pop = setPheno(pop,h2=rep(.98,length(which(variances>0))), simParam = SP, traits = which(variances > 0) )
    }else{
      pop@pheno <- apply(pop@pheno,2,function(xx){rnorm(length(xx))}) # rnorm(length(pop@pheno))
    }
    ##############################################################
    ##############################################################
    #store the performance of the jth generation for plot functions
    if(nrow(pop@gv) > 0){
      totalVarG = sum(diag(varG(pop = pop)))
    }else{
      totalVarG = 0
    }
    console[j,] <- c(j, length(didntMetConst), length(didntMetConstL), 
                     round(totalVarG, 3), round(mfvp, 3), round(propSelBetween[j], 2),
                     round(propSelWithin[j], 2),  Sys.time() )
    if(verbose){
      if(j==1){
        message(paste0("Population with ", nCrosses, " crosses, ", nProgeny, " progeny, and ",ncol(Q)," QTLs"))
        message(
          cat("gener  constUB  constLB  varG%   propB   propW   fit       time")
        )
      }
      sp <- paste(rep(" ", 2), collapse = "")
      message(cat(paste(
        sp, addZeros(1:nGenerations, nr=0)[j], 
        sp, addZeros(c(length(didntMetConst),nin), nr=0)[1],
        sp, addZeros(c(length(didntMetConstL),nin), nr=0)[1],
        sp, addZeros(c(round(totalVarG/initVarG, 3),1.000))[1],
        sp, addZeros(c( round(propSelBetween[j], 2) , round(propSelBetween, 2) ))[1], 
        sp, addZeros(c( round(propSelWithin[j], 2) , round(propSelWithin, 2) ))[1], 
        sp, addZeros(c(  round(mfvp, 3) , round(console[1:j,5], 3) ))[1], 
        sp, strsplit(format(Sys.time(), "%a %b %d %X %Y"), " ")[[1]][4]
      )))
    }
    if(j == nGenerations){nonStop = FALSE}
    if(nrow(pop@gv) > 0){
      if(totalVarG < tolVarG){nonStop = FALSE; message("Variance across traits exhausted. Early stop.")}
    }else{
      nonStop = FALSE; message("All individuals discarded. Consider changing some parameter values e.g., mutRateAllele
                           or nQtlStart (initial number of QTLs) to avoid all 
                           solutions to go beyond the bounds.")
    }
  }# end of for each generation
  ################################
  ################################
  ################################
  # 7) retrieve output of best combinations
  indivPerformance <- do.call(rbind, indivPerformance)
  best <- do.call(c, best)
  # transform the Pop 
  popEvola <- as(best,"evolaPop")
  popEvola@score <- averagePerformance[1:j,,drop=FALSE]
  popEvola@pointMut <- nrow(pointMut)
  popEvola@constCheckUB <- constCheckUB
  popEvola@constCheckLB <- constCheckLB
  popEvola@traits <- traits
  
  ###################
  # Although multiple traits are enabled it is assumed that same QTLs are behind all the traits, differing only in their average allelic effects.
  # we need to recalculate fitness because we have been scaling across generations
  Q <- pullQtlGeno(popEvola, simParam = SP, trait = iTrait)/2 #?/2
  Q <- as(as(as( Q,  "dMatrix"), "generalMatrix"), "CsparseMatrix") # as(Q, Class = "dgCMatrix")
  rownames(Q) <- popEvola@id
  a <- do.call(cbind, lapply(SP$traits, function(x){x@addEff}))
  popEvola@gv <- as.matrix(Q%*% a)
  fitnessValuePop<- do.call("fitnessf", args=list(Y=popEvola@gv, b=b,  Q=Q,
                                                  a=a, D=D, lambda=lambda,
                                                  ... ), quote = TRUE)

  if(!is.matrix(fitnessValuePop)){
    fitnessValuePop <- Matrix::Matrix(fitnessValuePop,ncol=1)
  };  rownames(fitnessValuePop) <- popEvola@id
  popEvola@fitness <- as.vector(fitnessValuePop)
  indivPerformance[,"fitness"] <-  as.vector(fitnessValuePop)
  popEvola@indivPerformance <- if(is.null(indivPerformance)){data.frame()}else{indivPerformance} 
  
  res <- list(pop=popEvola, simParam=SP, call=mc, fitness=fitnessValuePop, console=console)
  class(res) <- "evolaFitMod"
  return(res)
}