R/allopolyploidy.R
In polysat: Tools for Polyploid Microsatellite Analysis

Documented in alleleCorrelations catalanAlleles mergeAlleleAssignments plotParamHeatmap plotSSAllo processDatasetAllo recodeAllopoly simAllopoly testAlGroups

# "Resolving microsatellite genotype ambiguity in populations of
# allopolyploid and diploidized autopolyploid organisms using negative
# correlations between alleles"

# function to create simulated data
simAllopoly <- function(ploidy=c(2,2),
                         n.alleles=c(4,4), n.homoplasy=0,
                         n.null.alleles=rep(0, length(ploidy)),
                         alleles=NULL,
                         freq=NULL, meiotic.error.rate=0,
                         nSam=100, locname="L1"){
    # number of different genomes in the allopolyploid (1 for autopolyploid)
    nGenomes <- length(ploidy)
    # no meiotic error if there is only one genome
    if(nGenomes==1 && meiotic.error.rate != 0)
      stop("No meiotic error possible if there is only one subgenome.")

    # make allele names if needed
    if(is.null(alleles)){
        if(length(n.alleles)!=nGenomes)
              stop("'n.alleles' must be a vector of the same length as 'ploidy'.")
          if(nGenomes>6) stop("More than six subgenomes not supported at this time.")
          GenomeNames <- c('A','B','C','D','E','F')[1:nGenomes]
          alleles <- list()
          length(alleles) <- nGenomes
          for(i in 1:nGenomes){
              alleles[[i]] <- paste(GenomeNames[i], 1:n.alleles[i], sep="-")
          }
          if(length(n.homoplasy)!=1) stop("Only one value allowed for n.homoplasy.")
          if(n.homoplasy>0){ # if there are more than 0 homoplasious alleles
              HAlleles <- paste('H', 1:n.homoplasy, sep="-")
              for(i in 1:nGenomes){ # put homoplasious alleles at end of vector
                  alleles[[i]][(n.alleles[i]-n.homoplasy+1):n.alleles[i]] <- HAlleles
              }
          }
          # if there are null alleles, put at beginning of vector
          if(length(n.null.alleles)!=nGenomes)
              stop("'n.null.alleles' must be a vector the same length as 'ploidy'.")
          if(!all(n.null.alleles<=n.alleles))
              stop("'n.null.alleles' values must be smaller than 'n.alleles' values.")
          for(i in 1:nGenomes){
              if(n.null.alleles[i]>0){
                  alleles[[i]][1:n.null.alleles[i]] <- "N"
              }
          }
    } else {
        # if alleles are provided, get n.alleles to match
        n.alleles <- sapply(alleles, length)
    }

    # randomly generate allele frequencies if needed
    if(is.null(freq)){
        freq <- list()
        length(freq) <- nGenomes
        for(i in 1:nGenomes){
            temp <- sample(100, n.alleles[i], replace=TRUE)
            freq[[i]] <- temp/sum(temp)
        }
    }

    # errors
    if(!is.list(alleles) || !is.list(freq))
        stop("alleles and freq must each be a list of vectors")
    if(!identical(sapply(freq, length), sapply(alleles, length)))
        stop("Need same number of values for alleles and frequencies.")
    if(nGenomes != length(alleles) || nGenomes != length(freq))
      stop("Number of genomes nees to match between ploidy, alleles, and freq")
    if(length(meiotic.error.rate)!=1 || meiotic.error.rate<0 ||
       meiotic.error.rate>0.5)
        stop("meiotic.error.rate should be a single value between 0 and 0.5.")
    if(meiotic.error.rate>0 && !all(ploidy %% 2 == 0))
        stop("Even (not odd) ploidy is required if simulating meiotic error.")

    # create genambig object to return at end
    gen <- new("genambig", samples=1:nSam, loci=locname)

    # fixing the sample function (Thanks, help file on "sample"!)
    resample <- function(x, ...) x[sample.int(length(x), ...)]
    # create genotypes
    for(s in 1:nSam){
        # determine if there was a meiotic error
        gameteploidy <- ploidy/2
        if(sample(c(TRUE,FALSE),size=1,
                  prob=c(meiotic.error.rate, 1-meiotic.error.rate))){
            isolocusLost <- sample(nGenomes,1)
            isolocusGained <- resample((1:nGenomes)[-isolocusLost],1)
            gamete1ploidy <- gameteploidy
            gamete1ploidy[isolocusLost] <-  gamete1ploidy[isolocusLost]-1
            gamete1ploidy[isolocusGained] <- gamete1ploidy[isolocusGained]+1
        } else {
            gamete1ploidy <- gameteploidy
        }
        if(sample(c(TRUE,FALSE),size=1,
                  prob=c(meiotic.error.rate, 1-meiotic.error.rate))){
            isolocusLost <- sample(nGenomes,1)
            isolocusGained <- resample((1:nGenomes)[-isolocusLost],1)
            gamete2ploidy <- gameteploidy
            gamete2ploidy[isolocusLost] <-  gamete2ploidy[isolocusLost]-1
            gamete2ploidy[isolocusGained] <- gamete2ploidy[isolocusGained]+1
        } else {
            gamete2ploidy <- gameteploidy
        }
        tempploidy <- gamete1ploidy + gamete2ploidy
        # get genotype
        thisgen <- unique(unlist(mapply(resample, alleles,
                                                       replace= TRUE,
                                      size = tempploidy, prob = freq,
                                        SIMPLIFY=FALSE)))
        thisgen <- thisgen[thisgen!=0 & thisgen!="N"] # remove null alleles
        if(length(thisgen)==0){
            Genotype(gen,s,1) <- Missing(gen)
        } else {
            Genotype(gen,s,1) <- thisgen
        }
    }

    return(gen)
}

# n.start is passed to the kmeans function
alleleCorrelations <- function(object, samples=Samples(object), locus=1,
                         alpha=0.05, n.subgen=2, n.start=50){
    # subset by samples if needed
    object <- object[samples,]
    # check missing data
    mymissing <- isMissing(object, loci=locus)
    if(all(mymissing)) stop("All data at locus are missing.")
    # only do one locus at a time
    if(length(locus) > 1) stop("More than one locus")
    # subset dataset object by this locus
    object <- object[!mymissing,locus]
    # perform class conversion if necessary
    if(is(object, "genambig")) object <- genambig.to.genbinary(object)
    # error if not the right class
    if(!is(object, "genbinary")) stop("genambig or genbinary object needed.")
    # set Absent to 0 and Present to 1
    Absent(object) <- as.integer(0)
    Present(object) <- as.integer(1)

    gentable <- Genotypes(object)  # get the 1/0 table of genotypes
    # remove alleles that aren't in dataset
    gentable <- gentable[,apply(gentable, 2,
                                function(x) !all(x==Absent(object))), drop = FALSE]
    gentable.out <- gentable # to be output
    # find any non-variable alleles (present in all genotypes)
    nonvar <- apply(gentable, 2, function(x) all(x==Present(object)))
    # are there too many non-variable alleles?
    mingen <- sum(nonvar)
    if(mingen > n.subgen){
        stop(paste("Locus",locus,": too many non-variable alleles"))
    }
    nAl <- dim(gentable)[2]        # number of alleles
    # get allele names for output
    mysplit <- strsplit(dimnames(gentable)[[2]], split=".", fixed=TRUE)
    alleles <- sapply(mysplit, function(x) x[2])
    # set up matrix for clusters
    clustmat1 <- matrix(0, nrow=n.subgen, ncol=nAl,
                        dimnames=list(NULL,alleles))

    # if there are non-variable alleles, assign each to its own genome
    if(sum(nonvar) > 0){
        for(i in 1:sum(nonvar)){
            clustmat1[i,alleles[nonvar][i]] <- 1
        }
        sigmatNeg <- sigmatPos <- oddsRatio <- NULL
        pValuesNeg <- pValuesPos <- mydist <- NULL
        mytotss <- mybss <- NA
        clustmethod <- "fixed alleles" # gets overwritten later if necessary
        gentable <- gentable[,!nonvar]
        nAl <- nAl - sum(nonvar) # adjust number of alleles to only variable ones.
        alleles <- alleles[!nonvar]

        # if each isolocus has a non-variable allele, but there are
        # variable alleles left, make them all homoplasious
        if(sum(nonvar)==n.subgen){
            clustmat1[,alleles] <- 1
        }
    }
    if(sum(rowSums(clustmat1)==0)==1 ||
       sum(colSums(clustmat1) == 0) < sum(rowSums(clustmat1) == 0) ){
      # if everything else can be assigned to one genome, do that.
      # or, if there aren't enough alleles left for the number of isoloci, make all homoplasious.  
      clustmat1[rowSums(clustmat1)==0,colSums(clustmat1)==0] <- 1
      sigmatNeg <- sigmatPos <- oddsRatio <- NULL
      pValuesNeg <- pValuesPos <- mydist <- NULL
      mytotss <- mybss <- NA
      clustmethod <- "fixed alleles"
    }
    if(sum(colSums(clustmat1) == 0) == sum(rowSums(clustmat1) == 0)){
      # if there are just enough alleles left for one allele per isolocus, do that
      blankAlleles <- which(colSums(clustmat1) == 0)
      blankIsoloci <- which(rowSums(clustmat1) == 0)
      for(i in 1:length(blankAlleles)){
        clustmat1[blankIsoloci[i], blankAlleles[i]] <- 1
      }
      sigmatNeg <- sigmatPos <- oddsRatio <- NULL
      pValuesNeg <- pValuesPos <- mydist <- NULL
      mytotss <- mybss <- NA
      clustmethod <- "fixed alleles"
    }
    if(all(colSums(clustmat1) > 0) && any(rowSums(clustmat1) == 0)){
      # all alleles have been assigned, but some isoloci still lack alleles.
      # expected in rare cases, i.e. if there is just one allele in whole pop.
      # add homoplasy.
      clustmat1[rowSums(clustmat1)==0,] <- 1
      sigmatNeg <- sigmatPos <- oddsRatio <- NULL
      pValuesNeg <- pValuesPos <- mydist <- NULL
      mytotss <- mybss <- NA
      clustmethod <- "fixed alleles"
    }
    # make a second copy of assignment matrix for UPGMA
    clustmat2 <- clustmat1

    ### how the function will normally work (all alleles variable): ###
    if(sum(rowSums(clustmat1)==0)>1){

        clustmethod <- "K-means and UPGMA"

    # set up matrices for results of Fisher's exact test
    pValuesNeg <- matrix(NA, nrow=nAl, ncol=nAl) # significance of negative corr.
    pValuesPos <- matrix(NA, nrow=nAl, ncol=nAl) #significance of positive corr.
    oddsRatio <- matrix(NA, nrow=nAl, ncol=nAl) # (00*11)/(01*10)

    # loop to run Fisher's exact test on each pair of alleles
    for(a in 1:(nAl-1)){
        for(b in (a+1):nAl){
            X <- fisher.test(gentable[,a], gentable[,b], alternative="less",
                             conf.int=FALSE)
            pValuesNeg[a,b] <- X$p.value; pValuesNeg[b,a] <- X$p.value
            Y <- fisher.test(gentable[,a], gentable[,b], alternative="greater",
                             conf.int=FALSE)
            pValuesPos[a,b] <- Y$p.value; pValuesPos[b,a] <- Y$p.value
            oddsRatio[a,b] <- oddsRatio[b,a] <- X$estimate
        }
    }

    # Holm-Bonferroni correction to get p-value threshold
    # Negative correlations (for assigning alleles)
    pVect <- sort(pValuesNeg[lower.tri(pValuesNeg)]) # smallest to largest pvalues
    m <- length(pVect) # number of hypothesis tests
    kTest <- pVect > alpha/(m + 1 - (1:m))
    if(all(!kTest)){ # reject all hypotheses
        sigmatNeg <- matrix(TRUE, nrow=nAl, ncol=nAl)
    } else {
        k <- min((1:m)[kTest])
        maxP <- pVect[k-1] # maximum p value for significance
        if(k > 1){
            sigmatNeg <- pValuesNeg <= maxP
        } else {
            sigmatNeg <- matrix(FALSE, nrow=nAl, ncol=nAl)
        }
    }
    # Positive correlations (for issuing warnings)
    pVect <- sort(pValuesPos[lower.tri(pValuesPos)]) # smallest to largest pvalues
    kTest <- pVect > alpha/(m + 1 - (1:m))
    if(all(!kTest)){ # reject all hypotheses
        sigmatPos <- matrix(TRUE, nrow=nAl, ncol=nAl)
    } else {
        k <- min((1:m)[kTest])
        maxP <- pVect[k-1] # maximum p value for significance
        if(k > 1){
            sigmatPos <- pValuesPos <= maxP
        } else {
            sigmatPos <- matrix(FALSE, nrow=nAl, ncol=nAl)
        }
    }
    if(!all(!sigmatPos)){
        cat(paste("Warning: Significant positive correlations between alleles at locus", Loci(object), "; population structure or scoring error may bias results."), sep="\n")
    }

    dimnames(sigmatNeg) <- list(alleles,alleles)
    dimnames(sigmatPos) <- list(alleles,alleles)
    dimnames(pValuesNeg) <- list(alleles, alleles)
    dimnames(pValuesPos) <- list(alleles, alleles)
    dimnames(oddsRatio) <- list(alleles, alleles)

    # replace NA with 0 in the p-value matrix
    mydist <- pValuesNeg
    mydist[is.na(mydist)] <- 0
    # adjust number of clusters if there were non-variable alleles
    n.subgen.adj = n.subgen - sum(rowSums(clustmat1)>0)
    # kmeans clustering of alleles based on pvalues
    kres <- kmeans(mydist, centers=n.subgen.adj, nstart=n.start)
    myclust1 <- kres$cluster
        mytotss <- kres$totss
        mybss <- kres$betweenss

    # UPGMA clustering
        myclust2 <- cutree(hclust(as.dist(mydist), method="average"),
                           k = n.subgen.adj)

    # put clusters into matrix
    for(i in (n.subgen - n.subgen.adj + 1):n.subgen){
        clustmat1[i,alleles[myclust1==(i - n.subgen + n.subgen.adj)]] <- 1
        clustmat2[i,alleles[myclust2==(i - n.subgen + n.subgen.adj)]] <- 1
    } } # end of if clause for if there aren't non-variable alleles

    return(list(locus=Loci(object), clustering.method=clustmethod,
        significant.neg=sigmatNeg, significant.pos=sigmatPos,
                p.values.neg=pValuesNeg, p.values.pos=pValuesPos,
                odds.ratio=oddsRatio,
                Kmeans.groups=clustmat1, UPGMA.groups=clustmat2,
                heatmap.dist=mydist,
                totss=mytotss, betweenss=mybss, gentable=gentable.out))
}

# function to test consistency of genotypes with allele groups
# will also make adjustment to the groups
# genobject is genambig or genbinary
# fisherResults is output of AlleleCorrelations
# SGploidy is the ploidy of each subgenome, e.g. 2 for an allotetraploid
# null.weight should be 0 if there are definitely null alleles, or 1 if
# there are definitely not.
# tolerance is the proportion of genotypes that can be allowed to be inconsistent
# with the allele assignments.
# swap indicates whether or not to use the swapping algorithm.
# R, rho, T0, and maxreps are parameters for simulated annealing.
testAlGroups <- function(object, fisherResults, SGploidy=2,
                          samples=Samples(object),
                          null.weight=0.5, tolerance=0.05,
                          swap = TRUE, R = 100, rho = 0.95, T0 = 1, maxreps = 100){
  # make sure genotype data is right class
  if(!is(object, "genambig") && !is(object, "genbinary"))
    stop("genambig or genbinary object needed.")
  object <- object[samples,]
  # locus
  L <- fisherResults$locus
  # remove missing data
  mymissing <- isMissing(object, loci=L)
  # number of individuals
  nind <- sum(!mymissing)
  # subset genotypes
  genobject <- object[!mymissing,L]
  # change class if necessary
  if(is(genobject, "genbinary")) genobject <- genbinary.to.genambig(genobject)
  # alleles
  alleles <- dimnames(fisherResults$Kmeans.groups)[[2]]
  # number of alleles
  numAlTotal <- length(alleles)
  
  # number of subgenomes
  n.subgen <- dim(fisherResults$Kmeans.groups)[1]
  
  # get the number of alleles in each genotype
  numAl <- sapply(Genotypes(genobject), length)
  # check to see that none have too many alleles
  if(!all(numAl <= n.subgen*SGploidy))
    stop(paste("One or more genotypes have more than",
               n.subgen*SGploidy, "alleles."))
  # check to see that alleles match in genobject and fisherResults
  if(!all(as.character(.unal1loc(genobject, samples=Samples(genobject),
                                 locus=L)) %in% alleles))
    stop("Alleles found in genobject that aren't in fisherResults.")
  
  # function to tally which genotypes are inconsistent with an assignment matrix
  tallyInconsistentGen <- function(G, samples=1:nind, currentInconsistent = rep(FALSE, nind)){
    tot <- 0
    for(s in samples){
      gen <- as.character(Genotype(genobject, s, L))
      subg <- G[, gen, drop = FALSE]
      if(any(rowSums(subg) == 0) || 
           any(rowSums(subg[,colSums(subg) == 1, drop = FALSE]) > SGploidy)){
        currentInconsistent[s] = TRUE
      } else {
        currentInconsistent[s] = FALSE
      }
    }
    return(currentInconsistent)
  }
  
  ## Compare K-means and UPGMA results ##
  # Sort them so groups are the same
  Kmat <- fisherResults$Kmeans.groups
  Umat <- fisherResults$UPGMA.groups
  KsortV <- UsortV <- integer(length(alleles))
  Uswap <- Kswap <- integer(n.subgen) # index = new number, number=row in matrix
  for(i in 1:n.subgen){
    KfirstLeft <- match(0, KsortV) # first unassigned allele
    UfirstLeft <- match(0, UsortV)
    # match rows to new subgenomes
    Kswap[i] <- match(1, Kmat[,KfirstLeft])
    Uswap[i] <- match(1, Umat[,UfirstLeft])
    # assign alleles to new subgenomes
    KsortV[Kmat[Kswap[i],] == 1] <- i
    UsortV[Umat[Uswap[i],] == 1] <- i
  }
  # were the results identical?
  if(identical(KsortV, UsortV)){
    G <- Kmat # use K-means assignments (same as UPGMA assignments)
  } else {
    # if K-means and UPGMA gave different results, see which is more consistent
    # with genotypes.
    Kok <- nind - sum(tallyInconsistentGen(Kmat)) # count consistent with K-means assignments
    Uok <- nind - sum(tallyInconsistentGen(Umat)) # count consistent with UPGMA assignments
    
    if(Uok > Kok){
      G <- Umat # UPGMA was better, use this matrix
    } else {
      G <- Kmat # Use K-means if it was better or if there was a tie.
    }
  }
  
  ## Check if alleles should be swapped to a different isolocus. ##
  
  # function to make a new G matrix at random, with one allele swapped
  randomNewG <- function(G){
    alToSwap <- alleles[sample(numAlTotal, 1)]
    thisSubgen <- which(G[,alToSwap] == 1)
    if(n.subgen == 2){
      newSubgen <- (1:2)[-thisSubgen]
    } else {
      newSubgen <- sample((1:n.subgen)[-thisSubgen], 1)
    }
    Gnew <- G
    Gnew[thisSubgen,alToSwap] <- 0
    Gnew[newSubgen, alToSwap] <- 1
    return(list(alToSwap, Gnew))
  }
  
  # number of possible G matrices (without homoplasy)
  nGpossible <- n.subgen ^ numAlTotal
  # function to get an index number for each possible G matrix (without homoplasy)
  indexG <- function(G){
    # set up multipliers for an integer where the base is n.subgen
    baseSetup <- n.subgen ^ (1:numAlTotal - 1)
    # get the digit for each allele
    digits <- apply(G, 2, function(x) which(x == 1) - 1)
    # get index
    if(is.list(digits)){ # if there was homoplasy
      index <- NA
    } else {
      index <- sum(baseSetup * digits) + 1
    }
    return(index)
  }
  # list of inconsistent individuals for every possible G matrix (without homoplasy)
  iiList <- list()
  length(iiList) <- nGpossible
  
  # for current assignments
  inconsInd <- tallyInconsistentGen(G) # which individuals are inconsistent with current assignments
  J <- mean(inconsInd) # current "cost" of this solution
  iG <- indexG(G) # index of this set of assignments
  if(!is.na(iG)){
    iiList[[iG]] <- inconsInd # record these inconsistent individuals to the master list
  }
  
  # only do swapping if desired by user, and only if assignments aren't already perfect
  if(swap &&  J > 0){ 
    # set up an index of which genotypes have which alleles, so we know which genotypes to check
    alleleIndex <- list()
    length(alleleIndex) <- numAlTotal
    names(alleleIndex) <- alleles
    for(a in alleles){
      alleleIndex[[a]] <- which(fisherResults$gentable[,paste(L, a, sep = ".")] == 1)
    }
    # simulated annealing algorithm
    Ti <- T0 # temperature set up
    for(rep in 1:maxreps){
      done <- TRUE # to test convergence OR finding an answer that matches all genotypes
      for(r in 1:R){
        # pick the allele to swap
        temp <- randomNewG(G)
        Gnew <- temp[[2]]
        a <- temp[[1]]
        # find out which genotypes are inconsitent with this new set of assignments
        iGnew <- indexG(Gnew)
        if(is.na(iGnew) || is.null(iiList[[iGnew]])){ # if we have never looked at this set of assignments before
          inconsIndNew <- tallyInconsistentGen(Gnew, alleleIndex[[a]], inconsInd)
          if(!is.na(iGnew)){
            iiList[[iGnew]] <- inconsIndNew
          }
        } else { # if we already have looked at this set of assignments
          inconsIndNew <- iiList[[iGnew]]
        }
        Jnew <- mean(inconsIndNew)
        if(Jnew <= J){ # keep if if we have found a better solution 
          G <- Gnew
          J <- Jnew
          inconsInd <- inconsIndNew
          #          cat(paste(iGnew, "better"), sep = "\n")
          done <- FALSE # swap happened; convergence did not occur in this rep
          if(J == 0){ # perfect solution found, stop now
            done <- TRUE
            break
          }
        } else { # if we have found a worse solution, randomly decide whether to keep it
          D <- Jnew - J
          if(sample(10000,1) <= 10000 * exp(-D/Ti)){ 
            G <- Gnew
            J <- Jnew
            inconsInd <- inconsIndNew
            #            cat(paste(iGnew, "worse"), sep = "\n")
            done <- FALSE
          }
        }
      }
      Ti <- Ti * rho # lower the temperature
      if(done) break
    }
  }
  
  AlOcc <- colSums(fisherResults$gentable)/dim(fisherResults$gentable)[1]
  
  ## Check for homoplasy ##
  # set up internal function to check consistency for adding homoplasy
  makeA <- function(){
    A <- matrix(0, nrow=n.subgen, ncol=length(alleles),
                dimnames=list(NULL,alleles))
    num.error <- 0 # number of genotypes with scoring or meiotic error
    for(s in Samples(genobject)){
      gen <- as.character(Genotype(genobject, s, L)) # the genotype
      
      # get 1/0 matrix for presence of these alleles in subgenomes
      subG <- G[,gen, drop=FALSE]
      # Start checking the genotype.
      # It is okay if no genome has more alleles than is possible AND
      # no genome has zero alleles.
      # Otherwise, figure out which alleles might need to be copied:
      if(!all(rowSums(subG[,colSums(subG) == 1, drop=FALSE])
              <= SGploidy)){
        # If one subgenome has more alleles than are possible
        donor <-
          (1:n.subgen)[rowSums(subG[,colSums(subG) == 1, drop=FALSE]) >
                         SGploidy]
        # they might be copied to a subgenome(s) that has fewer than the
        # maximum.
        recipient <- (1:n.subgen)[rowSums(subG) < SGploidy]
        if(length(recipient)==0){
          recipient <- (1:n.subgen)[rowSums(subG[,colSums(subG) == 1,
                                                 drop=FALSE]) < SGploidy]
        }
        gendonor <- gen[colSums(subG[recipient,,drop=FALSE]==0)>0 &
                          colSums(subG[donor,,drop=FALSE]==1)>0]
        A[recipient,gendonor] <- A[recipient,gendonor] + 1
        
        num.error <- num.error + 1
      } else {
        # If one subgenome has no alleles
        if(!all(rowSums(subG) > 0) && null.weight > 0){
          recipient <- (1:n.subgen)[rowSums(subG)==0]
          A[recipient,gen] <- A[recipient,gen] + null.weight
          num.error <- num.error + 1
        }
      }
    } # end of Samples loop
    
    # calculate the proportion inconsistent with allele assignments
    prop.error <- num.error/nind
    
    # Make sure not to copy alleles to where they have already been
    # assigned (problem in hexaploids).
    A[G==1] <- 0
    
    # weight the A matrix by allele occurance
    # omitted; is only helpful for rare homoplasious alleles.
    #       A <- sweep(A, 2, AlOcc, "/")
    
    return(list(A, prop.error))
  } # end of makeA function
  
  # check consistency once before starting assingnment and re-checking loop
  temp <- makeA()
  A <- temp[[1]]
  prop.error <- temp[[2]]
  
  # loop to copy alleles to other genomes, then re-check
  numloops <- 0
  while(prop.error > tolerance && numloops < 1000){
    numloops <- numloops + 1 # precaution to keep it from getting stuck
    
    if(!all(A[,AlOcc != 1] == 0)){
      A <- A[,AlOcc != 1, drop = FALSE] # don't copy fixed alleles, unless that's the only way to resolve things
    }
    
    # choose best allele to make homoplasious for this round
    tocopy <- which(A == max(A), arr.ind=TRUE)
    if(dim(tocopy)[1]==1){ # if one is clearly the best
      thisallele <- dimnames(A)[[2]][tocopy[,"col"]]
      thissubgen <- tocopy[,"row"]
    } else { # choose among several with the highest scores
      if(!is.null(fisherResults$p.values.neg)){ # use p values
        meanp <- numeric(dim(tocopy)[1])
        for(i in 1:dim(tocopy)[1]){
          # alleles already in this genome
          thisgenal <- alleles[G[tocopy[i,"row"],] == 1]
          # remove any fixed alleles
          thisgenal <- thisgenal[!thisgenal %in% alleles[AlOcc == 1]]
          # get mean p values between this allele and alleles already in genome
          thisal <- dimnames(A)[[2]][tocopy[i,"col"]]
          meanp <- mean(fisherResults$p.values.neg[thisgenal,thisal])
        }
        bestp <- which(meanp == min(meanp))
        if(length(bestp) > 1){ # if there's a tie, choose at random
          bestp <- sample(bestp, size=1)
        }
        thisallele <- dimnames(A)[[2]][tocopy[bestp,"col"]]
        thissubgen <- tocopy[bestp,"row"]
      } else { # pick at random if there are no p-values to check
        myrand <- sample(dim(tocopy)[1], size=1)
        thisallele <- dimnames(A)[[2]][tocopy[myrand,"col"]]
        thissubgen <- tocopy[myrand,"row"]
      }
    }
    G[thissubgen, thisallele] <- 1
    
    # check genotypes against new G matrix
    temp <- makeA()
    A <- temp[[1]]
    prop.error <- temp[[2]]
  } # end of while loop
  return(list(locus=L, SGploidy=SGploidy, assignments=G, 
              proportion.inconsistent.genotypes = prop.error))
}

# function to assign alleles to subgenomes based on the procedure of
# Catalan et al. 2006 (Genetics 172:1939-1953).
# If any genotypes have fewer alleles than n.subgen, there must be homoplasy,
# so exclude locus. (In their paper, <2 alleles for allotetraploid.)
# Any genotypes with as many alleles as n.subgen (2 for allotetraploid) must be
# homozygous at each isolocus, so these are used first to assign alleles to
# isoloci.
# Check that this assignment is consistent with the genotypes that have a
# number of alleles > n.subgen, or if there is homoplasy.
# This is not stated explicitly in paper, but genotypes with n.subgen*SGploidy
# alleles could be used to further resolve ambiguity if genotypes with
# n.subgen alleles are not sufficient.
catalanAlleles <- function(object, samples=Samples(object), locus=1,
                           n.subgen=2, SGploidy=2, verbose=FALSE){
    # subset object if needed
    object <- object[samples,]
    # check missing data
    mymissing <- isMissing(object, loci=locus)
    if(all(mymissing)) stop("All data at locus are missing.")
    # only do one locus at a time
    if(length(locus) > 1) stop("More than one locus")
    # subset dataset object by this locus
    object <- object[!mymissing,locus]
    # perform class conversion if necessary
    if(is(object, "genbinary")) object <- genbinary.to.genambig(object)
    # error if not the right class
    if(!is(object, "genambig")) stop("genambig or genbinary object needed.")
    # error if just one subgenome
    if(n.subgen<2) stop("Need at least two subgenomes.")

    # get the number of alleles in each genotype
    numAl <- sapply(Genotypes(object), length)
    # check to see that none have too many alleles
    if(!all(numAl <= n.subgen*SGploidy))
        stop(paste("One or more genotypes have more than",
                   n.subgen*SGploidy, "alleles."))

    # check to see if any genotypes have too few alleles, implying homoplasy
    if(!all(numAl >= n.subgen)){
        result <- list(locus=Loci(object), SGploidy=SGploidy,
    assignments="Homoplasy or null alleles: some genotypes have too few alleles")
    } else {
        # set up results matrix
        alleles <- .unal1loc(object, Samples(object), locus)
        G <- matrix(0, nrow=n.subgen, ncol=length(alleles),
                    dimnames=list(NULL, as.character(alleles)))
        # start with genotypes that have n.subgen alleles
        X <- Genotypes(object)[numAl==n.subgen,]
        if(length(X)==0){  # quit if there aren't any
            result <- list(locus=Loci(object), SGploidy=SGploidy,
                assignments="Unresolvable: no genotypes with n.subgen alleles.")
        } else {
            X <- unique(X) # get unique genotypes w/ n. subgen alleles
            # vector to rank genotypes by allele frequencies
            AlFreqInX <- sapply(alleles,
                                function(a) sum(sapply(X,
                                                       function(x) a %in% x)))
            names(AlFreqInX) <- as.character(alleles)
            Xrank <- sapply(X, function(x) sum(AlFreqInX[as.character(x)]))
            # sort genotypes, putting the most interconnected first
            X <- X[order(Xrank, decreasing=TRUE)]

            # use the first one
            gen <- as.character(X[[1]])
            for(i in 1:n.subgen){
                G[i,gen[i]] <- 1
            }
            ToAssign <- alleles[colSums(G)==0] # alleles to assign
            Xa <- X[sapply(X, function(x) sum(ToAssign %in% x)==1)]

            # try using the rest of the genotypes with n.subgen alleles
             while(length(Xa)>0){
                 gen <- as.character(Xa[[1]])
                    g <- G[,gen]
                    thissubgen <- rowSums(g)==0
                    thisallele <- gen[colSums(g)==0]
                    G[thissubgen,thisallele] <- 1
                    ToAssign <- alleles[colSums(G)==0] # alleles to assign
                    Xa <- X[sapply(X, function(x) sum(ToAssign %in% x)==1)]
             }

            # if needed, use heterozygous genotypes to further resolve.
            if(!all(colSums(G)!=0)){
                # get genotypes that will be informative
                X <- Genotypes(object)[,1]
                X <- unique(X)
                     # one allele to assign, one subgenome has no alleles yet
                Xa <- X[sapply(X, function(x) (sum(ToAssign %in% x)==1 &&
                        sum(rowSums(G[,as.character(x)])==0)==1) ||
                     # OR, all subgenomes but one have max number of alleles
                        (sum(ToAssign %in% x) > 0 &&
                    sum(rowSums(G[,as.character(x)])==SGploidy)==n.subgen-1))]
                while(length(Xa)>0){
                    # use the first genotype in the list to assign allele(s)
                    g <- G[,as.character(Xa[[1]])]
                    thissubgen <- rowSums(g) == min(rowSums(g))
                    thisallele <- dimnames(g)[[2]][colSums(g)==0]
                    G[thissubgen,thisallele] <- 1

                    # update list of unassigned alleles and useful genotypes
                    ToAssign <- alleles[colSums(G)==0]
                    Xa <- X[sapply(X, function(x) (sum(ToAssign %in% x)==1 &&
                        sum(rowSums(G[,as.character(x)])==0)==1) ||
                        (sum(ToAssign %in% x) > 0 &&
                    sum(rowSums(G[,as.character(x)])==SGploidy)==n.subgen-1))]
                }
            }

            # check results
            if(!all(colSums(G)!=0)){
                result <- list(locus=Loci(object), SGploidy=SGploidy,
                                   assignments="Unresolvable")
                if(verbose) print(G)
            } else {
                # loop through each genotype to check consistency with results
                badgen <- list()
                bgIndex <- 1
                for(s in Samples(object)){
                    g <- G[,as.character(Genotype(object, s, locus))]
                    gr <- rowSums(g)
                    if(!all(gr > 0 & gr <= SGploidy)){
                        badgen[[bgIndex]] <- Genotype(object,s,locus)
                        bgIndex <- bgIndex+1
                    }
                }
                if(length(badgen)==0){
                    result <- list(locus=Loci(object), SGploidy=SGploidy,
                                   assignments=G)
                } else {
                    if(verbose){
                        cat("Allele assignments:", sep="\n")
                        print(G)
                        cat("Inconsistent genotypes:", sep="\n")
                        print(badgen)
                    }
                    result <- list(locus=Loci(object), SGploidy=SGploidy,
                                   assignments="Homoplasy or null alleles")
                }
            }
        }
    }

    if(verbose) print(result)
    return(result)
}

# function to merge results from the same locus from multiple populations.
# x is a list of lists; each element of x is a result such as that
# produced by catalanAlleles or testAlGroups.
mergeAlleleAssignments <- function(x){
    loci <- sapply(x, function(z) z$locus)   # extract locus names
    SGp <- sapply(x, function(z) z$SGploidy) # extract ploidies
    lociU <- unique(loci)
    if(is.list(loci) || is.list(SGp) || !all(!is.na(lociU)) || !all(!is.na(SGp)))
        stop("Locus and SGploidy value required for each list element.")
    # set up results object
    results <- list()
    length(results) <- length(lociU)

    # function to help with merging
    onezero <- function(y,z){
        res <- y+z
        res[res==2] <- 1
        return(res)
    }

    # loop through each locus
    for(L in 1:length(lociU)){
        locus <- lociU[L]
        thisSGp <- unique(SGp[loci==locus]) # get ploidy for this locus
        if(length(thisSGp)>1) stop("More than one SGploidy value per locus.")
        # get allele assignments
        A <- lapply(x[loci==locus], function(z) z$assignments)
        # throw out any that are not a matrix
        A <- A[sapply(A, function(z) is.matrix(z))]
        if(length(A)==0){
            G <- "No assignment"
        } else {
        # merge assignment matrices, or just keep the first if there is only one
            # sort A so that matrices with the most alleles are used first
            A <- A[order(-sapply(A, function(x) dim(x)[2]),1:length(A))]
            # get unique alleles
            alleles <- unique(unlist(lapply(A, function(z) dimnames(z)[[2]])))
            # set up matrix
            G <- matrix(0, nrow=dim(A[[1]])[1], ncol=length(alleles),
                        dimnames=list(NULL, alleles))
            # fill in from first matrix
            G[,dimnames(A[[1]])[[2]]] <- A[[1]]
            A <- A[-1]
            # remaining matrices
            stuckcount <- 0
            while(length(A)>0){
                if(stuckcount==length(A)){
                    G <- "No assignment"
                    break
                }
                stuckcount <- 0  # to prevent endless looping
                toremove <- integer(0)
                for(i in 1:length(A)){
                    a <- A[[i]]
                    assigned <- alleles[colSums(G)>0]
                    # find alleles present in current matrix that are already assigned
                    overlap <- assigned[assigned %in% dimnames(a)[[2]][colSums(a)>0]]
                    if(length(overlap)==0 || all(G[,overlap] == 1) || all(a[,overlap] == 1)){
                        stuckcount <- stuckcount+1
                        next
                    }
                    # try to match up rows
                    groups <- kmeans(rbind(G[,overlap,drop = FALSE],a[,overlap,drop = FALSE]),dim(G)[1])
                    # check if matching makes sense
                    if(!all(1:dim(G)[1] %in% groups$cluster[1:dim(G)[1]]) ||
                       groups$betweenss/groups$totss <= 0.5){
                      stuckcount <- stuckcount + 1
                      next
                    }
                      # combine matrices
                    G[groups$cluster[1:dim(G)[1]],dimnames(a)[[2]]] <-
                      onezero(G[groups$cluster[1:dim(G)[1]],dimnames(a)[[2]]],
                              a[groups$cluster[-(1:dim(G)[1])],])
                    toremove <- c(toremove,i) # done with this matrix
                }
                # get rid of matrices that have already been used
                if(length(toremove)>0) A <- A[-toremove]
            }
        }
        results[[L]] <- list(locus=locus, SGploidy=thisSGp,
                             assignments=G)
    }
    return(results)
}

# function to plot results of K-means clustering, to get a sense of clustering quality.
# takes 2D array of results of AlleleCorrelations; locus x population
plotSSAllo <- function(AlCorrArray){
  loci <- dimnames(AlCorrArray)[[1]] # locus names
  pops <- dimnames(AlCorrArray)[[2]] # population names
  n.subgen <- dim(AlCorrArray[[1,1]]$Kmeans.groups)[1] # number of isoloci
  
  # get array of sums of squares ratios, to evaluate clustering qualities
  SSarray <- array(sapply(AlCorrArray, FUN = function(x) x$betweenss/x$totss), dim = dim(AlCorrArray),
                   dimnames = list(loci, pops))
  SSarray[is.na(SSarray)] <- 0
  # get array of index of evenness of distribution of alleles among isoloci
  Earray <- array(sapply(AlCorrArray, FUN = function(x){
    propAl <- rowMeans(x$Kmeans.groups) # proportion of alleles belonging to each isolocus
    return(1 - sum(propAl^2))
  }), dim = dim(AlCorrArray), dimnames = list(loci, pops))
  
  # identify loci with positive allele correlations
  posCor <- sapply(AlCorrArray, FUN = function(x) any(x$significant.pos, na.rm = TRUE))
  posCor <- array(posCor, dim = dim(AlCorrArray), dimnames = list(loci, pops))
  
  # colors for populations
  if(length(pops) == 1){  # black and white if one pop
    popCol = "black"
    bgCol = "white"
  } else {                # colors for multiple pops, with grey background for visibility
    bgCol = "lightgrey"
    if(length(pops) == 2){
      popCol = c("red", "blue")
    } else {
      popCol = rainbow(length(pops))
    }
  }
  
  # set background color
  oldBg = par()$bg
  par(bg = bgCol)
  
  # determine plot range
  maxE <- 1 - n.subgen * 1/n.subgen^2 # maximum evenness
  # maximum number of alleles
  maxNalleles <- max(sapply(AlCorrArray, FUN = function(x) dim(x$Kmeans.groups)[2]))
  # minimum evenness
  minE <- 1 - (n.subgen - 1) * 1/maxNalleles^2 - ((maxNalleles - n.subgen + 1)/maxNalleles)^2
  # sums of squares, minimum and maximum
  minSS <- 0
  maxSS <- 1
  # xlim and ylim
  if(length(pops) == 1){
    Xlim <- c(minSS, maxSS)
  } else { # leave room for legend if there are multiple pops
    Xlim <- c(minSS, maxSS + (maxSS - minSS)/4)
  }
  Ylim <- c(minE, maxE)
  
  # set up plot
  plot(as.vector(SSarray), as.vector(Earray), col = bgCol, main = "K-means results",
       xlab = "Sums of squares between isoloci/total sums of squares",
       ylab = "Evenness of number of alleles among isoloci", xlim = Xlim, ylim = Ylim)
  # text to indicate good vs. bad
  text(minSS, minE, labels = "Low quality allele clustering", adj = 0)
  text(maxSS, maxE, labels = "High quality allele clustering", adj = 1)
  
  # plot all loci by population
  for(p in 1:length(pops)){
    text(SSarray[,p], Earray[,p], labels = loci, col = popCol[p], font = ifelse(posCor[,p], 3, 1))
  }
  
  # add legend if there are multiple populations
  if(length(pops) > 1){
    legend(maxSS, 0.75 * (Ylim[2] - Ylim[1]) + Ylim[1], legend = pops,
           text.col = popCol, title = "Populations", title.col = "black")
  }
  
  # restore background color
  par(bg = oldBg)
  
  # return the three arrays invisibly
  return(invisible(list(ssratio = SSarray, evenness = Earray, 
                        max.evenness = maxE, min.evenness = minE,
                        posCor = posCor)))
} # end plotSSAllo function

# for one population, plot the proportion of alleles that are homoplasious for
# each locus and parameter set.  Can also be used to plot missing data rate for
# each parameter set and locus.
plotParamHeatmap <- function(propMat, popname = "AllInd", col = grey.colors(12)[12:1],
                             main = ""){
  if(length(dim(propMat)) == 3){
    # index by population if not already done
    propMat <- propMat[,popname,]
  }
  if(length(dim(propMat)) != 2){
    stop("propMat must have two dimensions.")
  }
  nloc <- dim(propMat)[1]
  nparam <- dim(propMat)[2]
  image(1:nparam, 1:nloc, t(propMat), xlab = "Parameter sets", ylab = "Loci",
        main = paste(main,popname), axes = FALSE, col = col, zlim = c(0,1))
  axis(1, at = 1:nparam)
  axis(2, at = 1:nloc, labels = dimnames(propMat)[[1]])
  
  # highlight the best parameter set for each locus
  for(i in 1:nloc){
    text(which.min(propMat[i,]),i, "best")
  }
}

# function to process an entire dataset and find optimal parameters 
# for testAlGroups
# plotsfile can be NULL
# usePops: should allele assignments be performed separately by PopInfo in the object?
processDatasetAllo <- function(object, samples = Samples(object), loci = Loci(object),
                               n.subgen = 2, SGploidy = 2, n.start = 50, alpha = 0.05,
                               parameters = data.frame(tolerance     = c(0.05, 0.05, 0.05, 0.05),
                                                       swap = c(TRUE, FALSE, TRUE, FALSE),
                                                       null.weight   = c(0.5,  0.5,  0,    0)),
                               plotsfile = "alleleAssignmentPlots.pdf", usePops = FALSE, ...){
  if(!all(samples %in% Samples(object))){
    stop("Sample names in samples and object do not match.")
  }
  if(!all(loci %in% Loci(object))){
    stop("Locus names in loci and object do not match.")
  }
  if(!all(c("tolerance", "swap", "null.weight") %in% names(parameters))){
    stop("Parameters data frame needs column headers tolerance, swap, and null.weight.")
  }
  
  object <- object[samples,]
  nparam <- dim(parameters)[1] # number of parameter sets
  
  # set up population groups
  if(usePops){
    popinfo <- PopInfo(object)
    if(any(is.na(popinfo))){
      stop("PopInfo needed in object if usePops = TRUE.")
    }
    popnamesTemp <- PopNames(object)[popinfo]
    
    allpops <- unique(popnamesTemp)          # names of all populations
    popinfo <- match(popnamesTemp, allpops)  # numbers for inviduals, corresponding to those names
    npops <- length(allpops)                 # number of populations
  } else {
    popinfo <- rep(1, length(samples))
    allpops <- "AllInd"
    npops <- 1
  }
  
  # set up arrays to contain results
  # results of alleleCorrelations; 2D array, locus x population
  CorrResults <- array(list(), dim = c(length(loci), npops), 
                       dimnames = list(loci, allpops))
  # results of testAlGroups; 3D array, locus x population x parameter set
  TAGresults <- array(list(), dim = c(length(loci), npops, nparam),
                     dimnames = list(loci, allpops, NULL))
  
  for(p in 1:npops){  # loop through populations
    thesesamples <- samples[popinfo == p] # samples in this population
    for(L in loci){  # loop through loci
      CorrResults[[L, p]] <- alleleCorrelations(object, samples = thesesamples, locus = L,
                                                alpha = alpha, n.subgen = n.subgen, n.start = n.start)
      for(pm in 1:nparam){
        TAGresults [[L, p, pm]] <- testAlGroups(object, CorrResults[[L, p]], SGploidy = SGploidy,
                                                samples = thesesamples, null.weight = parameters$null.weight[pm],
                                                tolerance = parameters$tolerance[pm], swap = parameters$swap[pm], ...)
      } # end of parameter set loop
    } # end of locus loop
  } # end of populations loop
  
  # get the proportion of alleles that are homoplasious for each locus, population, and parameter set
  propHomoplasious <- sapply(TAGresults, FUN = function(x){mean(colSums(x$assignments) > 1)})
  propHomoplasious <- array(propHomoplasious, dim = dim(TAGresults), dimnames = dimnames(TAGresults))
  
  # merge allele assignments across populations within parameter sets
  if(usePops){
    mergedAssignments <- array(list(), dim = c(length(loci), nparam),
                               dimnames = list(loci, NULL))
    for(L in loci){
      theseSamples <- samples[!isMissing(object, samples, L)]
      for(pm in 1:nparam){
        mergedAssignments[L,pm] <- mergeAlleleAssignments(TAGresults[L,,pm])
        # figure out proportion of genotypes inconsistent with the merged assignments
        totInconsistent <- 0
        if(is.matrix(mergedAssignments[[L,pm]]$assignments)){
          for(s in theseSamples){
            gen <- as.character(Genotype(object, s, L))
            subg <- mergedAssignments[[L,pm]]$assignments[, gen, drop = FALSE]
            if(any(rowSums(subg) == 0) || 
               any(rowSums(subg[,colSums(subg) == 1, drop = FALSE]) > SGploidy)){
                 totInconsistent <- totInconsistent + 1
            }
          }
        } else {
          totInconsistent <- length(theseSamples)
        }
        mergedAssignments[[L,pm]]$proportion.inconsistent.genotypes <- totInconsistent/length(theseSamples)
      }
    }
    
    # get the proportion of homoplasious alleles for each merged assignment set
    propHomoplMerged <- sapply(mergedAssignments, FUN = function(x){
      if(is.matrix(x$assignments)){
        return(mean(colSums(x$assignments) > 1))
      } else {  # if merged assignments were unresolvable
        return(1)
      }
    })
    propHomoplMerged <- array(propHomoplMerged, dim = dim(mergedAssignments),
                              dimnames = dimnames(mergedAssignments))
  } else {
    mergedAssignments <- TAGresults[,1,]
    propHomoplMerged <- propHomoplasious[,1,]
  }
  
  # find the best assignment set for each locus
  bestpm <- integer(length(loci)) # index of best parameter set for each locus
  names(bestpm) <- loci
  # matrix to hold missing data rate when data are recoded
  missRate <- matrix(1, nrow = length(loci), ncol = nparam, dimnames = list(loci, NULL))
  for(L in loci){
    theseAssign <- list() # contain all unique assignments
    theseAssignIndex <- integer(nparam) # for each parameter set, which unique assignment goes with it
    # find all unique assignments
    for(pm in 1:nparam){
      if(!is.matrix(mergedAssignments[[L,pm]]$assignments)) next # skip if unresolvable
      
      thisAssign <- data.frame(mergedAssignments[[L,pm]]$assignments)
      # sort the assignments
      thisAssign <- thisAssign[do.call(order, thisAssign),]
      # order the row names for consistency
      row.names(thisAssign) <- 1:dim(thisAssign)[1] 
      
      # check if it should be added to list of uniques assignments, and do so
      matchUn <- sapply(theseAssign, function(x) identical(x, thisAssign))
      if(length(theseAssign) == 0 || all(!matchUn)){
        theseAssign[[length(theseAssign) + 1]] <- thisAssign
        theseAssignIndex[pm] <- length(theseAssign)
      } else {
        theseAssignIndex[pm] <- which(matchUn)
      }
    }
    
    if(all(theseAssignIndex == 0)) next # skip whole locus if totally unresolvable across pops
    
    # test recoding the data using each parameter set
    for(a in 1:length(theseAssign)){
      # turn assignments back into matrix
      amat <- as.matrix(theseAssign[[a]])
      dimnames(amat)[[2]] <- gsub("X", "", dimnames(amat)[[2]])
      # recode genotypes
      r <- recodeAllopoly(object, list(list(locus = L, SGploidy = SGploidy,
                                            assignments = amat)),
                           allowAneuploidy = FALSE, loci = L)
      # proportion of genotypes present in the original that are missing now
      oldMissing <- sum(isMissing(object, loci = L))
      miss <- (sum(isMissing(r))/n.subgen - oldMissing)/(length(samples) - oldMissing)
      # add to matrix for all matching parameter sets
      missRate[L, which(theseAssignIndex == a)] <- miss
    }
    # identify the best set using missing data rate and amount of homoplasy
    bestMiss <- which(missRate[L,] == min(missRate[L,], na.rm = TRUE))
    bestpm[L] <- bestMiss[which.min(propHomoplMerged[L, bestMiss])]
  } # end of locus loop for recoding and picking best parameter set
  
  # make a list of the best assignments
  bestpm[bestpm == 0] <- 1 # for non-resolvable loci
  bestAssign <- list()
  for(l in 1:length(loci)){
    bestAssign[[l]] <- mergedAssignments[[l, bestpm[l]]]
  }
  
  ## plots: distribution of betweenss/totss, heatmaps
  pdf(plotsfile) # open connection to PDF file for making plots
  plotSS <- plotSSAllo(CorrResults) # make plot of sums-of-squares ratio vs. allele distribution evenness
  # heatmaps
  for(L in loci){
    for(p in allpops){
      if(!is.null(CorrResults[[L,p]]$heatmap.dist)){
        heatmap(CorrResults[[L,p]]$heatmap.dist, main = paste(L, p, sep = ", "))
      } else {
        plot(NA, xlim = c(0,10), ylim = c(0,10), main = paste(L, p, sep = ", "))
        text(5,5, "Fisher's exact tests not performed.\nOne or more alleles are present in all individuals.")
      }
    }
  }
  for(p in allpops){
    plotParamHeatmap(propHomoplasious[,p,], popname = p, main = "Proportion homoplasious loci:")
  }
  if(usePops){
    plotParamHeatmap(propHomoplMerged, popname = "Merged across populations", main = "Proportion homoplasious loci:")
  }
  plotParamHeatmap(missRate, popname = "All Individuals", main = "Missing data after recoding:")
  dev.off()      # close connection to PDF file
  
  return(list(AlCorrArray = CorrResults, TAGarray = TAGresults, plotSS = plotSS,
              propHomoplasious = propHomoplasious, mergedAssignments = mergedAssignments,
              propHomoplMerged = propHomoplMerged, missRate = missRate, bestAssign = bestAssign))
} # end of processDatasetAllo function

# function to rewrite genambig object using allele assignments
# object = genambig or genbinary object
# x = list of results (e.g. list of catalanAlleles outputs)
# allowAneuploidy = let individuals be aneuploid vs. inserting missing data
recodeAllopoly <- function(object, x, allowAneuploidy=TRUE,
                           samples=Samples(object), loci=Loci(object)){
    # data conversion and error checking
    if(is(object, "genbinary")) object <- genbinary.to.genambig(object)
    if(!is(object, "genambig"))
        stop("'genbinary' or 'genambig' object required.")
    object <- object[samples, loci]
    lociA <- sapply(x, function(z) z$locus)   # extract locus names
    SGp <- sapply(x, function(z) z$SGploidy) # extract ploidies
    if(is.list(lociA) || is.list(SGp) || !all(!is.na(lociA)) ||
       !all(!is.na(SGp)))
        stop("Locus and SGploidy value required for each list element in x.")
    if(!all(lociA %in% loci)){
        warning(paste("Loci", paste(lociA[!lociA %in% loci], collapse=" "),
                      "found in 'x' but not 'loci'."))
        x <- x[lociA %in% loci]
        if(length(x)==0)
            stop("No recoding possible because locus names do not match")
        lociA <- sapply(x, function(z) z$locus)   # extract locus names
        SGp <- sapply(x, function(z) z$SGploidy) # extract ploidies
    }
    # merge assignments if any loci are represented more than once
    if(!all(table(lociA)==1)){
        x <- mergeAlleleAssignments(x)
        lociA <- sapply(x, function(z) z$locus)   # extract locus names
        SGp <- sapply(x, function(z) z$SGploidy) # extract ploidies
    }
    # finish getting info by locus set up.
    # lociA = loci to recode; SGp = ploidy of isoloci, by locus;
    # nSG = number of isoloci, by locus; A = allele assignments, by locus.
    names(SGp) <- lociA
    nSG <- sapply(x, function(z) ifelse(is.matrix(z$assignments),
                                        dim(z$assignments)[1], NA))
    names(nSG) <- lociA
    A <- lapply(x, function(z) z$assignments)
    names(A) <- lociA
    # get rid of loci with no assignments
    lociA <- lociA[!is.na(nSG)]
    SGp <- SGp[lociA]
    nSG <- nSG[lociA]
    A <- A[lociA]

    # set up new object
    # duplicate the loci that should be duplicated
    newloci <- paste(rep(lociA, times=nSG),unlist(lapply(nSG, function(y) 1:y)),
                     sep="_")
    extraloci <- loci[!loci %in% lociA]
    newloci <- c(newloci, extraloci)
    result <- new("genambig", samples, newloci)
    PopNames(result) <- PopNames(object)
    PopInfo(result) <- PopInfo(object)[samples]
    Description(result) <- Description(object)
    Missing(result) <- Missing(object)
    newploidies <- matrix(rep(SGp, times=nSG), nrow=length(samples),
                             ncol=sum(!newloci %in% extraloci),
                             byrow=TRUE)
    # transfer ploidy data for loci that aren't being recoded
    if(length(extraloci) > 0){
        if(is(object@Ploidies, "ploidymatrix")){
            newploidies <- cbind(newploidies,
                                 Ploidies(object, samples=samples,
                                          loci=extraloci))
        }
        if(is(object@Ploidies, "ploidylocus")){
            newploidies <- cbind(newploidies,
                                 matrix(Ploidies(object, loci=extraloci),
                                        nrow=length(samples), ncol=length(extraloci),
                                        byrow=TRUE))
        }
        if(is(object@Ploidies, "ploidysample") || is(object@Ploidies, "ploidyone")){
            newploidies <- cbind(newploidies,
                                 matrix(Ploidies(object, loci=extraloci),
                                        nrow=length(samples), ncol=length(extraloci),
                                        byrow=FALSE))
        }
    }
    Ploidies(result) <- newploidies

    # collapse ploidy if we don't need it as a matrix
    if(!allowAneuploidy && plCollapse(result@Ploidies, na.rm=FALSE, returnvalue=FALSE)){
      result <- reformatPloidies(result, output="collapse")
    }

    # start assignments
    for(L in loci){
        # get positions of isoloci in new object
        Lindex <- which(sapply(strsplit(newloci, "_"), function(y) y[1]) == L)
        # copy over microsat repeat lengths
        Usatnts(result)[Lindex] <- Usatnts(object)[L]
        if(length(Lindex)==1){ # for loci with no assignments:
            Genotypes(result, loci=Lindex) <- Genotypes(object, loci=L)
            Ploidies(result)[,Lindex] <- Ploidies(object, Samples(object), L)
        } else {               # for loci with assignments:
            # test if this locus is numeric or integers, to convert back later
            xsamples <- samples[!isMissing(object, samples, L)]
            LisInt <- all(sapply(Genotypes(object, xsamples, L),
                                 is.integer, USE.NAMES=FALSE))
            LisNum <- all(sapply(Genotypes(object, xsamples, L),
                                 is.numeric, USE.NAMES=FALSE))
            # If there are any alleles that are unassigned, make them
            # fully homoplasious.
            AL <- A[[L]]
            allelesL <- as.character(.unal1loc(object, xsamples, L))
            allelesUA <- allelesL[!allelesL %in% dimnames(AL)[[2]]]
            if(length(allelesUA)>0){
                toadd <- matrix(1, nrow=length(Lindex), ncol=length(allelesUA),
                                dimnames=list(NULL, allelesUA))
                AL <- cbind(AL, toadd)
            }
        for(s in xsamples){
            thisgen <- as.character(Genotype(object, s, L))
            thisA <- AL[,thisgen, drop=FALSE]
            # too many alleles overall?
            if(length(thisgen) > nSG[L]*SGp[L]){
                next # just skip, or see if there is anything useful?
            }

            SGcomplete <- rep(FALSE, nSG[L]) # mark where genotype is known
                                             # for certain
            # set up list to contain genotypes
            alBySG <- rep(list(character(0)), nSG[L])
            # give up on trying to assign?
            stumped <- FALSE
            # Allele assigment loop
           while(sum(!SGcomplete)>0 & !stumped){
                stumped <- TRUE
                # sub genomes that don't yet have definite genotypes:
                SGtoassign <- (1:nSG[L])[!SGcomplete]
            # see if any isoloci have max number of alleles w/o homoplasy.
            for(sg in SGtoassign){
                sgAl <- thisgen[thisA[sg,]==1 &    # Belong to this subgenome,
                                colSums(thisA)==1] # and non-homoplasious.
                if(length(sgAl)>=SGp[L]){
                    alBySG[[sg]] <- sgAl # assign these alleles to this isolocus
                    SGcomplete[sg] <- TRUE # mark as done
                    stumped <- FALSE
                    # Change thisA to reflect that this isolocus can't have
                    # any other alleles.
                    thisA[sg,-match(sgAl, thisgen)] <- 0
                } else {
                # Also check if there is only one allele that could be assigned
                    # to this isolocus.
                    sgAl <- thisgen[thisA[sg,]==1]
                    if(length(sgAl)==1 || # only one allele could be assigned,
                      # or there's no chance of homoplasy (tetrasomic or higher)
                       all(colSums(thisA[,sgAl])==1)){
                         alBySG[[sg]] <- sgAl
                         SGcomplete[sg] <- TRUE
                         stumped <- FALSE
                    }
                }
            }}

            # check if any have >SGp alleles, and whether we are allowing
            # aneuploidy; change ploidy values if necessary.
            nAl <- sapply(alBySG, length)
            if(!all(nAl <= SGp[L])){
                if(allowAneuploidy){
                    newploidies <- nAl
                    while(sum(newploidies) < nSG[L]*SGp[L]){
                newploidies[newploidies==min(newploidies[newploidies!=0])] <-
                  newploidies[newploidies==min(newploidies[newploidies!=0])] + 1
                    }
                    Ploidies(result)[s,Lindex] <- newploidies
                } else {
                    next # leave as missing data if no aneuploidy allowed
                }
            }

            # convert alleles back to numbers if necessary
            if(LisInt){
                alBySG <- lapply(alBySG, as.integer)
            } else {
                if(LisNum){
                    alBySG <- lapply(alBySG, as.numeric)
                }
            }
            # convert zero-length genotypes to missing data
            alBySG[nAl==0] <- Missing(result)

            # add genotypes to genambig object
            Genotypes(result, samples=s,loci=Lindex) <- alBySG
        } # end of sample loop
        } # end of "else" clause for if allele assignments need to be done
    } # end of locus loop
    return(result)
} # end of function

# polysat should have its own theme song to the tune of "Smelly Cat" from
# Friends.
Any scripts or data that you put into this service are public.
polysat documentation built on Aug. 23, 2022, 5:07 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
polysat
Tools for Polyploid Microsatellite Analysis

R/allopolyploidy.R
In polysat: Tools for Polyploid Microsatellite Analysis

Defines functions recodeAllopoly processDatasetAllo plotParamHeatmap plotSSAllo mergeAlleleAssignments catalanAlleles testAlGroups alleleCorrelations simAllopoly

Documented in alleleCorrelations catalanAlleles mergeAlleleAssignments plotParamHeatmap plotSSAllo processDatasetAllo recodeAllopoly simAllopoly testAlGroups

Try the polysat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polysat Tools for Polyploid Microsatellite Analysis

R/allopolyploidy.R In polysat: Tools for Polyploid Microsatellite Analysis

Defines functions recodeAllopoly processDatasetAllo plotParamHeatmap plotSSAllo mergeAlleleAssignments catalanAlleles testAlGroups alleleCorrelations simAllopoly

Documented in alleleCorrelations catalanAlleles mergeAlleleAssignments plotParamHeatmap plotSSAllo processDatasetAllo recodeAllopoly simAllopoly testAlGroups

Try the polysat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polysat
Tools for Polyploid Microsatellite Analysis

R/allopolyploidy.R
In polysat: Tools for Polyploid Microsatellite Analysis