R/individual_distance.R

Defines functions Bruvo2.distance assignClones meandistance.matrix2 genotypeProbs find.na.dist.not.missing find.missing.gen find.na.dist meandist.from.array meandistance.matrix Lynch.distance Bruvo.distance

Documented in assignClones Bruvo2.distance Bruvo.distance find.missing.gen find.na.dist find.na.dist.not.missing genotypeProbs Lynch.distance meandistance.matrix meandistance.matrix2 meandist.from.array

Bruvo.distance <- function(genotype1, genotype2, maxl=10, usatnt=2, missing=-9) {
    
  if(is.na(usatnt)) stop("Bruvo.distance needs info from Usatnts slot.")

  if(identical(genotype1, genotype2) && genotype1[1]!=missing){
    dist <- 0 # If genotypes are identical, just return a distance of zero without doing the rest of the calculation.
  } else {
    if((length(genotype1) > maxl && length(genotype2) > maxl) | # If genotypes are both longer than 10, skip this calculation because it will take over a minute.
       genotype1[1] == missing | genotype2[1] == missing){
      dist <- NA
    } else {
      # Whichever genotype has more alleles, make this genotypeL (long) and the other genotypeS (short).
      # convert alleles into repeat counts by dividing by usatnt
      if(length(genotype1) >= length(genotype2)){
        genotypeL <- genotype1/usatnt
        genotypeS <- genotype2/usatnt
      } else {
        genotypeL <- genotype2/usatnt
        genotypeS <- genotype1/usatnt
      }

      kl <- length(genotypeL) # sets the ploidy level for this genotype comparison
      ks <- length(genotypeS) # number of alleles in the shorter genotype

      allele.distances <- array(0 , c(kl,ks))
      # Create an empty matrix to contain the raw distances between alleles

      for(n in 1:kl){
         for(m in 1:ks){
           allele.distances[n,m] <- genotypeL[n] - genotypeS[m]
        }
      }
      # fills the array with the differences in allele repeat count
      
#      allele.distances <- sweep(matrix(genotypeL, nrow = kl, ncol = ks), 2, genotypeS) 
      # alternative calculation differences in repeat count; seems to be slower in many cases
      
      geometric.distances <- 1 - 2^-abs(allele.distances)
      # geometric transformation based on mutation probabilities

      #Next, find the minimum distance sum among all permutations
      getalldist <- function(dmat, nrow){ # recursive function for going through all permutations
        thesedist <- dmat[,1]
        if(dim(dmat)[2] > 1){
          newdist <- numeric(0)
          for(i in 1:nrow){
            newdist <- c(newdist, thesedist[i] + getalldist(dmat[-i,-1,drop=FALSE], nrow - 1))
          }
          thesedist <- newdist
        }
        return(thesedist) # returns all possible distances
      }
      mindist <- min(getalldist(geometric.distances, kl))
      dist <- (mindist+kl-ks)/kl
      # add 1 for each infinite virtual allele, then divide by the ploidy
    }
  }
return(dist)
}

Lynch.distance<-function(genotype1,genotype2,usatnt=NA,missing=-9){
    if(genotype1[1]==missing || genotype2[1]==missing){
        # return NA if there is any missing data
        distance<-NA
    } else {
        # make sure each allele is only listed once
        genotype1 <- unique(genotype1)
        genotype2 <- unique(genotype2)
        # get the average number of bands for the two genotypes
        meanbands<-(length(genotype1)+length(genotype2))/2
        # find how many bands the genotypes have in common
        commonbands<- sum(genotype1 %in% genotype2)
        # calculate the distance
        distance<- 1-(commonbands/meanbands)
    }

    # return distance
    return(distance)
}


meandistance.matrix <- function(object, samples=Samples(object),
                                 loci=Loci(object), all.distances=FALSE,
                                 distmetric=Bruvo.distance,
                                progress=TRUE, ...){
    # error if not a genambig object
    if(!is(object, "genambig")) stop("\"genambig\" object needed.")
    # subset the object so that samples can be numbered
    object <- object[samples, loci]

    # create an array containing all distances by locus and sample
    loci.matrices<-array(dim=c(length(loci),length(samples),length(samples)),
                         dimnames=list(loci,samples,samples))
    ms <- Missing(object)
    for(L in loci){
      thisind <- genIndex(object, locus = L) # get all unique genotypes
      un <- Usatnts(object)[L]
      Nunique <- length(thisind$uniquegen)
      if(progress) cat(paste("Beginning locus", L, "..."), sep = "\n")
      for(m in 1:Nunique){ # loop through pairs of unique genotypes
        for(n in m:Nunique){
          thisdistance <- distmetric(thisind$uniquegen[[m]], thisind$uniquegen[[n]],
                                     usatnt = un, missing = ms, ...)
          loci.matrices[L, thisind$genindex == m, thisind$genindex == n] <- thisdistance
          loci.matrices[L, thisind$genindex == n, thisind$genindex == m] <- thisdistance
        }
      }
      if(progress) cat(paste("Locus", L, "complete"), sep = "\n")
    }

    # calculate the mean matrix across all loci
    mean.matrix <- meandist.from.array(loci.matrices)

    # return either the mean matrix and possibly the array as well
    if(all.distances){
        return(list(DistByLoc=loci.matrices, MeanMatrix=mean.matrix))
    } else {
        return(mean.matrix)
    }
}

meandist.from.array<-function(distarray, samples=dimnames(distarray)[[2]],
                              loci=dimnames(distarray)[[1]]){
    # get the array to be averaged
    subarray<-distarray[loci,samples,samples]
    # calculate means
    mean.matrix <- colMeans(subarray, na.rm=TRUE)
    return(mean.matrix)
}

find.na.dist<-function(distarray, samples=dimnames(distarray)[[2]],
                              loci=dimnames(distarray)[[1]]){
    # number of missing distances
    nmissing <- sum(is.na(distarray[loci, samples, samples]))
    # set up vectors for data frame to contain info on where missing data is
    Locus <- character(nmissing)
    Sample1 <- character(nmissing)
    Sample2 <- character(nmissing)
    # current row in the data frame
    currrow<-1
    # go through the array, find NA, and put the index into the data frame
    for(L in loci){
        for(s1 in samples){
            for(s2 in samples){
                if(is.na(distarray[L,s1,s2])){
                    Locus[currrow]<-L
                    Sample1[currrow]<-s1
                    Sample2[currrow]<-s2
                    currrow<-currrow+1
                }
            }
        }
    }
    #return data frame
    return(data.frame(Locus=Locus, Sample1=Sample1, Sample2=Sample2, stringsAsFactors=FALSE))
}

find.missing.gen<-function(object, samples=Samples(object),
                           loci=Loci(object)){
    # set up vectors to contain the indices to put into the data frame
    Locus <- c("")
    Sample <- c("")
    # current row in the data frame
    currrow<-1
    # find which data are missing
    for(L in loci){
        for(s in samples){
            if(isMissing(object, s, L)){
                Locus[currrow]<-L
                Sample[currrow]<-s
                currrow<-currrow+1
            }
        }
    }
    # return the data frame
    return(data.frame(Locus=Locus,Sample=Sample, stringsAsFactors=FALSE))
}

# find NA distances that aren't the result of missing data
find.na.dist.not.missing<-function(object, distarray,
                                   samples=dimnames(distarray)[[2]],
                                   loci=dimnames(distarray)[[1]]){
    # get the data frames of NA distances and missing genotypes
    na.dist<-find.na.dist(distarray,samples=samples,loci=loci)
    missing.gen<-find.missing.gen(object,samples=samples,loci=loci)
    # set up vectors for data frame to contain results
    Locus<-""
    Sample1<-""
    Sample2<-""
    currrow<-1
    # for each row of na.dist, look for that locus and samples in missing.gen
    for(i in 1:length(na.dist$Locus)){
        L<-na.dist$Locus[i]
        s1<-na.dist$Sample1[i]
        s2<-na.dist$Sample2[i]
        missthislocus<-missing.gen[missing.gen$Locus==L,]
        if(identical(c(s1,s2) %in% missthislocus$Sample, c(FALSE,FALSE))){
            Locus[currrow]<-L
            Sample1[currrow]<-s1
            Sample2[currrow]<-s2
            currrow<-currrow+1
        }
    }
    # return data frame
    return(data.frame(Locus=Locus, Sample1=Sample1, Sample2=Sample2,
                      stringsAsFactors=FALSE))
}

genotypeProbs <- function(object, sample, locus, freq=NULL, gprob=NULL,
                          alleles=NULL){
    pl <- Ploidies(object,sample,locus) # get ploidy
    gen <- Genotype(object, sample, locus) # get ambiguous genotype

    # determine which method we are using
    u <- c(is.null(freq),is.null(gprob),is.null(alleles))
    ok <- FALSE
    if(identical(u, c(FALSE,TRUE,TRUE))){
        selfing <- FALSE
        ok <- TRUE
    }
    if(identical(u, c(TRUE, FALSE,FALSE))){
        selfing <- TRUE
        ok <- TRUE
    }
    if(!ok) stop("Supply freq, or gprob and alleles.")

    # Errors
    if(is.na(pl)) stop("Ploidies required.")
    if(length(gen) >  pl)
        stop(paste("There are too many alleles for ploidy:",sample,locus))

    # What to do with unambiguous genotypes
    if(length(gen)==1){
        if(pl==0 && isMissing(object, sample, locus)) pl <- 1
               # 8/30/14 - above edit is in anticipation of recodeAllopoly
        results <- list(probs=1, genotypes=matrix(rep(gen, pl), nrow=1,
                                 ncol=pl))
    }
    if(length(gen)==pl){
        results <- list(probs=1, genotypes=matrix(gen, nrow=1, ncol=pl))
    }

    # What to do with ambiguous genotypes
    if(!length(gen) %in% c(1,pl)){
        if(!selfing){
          pop <- PopNames(object)[PopInfo(object)[sample]]
          f <- freq[pop, paste(locus, gen, sep=".")] # get allele frequencies
          f <- as.vector(f/sum(f)) # normalize frequencies
          names(f) <- gen
        }
        a1 <- length(gen) # number of alleles
        a2 <- pl-a1 # number of slots to fill

        # set up function for recursive building of genotypes
        makegen <- function(gmat){
            gmat2 <- matrix(nrow=0, ncol=dim(gmat)[2]+1)
            for(i in 1:dim(gmat)[1]){
                alsofar <- gmat[i,]
                lastallele <- alsofar[length(alsofar)]
                for(j in gen[match(lastallele,gen):length(gen)]){
                    gmat2 <- rbind(gmat2, matrix(c(alsofar,j),nrow=1,
                                                 ncol=dim(gmat2)[2]))
                }
            }
            if(dim(gmat2)[2]==a2){
                return(gmat2)
            } else {
                return(makegen(gmat2))
            }
        }
        # alleles in ambiguous slots
        allelefill <- matrix(gen, nrow=a1, ncol=1)
        if(a2 > 1) allelefill <- makegen(allelefill)
        ng <- dim(allelefill)[1] # number of genotypes
        results <- list(probs=c(), genotypes=matrix())
        # genotypes (unambiguous + ambiguous alleles)
        results[["genotypes"]] <- cbind(matrix(rep(gen,each=ng),
                                               nrow=ng,ncol=a1),
                                        allelefill)
        # Probability of each genotype & allele sorting
        for(i in 1:ng){
            # sort the alleles in this genotype
            results$genotypes[i,] <- sort(results$genotypes[i,])
            if(!selfing){  ## random mating method
              # multiply allele probabilities
              fmult <- prod(f[as.character(allelefill[i,])])
              # get the polynomial coefficient
              allelecopies <- c()
              for(a in unique(allelefill[i,])){
                  allelecopies <- c(allelecopies,
                                  length(allelefill[i,][allelefill[i,]==a]))
              }
              pc <- factorial(a2)/prod(mapply(factorial, allelecopies))
              # get the probability of this genotype
              results[["probs"]][i] <- pc * fmult
            } else {  ## partial selfing method
                # get unambig genotype in terms of genlist numbers
                ugen <- match(as.integer(results$genotypes[i,]),alleles)
                results$probs[i] <- gprob[INDEXG(ugen, length(alleles), pl)]
            }
        }
        if(selfing){
            # with selfing method, need to normalize probabilities.
            results$probs <- results$probs/sum(results$probs)
        }
    }

    # Return a list containing unambiguous genotypes and their probabilities
    return(results)
}

meandistance.matrix2 <- function(object, samples=Samples(object),
                                 loci=Loci(object),
                            freq=simpleFreq(object, samples, loci), self=0,
                            all.distances=FALSE, distmetric = Bruvo.distance,
                            progress=TRUE, ...){
    # Errors
    if(!all(!is.na(Ploidies(object,samples,loci))))
        stop("Ploidies needed.")
    if(!all(!is.na(PopInfo(object)[samples])))
        stop("PopInfo needed.")
    if(!all(PopNames(object)[unique(PopInfo(object)[samples])] %in%
            row.names(freq))) stop("PopNames must match row.names of freq.")

    # calculate probabilities of every unambiguous genotype from all genotypes
    gprobs <- array(list(NA), dim=c(length(samples), length(loci)),
                    dimnames=list(samples, loci))
    if(self == 0){  # calculate genotype probabilities under random mating
        for(s in samples){
            for(L in loci){
                gprobs[[s,L]] <- genotypeProbs(object, s, L, freq=freq)
            }
        }
    } else {  # calculate genotype probabilities under partial selfing
        pops <- PopNames(object)[unique(PopInfo(object)[samples])]
        for(p in pops){
            psamples <- samples[samples %in% Samples(object, populations=p)]
            
            for(L in loci){
                m2 <- unique(Ploidies(object, psamples, L))
                if(length(m2) != 1)
                  stop("Only one ploidy allowed per pop*locus when self > 0.")
                if(is.na(m2))
                    stop("Function requires information in Ploidies slot.")
                if(m2 %% 2 != 0) stop("Ploidy must be even.")
                
                cat("Setting up genotype probabilities...",sep="\n")
                subfreq <- as.matrix(freq[p, grep(paste("^", L, "\\.", sep = ""), names(freq))])
                alleles <- gsub(paste("^", L, "\\.", sep = ""), "", colnames(subfreq))
                # let's just not do null allelles for now
                # (Hey, I'm trying to graduate.)
                alleles <- sort(as.integer(alleles[alleles != "null"]))
                na1 <- length(alleles)
                # setting stuff up from De Silva method
                ng <- na1 # number of genotypes
                for(j in 2:m2){
                    ng <- ng*(na1+j-1)/j
                }
                ag <- GENLIST(ng, na1, m2)
                temp <- RANMUL(ng, na1, ag, m2)
                rmul <- temp[[1]]
                arep <- temp[[2]]
                rm(temp)
                smat <- SELFMAT(ng, na1, ag, m2)
                m <- m2/2
                smatdiv <- (G(m-1,m+1))^2
                p1 <- subfreq[1, paste(L, alleles, sep = ".")]
                p1 <- p1/sum(p1) # normalize to sum to 1
                rvec <- rep(0,ng)
                for(g in 1:ng){
                    rvec[g] <- rmul[g]
                    for(j in 1:m2){
                        rvec[g] <- rvec[g]*p1[ag[g,j]]
                    }
                }
                id <- diag(nrow=ng)
                smatt <- smat/smatdiv
                s3 <- id - self * smatt
                s3inv <- solve(s3)
                gprob <- (1-self) * s3inv %*% rvec
                for(s in psamples){
                    gprobs[[s,L]] <- genotypeProbs(object, s, L, gprob=gprob,
                                                   alleles=alleles)
                }
            }
        }
    }

    # set up a 3D array to hold distances by sample x sample x locus
    loci.matrices<-array(dim=c(length(loci),length(samples),length(samples)),
                         dimnames=list(loci,samples,samples))

    # cycle through calculations
    for(L in loci){
      if(progress) cat(paste("Beginning locus", L, "..."), sep = "\n")
      u <- Usatnts(object)[L]
      thisind <- genIndex(gprobs, L) # index of unique genotypes in gprobs for this locus
      nUnique <- length(thisind$uniquegen) # number of unique genotypes
      udist <- matrix(NA, nrow = nUnique, ncol = nUnique) # to hold distances between unique genotypes
      for(j in 1:nUnique){ # estimate pairwise distances between all unique genotypes
        for(k in j:nUnique){
          d <- distmetric(thisind$uniquegen[[j]], thisind$uniquegen[[k]],
                          usatnt=u, missing=Missing(object), ...)
          udist[j,k] <- d
          udist[k,j] <- d
        }
      }
      for(m in samples){ # get weighted pairwise distances between samples
        for(n in samples[match(m,samples):length(samples)]){
          totdist <- 0 # total distance so far
          # cycle through all unambiguous genotypes for these two samples
          for(i in 1:length(gprobs[[m,L]][["probs"]])){
            for(j in 1:length(gprobs[[n,L]][["probs"]])){
              # lookup raw distance
              d <- udist[thisind$genindex[[m]][i], thisind$genindex[[n]][j]]
              # probability of this combination
              p <- gprobs[[m,L]][["probs"]][i] *
                gprobs[[n,L]][["probs"]][j]
              totdist <- totdist + (d*p)
            }
          }
          loci.matrices[L,m,n] <- totdist
          loci.matrices[L,n,m] <- totdist
        }
      }
      if(progress) cat(paste("Locus", L, "complete"), sep = "\n")
    }

    # calculate the mean matrix across all loci
    mean.matrix <- meandist.from.array(loci.matrices)

    # return either the mean matrix and possibly the array as well
    if(all.distances){
        return(list(DistByLoc=loci.matrices, MeanMatrix=mean.matrix))
    } else {
        return(mean.matrix)
    }
}
# expect ambiguous genotypes to be a non-zero distance from themselves!

assignClones <- function(d, samples=dimnames(d)[[1]], threshold=0){
    # set up results vector
    results <- rep(NA, length(samples))
    names(results) <- samples

    results[1] <- 1 # the first individual is clone 1
    numclones <- 1 # there is one clone so far

     # for just one sample, we're done
    if(length(samples)>1){

    # assign the rest
    for(i in 2:length(samples)){
        s <- samples[i] # i is the numerical index, s is the character index
        sam <- samples[1:(i-1)] # samples that have already been assigned
        if(all(is.na(d[s,sam]))) next # skip missing data

        if(min(d[s, sam], na.rm=TRUE) > threshold){ # new clone
            numclones <- numclones + 1
            results[i] <- numclones
        } else { # match to existing clone(s)
            msam <- sam[d[s,sam] <= threshold] # samples that match
            cl <- unique(results[msam]) # the matching clone(s)
            cl <- cl[!is.na(cl)]
            if(length(cl) == 1){ # these are all one clone
                results[i] <- cl
            } else { # multiple clones are being merged due to this individual
                results[i] <- cl[1]
                results[results %in% cl[2:length(cl)]] <- cl[1]
            }
        }
    }

    # cleanup; just give sequential numbers
    clones <- unique(results)
    clones <- clones[!is.na(clones)]
    results <- match(results, clones)
    names(results) <- samples
  }

    return(results)
}

# Function for the genotype loss/addition model of the Bruvo measure.
# Figures out all the virtual alleles, then passes equal-length genotypes
# to Bruvo.distance.
# See Fig. 1b-1c and Eqn. 6 of Bruvo et al. 2004.
Bruvo2.distance <- function(genotype1, genotype2, maxl=7, usatnt=2, missing=-9,
                            add=TRUE, loss=TRUE){
    if(length(genotype1)==length(genotype2) ||
       genotype1[1] == missing ||
       genotype2[1] == missing ||
       (!add && !loss)){
        # Let normal Bruvo.distance return NA for missing genotypes
        # or pass directly to Bruvo.distance if there are no virtual alleles,
        # or if user isn't using genome loss or genome addition model.
        d <- Bruvo.distance(genotype1, genotype2, maxl=maxl, usatnt=usatnt,
                            missing=missing)
    } else {
        if(length(genotype1) > maxl || length(genotype2) > maxl){
            # if any has two many alleles, don't bother with enumeration
            d <- NA
        } else {
            # Assign the long and short genotypes
            if(length(genotype1) >= length(genotype2)) {
                genotypeL <- genotype1
                genotypeS <- genotype2
            } else {
                genotypeL <- genotype2
                genotypeS <- genotype1
            }
            # get the difference in length
            diff <- length(genotypeL) - length(genotypeS)

            # get allele combinations to try
            alcomb <- function(genotype){
                lg <- length(genotype)
                mat <- matrix(nrow=diff, ncol=lg^diff)
                for(i in 1:diff){
                    mat[i,] <- rep(genotype, times=lg^(i-1), each=lg^(diff-i))
                }
                if(diff > 1){
                  mat <- apply(mat, 2, sort)
                }
                return(mat)
            }

            if(add){
                alleleadd <- alcomb(genotypeS)
            }
            if(loss){
                alleleloss <- alcomb(genotypeL)
            }
            # set up vectors to hold results
            distadd <- rep(NA, times=length(genotypeS)^diff)
            distloss <- rep(NA, times=length(genotypeL)^diff)
            # loop through calculations
            if(add){
            for(i in 1:length(distadd)){
                if(length(unique(alleleadd[,i])) > 1){
                    # check to see if this calculation has already been done
                    for(j in 1:(i-1)){
                        if(identical(alleleadd[,i], alleleadd[,j])){
                            distadd[i] <- distadd[j]
                            break
                        }
                    }
                }
                if(is.na(distadd[i])){
                  # do the calculation is this allele combo is new
                distadd[i] <- Bruvo.distance(genotypeL,
                                             c(genotypeS, alleleadd[,i]),
                                             maxl=maxl, usatnt=usatnt,
                                             missing=missing)}
            }}
            if(loss){
            for(i in 1:length(distloss)){
                if(length(unique(alleleloss[,i])) > 1){
                    for(j in 1:(i-1)){
                        if(identical(alleleloss[,i], alleleloss[,j])){
                            distloss[i] <- distloss[j]
                            break
                        }
                    }
                }
                if(is.na(distloss[i])){
                distloss[i] <- Bruvo.distance(genotypeL,
                                             c(genotypeS, alleleloss[,i]),
                                             maxl=maxl, usatnt=usatnt,
                                             missing=missing)}
            }}
            # get average
            d <- mean(c(mean(distadd), mean(distloss)), na.rm=TRUE)
        }
    }
    return(d)
}

Try the polysat package in your browser

Any scripts or data that you put into this service are public.

polysat documentation built on June 8, 2018, 1:04 a.m.