R/gd.smouse.r

Defines functions gd.smouse

Documented in gd.smouse

#' Individual genetic distance calculation based on Smouse and Peakall 1999
#' 
#' Calculate pairwise genetic distances between all individuals using the
#' individual genetic distance measure of Smouse and Peakall (1999). This
#' function should produce the same results as the individual genetic distances
#' calculated using GenAlEx and choosing the interpolate missing data option.
#' Note that depending on your computers capabilities, you may run into memory
#' errors when this function is used on datasets with large numbers of
#' individuals (>1000). Additionally, the time for this function to run can be
#' quite lengthy. Using a PC with a 3.5 GHz processor to calculate pairwise
#' genetic distances between individuals with 36 diploid loci it took 2 seconds
#' for 100 individuals, 51 seconds for 500 individuals, 200 seconds for 1000
#' individuals, 836 seconds (~14 minutes) for 2000 individuals, and 1793
#' seconds (~30 minutes) for 3000 individuals.  Please note that for each of
#' your population groupings there must be at least one individual with
#' genotyped alleles for each locus (it doesn't have to be the same individual
#' for every locus). If a population doesn't meet this requirement, it will be
#' dropped from the analysis and a warning message will be generated.
#' 
#' 
#' @param population this is the \code{\link{genind}} object that the analysis
#' will be based on.
#' @param verbose information on progress. For large data sets this could be
#' informative as it allows the user to estimate the amount of time remaining
#' until the function is complete. The counter shows the number of the
#' population for which genetic distances are currently being determined.
#' @return Returns pairwise individual genetic distances for each individual
#' within a population.
#' @author Aaron Adamack, [email protected]@canberra.edu.au
#' @seealso \code{\link{popgenreport}}
#' @references Smouse PE, Peakall R. 1999. Spatial autocorrelation analysis of
#' individual multiallele and multilocus genetic structure. Heredity 82:
#' 561-573.
#' @examples
#' 
#' #not run:
#' #data(bilby)
#' #popgenreport(bilby, mk.gd.smouse = TRUE, mk.pdf=FALSE)
#' #to get a pdf output you need to have a running Latex version installed on your system.
#' #popgenreport(bilby, mk.gd.smouse = TRUE, mk.pdf=TRUE)
#' @export
gd.smouse <- function(population, verbose=TRUE){
  # check to see if the passed data is of the right type
  if (class(population) != "genind") {
    stop("You did not provide a valid genind object! Script stopped!")
  }
  
  # get initial estimates of number of individuals and the maximnum number of loci
  initmaxind<-(dim(population@tab))[1]
  maxloci<-length(locNames(population))
  
  # create a local matrix with the genomes so we can make changes if necessary and not affect other parts of script
  genomes<-population@tab

  # determine the initial population details
  indpop<-population@pop[1:initmaxind,drop=TRUE] # gets a list of the individual's population
  initnumpops<-length(attributes(indpop)$levels) # number of populations
  poplist<-attributes(indpop)$levels

  # create an empty list which will contain all pops 
  indsrmv<-c()
  popsgone<-c()
  
  # need to find the intervals for each loci...
  cs <- cumsum(population@loc.n.all) # this determines the end of each loci frame for each ind's genotype
  cs.l <- c(1,cs[-length(cs)]+1) # this determine the beginning of each loci frame for each ind's genotype

  # check to see that the populations have more than one individual (probably need to make it more 
  # sophisticated later on e.g. check to see that at least one individual has loci values)
  for (i in 1:initnumpops){
    indcnt<-length(indpop[which(indpop==poplist[i])])
    if (indcnt<2){
      indrmv<-which(indpop==poplist[i])
      indsrmv<-c(indsrmv, indrmv)
      popsgone<-c(popsgone,i)
    }
    if(indcnt>=2){
      popi<-which(indpop==poplist[i])
      genos<-population@tab[popi,]
      cntlow<-0
      for (j in 1:maxloci){
        totalleles<-sum(genos[,cs.l[j]:cs[j]],na.rm=TRUE)
        if(totalleles<1) cntlow<-cntlow+1
      }
      if (cntlow>0) {
        indsrmv<-c(indsrmv, popi)
        popsgone<-c(popsgone, i)
      }
    }
  }

  if(length(popsgone)>0){
    message("WARNING!!! The following populations were dropped due to insufficient numbers of individuals")
    for (i in 1:length(popsgone)){
        message(poplist[popsgone[[i]]])
    }
  }

  # remove individuals if the population only has one individual
  if(length(indsrmv)>0){
    indsrmv<-indsrmv*(-1)
    indpop<-factor(indpop[indsrmv])
    genomes<-genomes[indsrmv,]
    poplist<-attributes(indpop)$levels
  }

  # what is the total number of individuals
  maxind<-(dim(genomes))[1]

  # calculate the number of pops after cleaning list
  numpops<-length(attributes(indpop)$levels) # number of populations

  # matrix to put smoused distances in
  smoused<-matrix(NA,nrow=maxind,ncol=maxind)



  for (i in 1:numpops){   #numpops this is looping over pops
    for (j in i:numpops){ #numpops this is looping over pops
      if(verbose) cat("\r","Comparing population ",levels(population@pop)[i]," with population ",levels(population@pop)[j])
      pop1<-which(indpop==poplist[i]) # gets a list of individuals from pop1
      pop2<-which(indpop==poplist[j]) # gets a list of individuals from pop2
      smoused.loci<-array(NA,c(length(pop2),length(pop1),maxloci))
    
      # this if statement calculates genetic distances between individuals in same population, only half the 
      # calculations need to be done as the other half are repeats (e.g. calculate distance between a and b and
      # then calculate the distance between b and a)
      if (i==j){
        if (length(pop1)>1 && length(pop2)>1){
          for (l in 1:(length(pop1)-1)){    # this is looping over columns, skip last column because no comparison to be made
            for (k in (l+1):length(pop2)){  # this is looping over rows, skip l=k and assign value later to save calcs 
              # extract the genotypes of the two individuals
              i1 <- genomes[pop1[l],]
              i2 <- genomes[pop2[k],]          
              # calculate genetic distances between individuals using Smouse and Peakall 1999
              pairdist<-(i1-i2)^2
              sdists<-sapply(1:maxloci, function(x,cs,cs.l,pairdist) sum(pairdist[cs.l[x]:cs[x]])*0.5  , cs, cs.l,pairdist)
              for (m in 1:maxloci){
                if(is.na(sdists[m])) smoused.loci[k,l,m]<-(-99) else smoused.loci[k,l,m]<-sdists[m]    
              }    
            }
          }
        }
      } 
      # this if statement calculates genetic distances between individuals in different populations. All 
      # calculations have to be done here because there aren't the repeated calculations in the above if
      # statement (e.g. no a = b and then b = a)
      if (i!=j){
        for (k in 1:length(pop1)){
          for (l in 1:length(pop2)){
            # extract the genotypes of the two individuals
            i1 <- genomes[pop1[k],]
            i2 <- genomes[pop2[l],]          
            # calculate genetic distances between individuals using Smouse and Peakall 1999
            pairdist<-(i1-i2)^2
            sdists<-sapply(1:maxloci, function(x,cs,cs.l,pairdist) sum(pairdist[cs.l[x]:cs[x]])*0.5  , cs, cs.l,pairdist)
            for (m in 1:maxloci){
              if(is.na(sdists[m])) smoused.loci[l,k,m]<-(-99) else smoused.loci[l,k,m]<-sdists[m]    
            }    
          }
        }
      }
      # this step ID's all array elements for which we have a distance and allows us to mask other elements 
      # e.g. (the upper triangle elements) so that we can solve missing values
      check<-smoused.loci[,,,drop=FALSE]>=0  
      # now we come up with a value for missing genetic distances between individuals in same population
      if (i==j){
        if (length(pop1)>1 && length(pop2)>1){
          for(k in 1:maxloci){ 
            set<-as.matrix(smoused.loci[,,k]) # get the matrix for a particular loci
            set[upper.tri(set)]<-NA           # block the upper tri angle
            frame<-check[,,k,drop=FALSE]
            locimean<-sum(set[frame],na.rm=TRUE)/sum(frame[lower.tri(frame)],na.rm=TRUE)  
            for(m in 1:(length(pop1)-1)){ # loop over columns
              for (l in (m+1):length(pop2)){ # loop over rows
                if (smoused.loci[l,m,k]<0) smoused.loci[l,m,k]=locimean
              }
            }
          }
        }
        for (k in 1:length(pop1)){
          for (l in k:length(pop2)){
            if(k==l) smoused[pop2[l],pop1[k]]<-0
            if(k!=l){
              smoused[pop2[l],pop1[k]]<-sum(smoused.loci[l,k,])
              smoused[pop1[k],pop2[l]]<-sum(smoused.loci[l,k,])
            }
          }
        }    
      }
      # come up with a value for missing genetic distances between individuals in different populations
      if(i!=j){
        for (k in 1:maxloci){
          set<-as.matrix(smoused.loci[,,k])
          subset<-set[1:length(pop2),1:length(pop1)]
          frame<-as.matrix(check[,,k])
          locimean<-sum(subset[frame[,]],na.rm=TRUE)/sum(frame,na.rm=TRUE)      
          for(l in 1:length(pop2)){
            for (m in 1:length(pop1)){
              if (smoused.loci[l,m,k]<0) smoused.loci[l,m,k]=locimean
            }
          }  
        }
        for (k in 1:length(pop2)){
          for (l in 1:length(pop1)){
            smoused[pop2[k],pop1[l]]<-sum(smoused.loci[k,l,])
            smoused[pop1[l],pop2[k]]<-sum(smoused.loci[k,l,])
          }
        }
      }
    }
  }

  if(length(indsrmv)>0){
    # put names on the rows and columns on d.fast (!!!!make sure to use only non-removed individuals)
    colnames(smoused)<-indNames(population)[indsrmv]
    rownames(smoused)<-indNames(population)[indsrmv]
    # calculate geographical distance
    #geodist<-as.matrix(dist([email protected]$utm[indsrmv,]))/1000
  } else if(is.null(indsrmv)){
    colnames(smoused)<-indNames(population)
    rownames(smoused)<-indNames(population)
  }

  # force upper triangle to NA
  smoused[upper.tri(smoused,diag=FALSE)]<-NA
  smoused<-as.dist(smoused)
  return(smoused)
}                                                         

Try the PopGenReport package in your browser

Any scripts or data that you put into this service are public.

PopGenReport documentation built on May 29, 2017, 9:09 p.m.