sgsR: Spatial genetic structure analyses in R

Documented in sgs

###############################
#### Main SGS analysis function
###############################

## This is the meat of the package, the main function to do the main analysis.
# It performs several steps along the way (finding distance intervals, calculating SGS,
# conducting permutation tests). Calls several C++ functions to speed up the process.

### Make it clear what checkDi's is looking for and how to interpret the cautionary values.

# Returns an sgsOut object that holds summary information about the results of the analysis

#' Run spatial genetic struture analysis
#'
#' \code{sgs} is the primary function for analysis of spatial genetic structure in the sgsR package. This function takes the data structure generated by \code{\link{createSgsObj}} as its primary argument, runs an analysis of spatial genetic structure (see Details below), and outputs the results.
#'
#' @param sgsObj A data structure created by \code{\link{createSgsObj}}
#' @param distance_intervals either a vector with upper limits of distance intervals, or a negative integer with number of distance intervals to generate with approximately equal pairwise comparisons
#' @param nperm Number of permutations of spatial locations among individuals to perform
#' @param dist_mat A user-supplied distance matrix (Optional)
#'
#' @return
#'
#' A list of the class 'sgsOut' that contains the following elements, which can be accessed with the $ operator (see examples):
#' \itemize{
#' \item sgsObj - the original sgsObj data structure used in the sgs analysis
#' \item di - summary of distance intervals used in analysis (see Details for more information)
#' \item fij_obs - observed Fij values in each distance interval for each locus, and averaged across loci
#' \item sp_obs - observed values of the Sp statistic in each distance interval for each locus, and averaged across loci
#' \item slope_obs - observed values of the slope of the regression of pairwise Fij on log distance for each distance interval and locus, and also averaged across loci
#' \item fij_perm_avg - average value of fij for each locus and distance interval after permutation of spatial locations among individuals
#' \item fij_perm_025 - lower 2.5\% quantile of fij for each locus and distance interval after permutation of spatial locations among individuals
#' \item fij_perm_975 - upper 97.5\% quantile of fij for each locus and distance interval after permutation of spatial locations among individuals
#'
#' }
#' @export
#'
#'
#' @section Details:
#'
#' \emph{\strong{di}}
#'
#' A summary of information about each distance interval. Includes:
#' \itemize{
#' \item Max distance - maximum pairwise distance within distance interval.
#' \item Average distance - average pairwise distance within distance interval.
#' \item Number of pairs - number of pairwise comparisons within distance interval.
#' \item \% participation - the proportion of all individuals represented at least once in each distance interval
#' \item CV participation - the coefficient of variation (CV) of the number of times each individual is represented in each distance interval
#' }
#'
#' Values are checked against the rules of thumb presented by in the SPAGeDi manual and Vekemans and Hardy 2004 that the number of pairwise comparisons should be at least 100, % participation > 0.5, and CV participation <= 1 for each distance interval. Cautionary messages are printed if the choice of distance intervals violates one of these guidelines.
#'
#'
#' @seealso
#' \code{\link{createSgsObj}}, \code{\link{summary.sgsOut}}, \code{\link{plot.sgsOut}}
#'
#' @examples
#'
#' ## Simulate genetic data
#' Nind = 100 # Number of individuals
#' Nloci = 5 # Number of loci
#' Nallele = 10 # Number of alleles per loci
#'
#' ## Set up data frame and generate random spatial locations
#' dat <- data.frame(id = 0:(Nind - 1))
#' dat$x = runif(Nind, 0, 100)
#' dat$y = runif(Nind, 0, 100)
#'
#' ## Simulate Random genetic data and assign loci names
#' for (loci in 1:Nloci) {
#'  loci_name_a = paste("Loc", loci, "_A", sep = "")
#'  loci_name_b = paste("Loc", loci, "_B", sep = "")
#'  dat[loci_name_a] <- sample.int(Nallele, Nind, replace = TRUE)
#'  dat[loci_name_b] <- sample.int(Nallele, Nind, replace = TRUE)
#'}
#'
#' ## Convert to sgsObj
#' sgsObj = createSgsObj(sample_ids = dat$id,
#'                      genotype_data = dat[, 4:(Nloci*2 + 3)],
#'                      ploidy = 2,
#'                      x_coords = dat$x,
#'                      y_coords = dat$y)
#'summary(sgsObj)
#'
#'
sgs <- function(sgsObj,
                distance_intervals,
                nperm = 999,
                dist_mat = NULL){

  # Check to make sure data is sgs object
  if(!is(sgsObj, "sgsObj")){
    stop("Input data must be of class sgsObj... \n")
  }

  ## Set up output data structure
  sgsOut <- structure(list(), class = "sgsOut")

  sgsOut$sgsObj <- sgsObj

  #####
  ## DISTANCE INTERVALS
  ####
  if(!is.null(dist_mat)){
    cat("Using user-supplied distance matrix... \n")

    # Error checking

    if(!is.matrix(dist_mat)){
     stop("User-supplied distance matrix must be a matrix...\n")
    }

    if(any(dist_mat < 0)){
      stop("User-supplied distance matrix must contain only positive values... \n")
    }

    if(dim(dist_mat)[1] != sgsObj$Nind | dim(dist_mat)[2] != sgsObj$Nind){
      stop("The number of rows and columns of user-supplied distance matrix must equal number of individuals... \n")
    }

    Mdij = dist_mat

  } else {
    ## Calculate pairwise distances
      Mdij = calcPairwiseDist(sgsObj$x, sgsObj$y, sgsObj$Nind ) ## Distance matrix - C++ func
  }

      # If equalized distance interval option is chosen (by setting option to negative)
      # Find new distance intervals with approximately equal number of pairwise comparisons
    if(distance_intervals[1] < 0){
      cat("Finding --", -distance_intervals, "-- distance intervals with approximately equal pairwise comparisons...\n")
      distance_intervals =  findEqualDIs(Mdij, distance_intervals, sgsObj$Nind) # C++ func
    }

      ## Add a new max distance interval if last interval isn't already large enough
      if(max(Mdij) > max(distance_intervals)){
        distance_intervals <- c(distance_intervals, max(Mdij) * 1.001)
        cat("----------------------------\n")
        cat("Adding an aditional distance interval --", max(Mdij) * 1.001,
            "-- to encompass all pairwise distances.. \n\n")
      }

      ## Make sure that first distance interval include all pairs of neighbors. As suggested by
      ## Vekemans and Hardy 2004 for calculating Sp statistics

        # Make distance matrix symmetric
        dist_mat <- Mdij + t(Mdij)
        diag(dist_mat) <- NA # Set diagonal to NA

        nn_dist <- apply(dist_mat, 1, min, na.rm = TRUE)

        if(any(nn_dist > distance_intervals[1])){
          cat("----------------------------\n")
          cat("Caution - first distance interval does not include all pairs of neighbors, as suggested by Vekemans and Hardy 2004 for calculating Sp statistic. \n")
          cat("First distance interval would have to be at least:", max(nn_dist), "to include all pairs of neighbors. \n\n")
        }


       ### Find distance intervals
      Mcij = findDIs(Mdij, distance_intervals, sgsObj$Nind) # C++ func

    # Calculate summary of distance intervals
      DIsummary = summarizeDIs(Mdij, Mcij, distance_intervals, Nind = sgsObj$Nind) # C++ func
      rownames(DIsummary) <- c("Distance class", "Max distance", "Average distance",
                               "Number of pairs", "% participation", "CV participation")

      sgsOut$di <- round(DIsummary, 2) ## Save summary information about distance intervals

      ## Add column names to di output
      colnames(sgsOut$di) <- paste("D", 1:length(distance_intervals), sep = "")

      ## Check to make sure DIs follow rules of thumbs in Spagedi manual
      checkDIs(sgsOut$di)



  #####
  ## REFERENCE ALLELE FREQUENCY
  ####


    ## reference genotypes as a list
    ##
      ## Could probably speed up a bit by converting loop to C++
    ref_gen <- list()
    row = 1
    for(locus in seq(1, (sgsObj$Nloci*2), by = 2)){
      ref_gen[[row]] = calcAlleleFreqPop(sgsObj$gen_data_int[, locus],
                                         sgsObj$gen_data_int[ , locus + 1],
                                         Nallele = sgsObj$Nallele[row],
                                         Ngenecopies = sgsObj$Ngenecopies[row])
      row = row + 1
    }

    ## Check to make sure allele frequencies at each locus sum to one
    if(any(sapply(ref_gen, sum) != 1)){
      warning("Reference allele frequencies do not sum to 1... \n")
    }


  #####
  ## PAIRWISE RELATEDNESS COEFFICIENT
  ####

  ## Calculate pairwise Fij across entire population
  # This is a huge C++ function that perfroms the brunt of the SGSG analysis

  ## Returns a matrix with information on Fij estimates, permutation results, for each loci,
    # averaged across loci for each distance class
    ## number of columns is equal to length of distance_intervals

    ## Sets of data - each set is Nloci + 1 rows long
      # 1) Fij - observed by distance class
      # 2) Fij - Permutation average by distance class
      # 3) Fij - 2.5% permutation quantile by distance class
      # 4) Fij - 97.5% permutation quantile by distance class
      # 5) Sp - observed values - only first column
      # 6) Slope - observed values - only first clumn

  fijsummary = calcFijPopCpp(genotype_data = as.matrix(sgsObj$gen_data_int),
                       distance_intervals = distance_intervals,
                       Mdij = Mdij,
                       Mcij = Mcij,
                       ref_gen = ref_gen,
                       Nloci = sgsObj$Nloci,
                       Nallele = sgsObj$Nallele,
                       Nind = sgsObj$Nind,
                       Ngenecopies = sgsObj$Ngenecopies,
                       Nperm = nperm,
                       x_coord = sgsObj$x,
                       y_coord = sgsObj$y)



    rownames(fijsummary) <- c(sgsObj$loci_names, "ALL LOCI",
                        paste(sgsObj$loci_names, "_perm_avg", sep = ""), "ALL_LOCI_perm_avg",
                        paste(sgsObj$loci_names, "_perm_025", sep = ""), "ALL_LOCI_perm_025",
                        paste(sgsObj$loci_names, "_perm_975", sep = ""), "ALL_LOCI_perm_975",
                        paste(sgsObj$loci_names, "_sp", sep = ""), "ALL_LOCI_sp",
                        paste(sgsObj$loci_names, "_slope", sep = ""), "ALL_LOCI_slope")

    ## Add column names to fijsummary
    colnames(fijsummary) <- paste("D", 1:length(distance_intervals), sep = "")

    ## Subset out specific sections
    sgsOut$fij_obs <- fijsummary[1:(sgsObj$Nloci + 1), ]

    sgsOut$sp_obs <- fijsummary[(sgsObj$Nloci*4 + 5):(sgsObj$Nloci*5 + 5), 1, drop = FALSE ]
    colnames(sgsOut$sp_obs) <- c("Observed")
    sgsOut$slope_obs <- fijsummary[(sgsObj$Nloci*5 + 6):(sgsObj$Nloci*6 + 6), 1, drop = FALSE ]
    colnames(sgsOut$slope_obs) <- c("Observed")

    if(nperm > 0){ # If permutation selected, add it to the output
      sgsOut$fij_perm_avg <- fijsummary[(sgsObj$Nloci + 2):(sgsObj$Nloci*2 + 2), ]
      sgsOut$fij_perm_025 <- fijsummary[(sgsObj$Nloci*2 + 3):(sgsObj$Nloci*3 + 3), ]
      sgsOut$fij_perm_975<- fijsummary[(sgsObj$Nloci*3 + 4):(sgsObj$Nloci*4 + 4), ]
    }


  ## Output a summary table with distance classes, Fij estimates, permutation results, etc
   return(sgsOut)

}