# R/OLD/spatial_sumstats_functions_alvaradoserrano.R
#
# Defines functions: mantel_SSS, monmonier_SSS, semivar_SSS, moran_SSS, sPCA.dist, stats.AFS, tryCatch.W.E

#script to calculate AFS statistics based on arlequin result files from SPLATCHE
#Diego F. Alvarado-S.
#March 4, 2014
#Modified: Aug 25, 2014

#########################################################################################################################
#function to catch errors
tryCatch.W.E <- function(expr){
  # Evaluate `expr`, capturing both errors and warnings without aborting.
  # Returns a list with:
  #   $value   - the result of `expr`, or the error condition object if it failed
  #   $warning - the last warning condition raised (NULL if none)
  caught.warning <- NULL
  warning.handler <- function(w){
    caught.warning <<- w          # stash the warning for the caller
    invokeRestart("muffleWarning") # keep evaluation going silently
  }
  result <- withCallingHandlers(
    tryCatch(expr, error = function(e) e),
    warning = warning.handler
  )
  list(value = result, warning = caught.warning)
}
#########################################################################################################################



#########################################################################################################################
stats.AFS = function(data, sstype, nsamples, pops.xy){
  
  #function to calculate AFS summary statisitics
  # data: table of SNP sequences generated with my python script called arfile_SNPs_extract.py
  #       column 1 = population ID (data$st), column 2 = individual label, columns 3+ = SNP states (0/1)
  # sstype: type of summary statistics wanted (either 'nss' --> non-spatial ss or 'sss' --> spatial ss)
  #         NOTE(review): any other value falls off the end and returns NULL invisibly -- confirm intended
  # nsamples: number of individual seqs per population (assumes same number of seqs in all populations)
  # pops.xy: coordinates of population samples; col 1 = pop ID, cols 2-3 = y and x coordinates
  # Returns a named numeric vector of AFS summary statistics.
  # NOTE(review): relies on gdata::unmatrix being attached by the caller.
  #suppressMessages(require(gdata))
  
  #calculating AFS:    
  # Folded allele-frequency spectrum, one row per population, one column per folded frequency class.
  # NOTE(review): ceiling(nsamples%/%2) is a no-op ceiling (%/% already returns an integer);
  # for odd nsamples this floors rather than rounds up, despite the comment below -- confirm intended.
  AFS.fol = matrix(rep(NA, (ceiling(nsamples%/%2)+1)*nrow(pops.xy)), nrow=nrow(pops.xy))      #the ceiling is required to deal with uneven numbers of samples
  #rownames(AFS.fol) = seq(nrow(pops.xy))
  rownames(AFS.fol) = as.character(unique(data$st))
  for (I in 1:nrow(AFS.fol)){
    #!# Slight change here, pop names aren't 1-25
    #freqs = colSums(data[which(data[,1]==I),3:ncol(data)]) / nsamples
    # per-SNP derived allele frequency within the I-th population (by its actual ID, not 1..N)
    freqs = colSums(data[which(data[,1]==unique(data[,1])[I]),3:ncol(data)]) / nsamples
    freq.counts = table(freqs)
    head.freqs = as.numeric(names(freq.counts))
    # all possible frequency classes 0, 1/nsamples, ..., 1 used as column labels
    head.AFS = c(0,hist(seq(1:nsamples)/nsamples, breaks=nsamples, plot=F)$breaks)
    # unfolded AFS: count of SNPs in each frequency class (0 where class unobserved)
    AFS.unfol = rep(0,nsamples+1)
    names(AFS.unfol) = head.AFS
    for (J in 1:length(head.freqs)){
      AFS.unfol[as.character(head.freqs[J])] = as.numeric(freq.counts[J])
    }
    colnames(AFS.fol) = head.AFS[1:((nsamples/2)+1)]
    # fold the spectrum: class L pairs with its complement (1 - L)
    for (L in 1:(ncol(AFS.fol)-1)){
      AFS.fol[I,L] = AFS.unfol[L] + AFS.unfol[length(AFS.unfol)-L]
    }
    # the middle (or last) class has no complement and is carried over as-is
    AFS.fol[I,ncol(AFS.fol)] = AFS.unfol[length(AFS.unfol)]
    #rownames(AFS.fol)[I] = paste('P',I,sep='')
  }
  
  #univariate AFS stats:
  # per-population mean and SD of the AFS weighted by frequency-class index
  AFS.unistats = matrix(rep(NA,(nrow(AFS.fol)*2)),ncol=2)
  colnames(AFS.unistats) = c('W.Mean', 'W.SD')
  rownames(AFS.unistats) = rownames(AFS.fol)
  for (x in 1:nrow(AFS.unistats)){
    AFS.unistats[x,1] = mean(AFS.fol[x,] * seq(ncol(AFS.fol)))
    AFS.unistats[x,2] = sd(AFS.fol[x,] * seq(ncol(AFS.fol)))
  }
  # flatten to a named vector (gdata::unmatrix), stats interleaved per population
  AFS.uni = unmatrix(t(AFS.unistats))
  
  #pairwise AFS stats:
  # weighted mean/SD of the absolute AFS difference for every pair of populations
  #comp = combn(nrow(pops.xy),2)
  comp = combn(pops.xy$pop,2)
  AFS.pairstats = matrix(rep(NA,(2*ncol(comp))),ncol=2)
  colnames(AFS.pairstats) = c('W.Mean.Diff', 'W.SD.Diff')
  rownames(AFS.pairstats) = seq(nrow(AFS.pairstats))
  AFS.pairdiffs = NULL
  for (z in 1:ncol(comp)){
    #diff = abs(AFS.fol[paste('P',comp[1,z],sep=''),] - AFS.fol[paste('P',comp[2,z],sep=''),])
    diff = abs(AFS.fol[comp[1,z],] - AFS.fol[comp[2,z],])
    AFS.pairdiffs = rbind(AFS.pairdiffs, diff)
    AFS.pairstats[z,1] = mean(diff * seq(ncol(AFS.fol)))
    AFS.pairstats[z,2] = sd(diff * seq(ncol(AFS.fol)))
    rownames(AFS.pairstats)[z] = paste(rownames(AFS.fol)[comp[1,z]],rownames(AFS.fol)[comp[2,z]],sep='_')
  }
  AFS.pair = unmatrix(t(AFS.pairstats))
  
  AFS.stats = c(AFS.uni,AFS.pair)
  # non-spatial summary statistics requested: stop here
  if (sstype == 'nss'){
    return(AFS.stats)
  }
  
  
  #Modification of Smouse and Peakall 2009 spatial autocorrelation method
  # Build a symmetric pop-by-pop "distance" matrix from the summed pairwise AFS differences
  rownames(AFS.pairdiffs) = rownames(AFS.pairstats)
  Matrix = matrix(rep(NA,(nrow(pops.xy)^2)),ncol=nrow(pops.xy))
  rownames(Matrix) = rownames(AFS.unistats); colnames(Matrix) = rownames(AFS.unistats)
  Matrix[upper.tri(Matrix, diag=T)] = rep(0,(nrow(AFS.pairdiffs)+nrow(pops.xy)))
  Matrix[lower.tri(Matrix)] = rowSums(AFS.pairdiffs)
  Matrix = Matrix + t(Matrix)    # mirror the lower triangle; diagonal stays 0
  
  # covariance matrix via double-centering of the squared-distance analogue
  Cov.mat = Matrix - Matrix      # zero matrix with the same dimnames
  for (c in 1:ncol(Matrix)){
    for (r in 1:nrow(Matrix)){
      #using formula (13) from Smouse and Peakall 2009
      Cov.mat[r,c] = (-Matrix[r,c] + (sum(Matrix[,c]) + sum(Matrix[r,]))/length(Matrix)
                      - sum(Matrix)/(length(Matrix)^2))
    }}
  
  #!# CHANGE HERE -- pops.xy first col is pop ID, y and x locations are #2 and #3
  #geo.matrix = as.matrix(dist(cbind(as.numeric(pops.xy[,1]),as.numeric(pops.xy[,2])),diag=T,upper=T))
  geo.matrix = as.matrix(dist(cbind(as.numeric(pops.xy[,2]),as.numeric(pops.xy[,3])),diag=T,upper=T))
  #geo.matrix = as.matrix(dist(pops.xy[,2:3], diag= T, upper = T))   #!# This would be equivalent, somewhat easier to read

  # distinct (integer-truncated) geographic distance lags, excluding 0
  geo.breaks = unique(as.integer(as.numeric(attributes(table(geo.matrix))$dimnames$geo.matrix)[-1]))
  r.coeffs = rep(0,length(geo.breaks))
  names(r.coeffs) = geo.breaks
  # one autocorrelation coefficient per distance lag
  for (z in 1:length(geo.breaks)){
    X.matrix = Matrix - Matrix   # zero weight matrix
    for (c in 1:ncol(geo.matrix)){
      for (r in 1:nrow(geo.matrix)){
        #following analogous weighting scheme Fig.2 from Smouse and Peakall 2009
        # weight 1 for pairs whose truncated distance equals the current lag
        if (as.integer(geo.matrix[r,c])==geo.breaks[z]){
          X.matrix[r,c] = 1
        }}}
    diag(X.matrix) = rowSums(X.matrix)
    X_C.matrix = X.matrix * Cov.mat
    #using formula (13) from Smouse and Peakall 2009
    r.coeffs[z] = (sum(X_C.matrix) - sum(diag(X_C.matrix))) / sum(diag(X_C.matrix))
  }
  
  #Spatial correlation stats:
  r.mean = mean(r.coeffs)	#standard deviation correlation
  r.SD = sd(r.coeffs)	 
  r.max = max(r.coeffs)		#max abs correlation 
  r.lagmax = min(as.numeric(which(r.coeffs==max(r.coeffs))))	#spatial lag at which max abs correlation occur
  
  AFS.spatial.stats = c(r.mean, r.SD, r.max, r.lagmax)
  names(AFS.spatial.stats) = c('r.mean', 'r.SD', 'r.max', 'r.lagmax')
  
  if (sstype == 'sss'){
    return(AFS.spatial.stats)
  }
}
#########################################################################################################################





#########################################################################################################################
sPCA.dist = function(data, pops.xy, nsamples, cpos=2, cneg=2, plot=T){
  
  #function to calculate the eculidean distance in sPCA (Jombart et al. 2008) space between individual SNP sequences
  # data: table of SNP sequences generated with my python script called XXX
  #       column 1 = population ID, column 2 = individual label, columns 3+ = SNP states
  # pops.xy: coordinates of population coordinates; col 1 = pop ID (format 'pop-<n>'), cols 2-3 = y and x
  # nsamples: number of individual seqs per population (assumes same number of seqs in all populations)
  # cpos: number of sPCA global components to retain
  # cneg: number of sPCA local components to retain
  # plot: defines whether sPCA plot for global and local (if significant) axes should be plotted
  # Returns a named vector: within-pop mean/SD sPCA distances, between-pop mean/SD
  # distances for every pop pair, plus the global and local rtest p-values.
  # NOTE(review): requires adegenet (genind, spca, global.rtest, local.rtest) to be attached by the caller.
  
  #getting the data into a gneind format 
  data2 = data[,3:ncol(data)]
  ###used an alternative way to do names because our data set was incompatible with this naming system
  names = rep(NA,nrow(data2))
  #for (n in 1:nrow(data2)){
  #  if (n/2 > n%/%2){
  #    names[n] = paste(data[n,2],'a',sep='')
  #  } else {
  #    names[n] = paste(data[n,2],'b',sep='')
  #  }}
  # individuals are simply numbered sequentially
  names <- seq(1,nrow(data2))
  row.names(data2) = names
  pops1 = data[,1]
  # haploid presence/absence genind object for adegenet
  DATA = genind(data2,pop=pops1,ploidy=1, type="PA")
  
  #!# CHANGE HERE: adding pop column to XY object for use later in the b/w and w/i distance calcs
  # one row of coordinates per individual (population coordinates repeated nsamples times)
  XY = data.frame(pop=rep(as.character(unique(data[,1])),nsamples),y=rep(pops.xy[,2],nsamples), x= rep(pops.xy[,3],nsamples))
  #XY = XY[order(as.numeric(as.character(XY$pop))),]
  # sort rows by the numeric part of the pop ID ('pop-<n>' --> n)
  # NOTE(review): assumes IDs split on '-' into the literal token 'pop' plus a number -- confirm format
  tmpo <- unlist(strsplit(as.character(XY$pop), "-"))
  XY = XY[order(as.numeric(tmpo[tmpo != "pop"])),]
  #for (i in 1:nrow(pops.xy)){
  #  row = cbind(as.numeric(pops.xy[i,2]),as.numeric(pops.xy[i,3]))
  #  for (j in 1:nsamples){
  #    XY = rbind(XY,row)
  #  }
  #}

  # preliminary sPCA run used only for the significance tests below
  mySpca.prelim = spca(obj = DATA, xy = XY[,-1], type = 7, a=2, dmin=0.1, scannf = FALSE, plot.nb = FALSE)
  mySpca.prelim
  #barplot(mySpca.prelim$eig, main = "Eigenvalues of sPCA")
  
  #testing significance of global and local components
  myGtest <- global.rtest(DATA$tab, mySpca.prelim$lw, nperm = 99)
  if (myGtest$pvalue > 0.05){
    cpos = 2        #NOTE here that even if global structures are not significant, at least the first two components are kept!!!
  }
  #cat(paste('\nGlobal test p-value:', myGtest$pvalue, ' --> ', cpos, 'global components retained\n'))
  #plot(myGtest)
  
  myLtest <- local.rtest(DATA$tab, mySpca.prelim$lw, nperm = 99)
  if (myLtest$pvalue > 0.05){
    # local (negative) components are dropped entirely when not significant
    cneg = 0
  }
  #cat(paste('Local test p-value:', myLtest$pvalue, ' --> ', cneg, 'local components retained\n'))
  #plot(myLtest)
  
  #running definitive SPCA
  mySpca = spca(obj = DATA, xy = XY[,-1], type = 7, a=2, dmin=0.1, scannf = FALSE, plot.nb = FALSE, nfposi = cpos, nfnega = cneg)
  #cat(paste('Components retained:', mySpca$nfposi, 'pos. and', mySpca$nfnega, 'neg.\n'))
  if (as.logical(plot)){
    plot(mySpca$li[,1],mySpca$li[,2])
  }
    #source(paste(scriptPath,'bounding_ellipse.R',sep='/'))
    #samples = row.names(mySpca$li)
    #color.range = colorRampPalette(rainbow(7))(nrow(pops.xy))
    #colors = rep(NA,length(samples))
    #for (s in 1:length(samples)){
      #samples[s] = strsplit(samples[s], '_')[[1]][1]
      #colors[s] = color.range[as.numeric(strsplit(samples[s], '_')[[1]][1])]
    #}
    #par(mfrow=c(2,1))
    #plot(mySpca$li[,1:2],pch=samples, col=colors, main='Global components scatterplot')
    #for (P in 1:nrow(pops.xy)){
      #mvee(mySpca$li[which(samples==as.character(P)),1:2], color=color.range[P],do.points=F,do.add=T,marker=P)
    #}
    #if (mySpca$nfnega == 2){
      #plot(mySpca$li[,3:4],pch=samples, col=colors, main='Local components scatterplot')
      #for (P in 1:nrow(pops.xy)){
        #mvee(mySpca$li[which(samples==as.character(P)),3:4], color=color.range[P],do.points=F,do.add=T,marker=P)
      #}
    #}
  #}
  
  #obtaining median distance between and within pops
  #!# CHANGE HERE: Need to bind XY, not pops.xy
  #Spca.scores = cbind(pops.xy,mySpca$li)
  # columns 1-3 are pop/y/x; columns 4+ are the retained sPCA axis scores
  Spca.scores = cbind(XY, mySpca$li)
  Spca.scores
  #within distances
  spca.Wit.mean = rep(NA,nrow(pops.xy))
  spca.Wit.dsd = spca.Wit.mean
  for (j in 1:nrow(pops.xy)){
    #!# Names aren't 1-npops, columns 1 through 3 are not PCA scores
    #comp.set = Spca.scores[Spca.scores[,1]==toString(j),2:ncol(Spca.scores)]
    comp.set = Spca.scores[Spca.scores[,1] == pops.xy$pop[j],4:ncol(Spca.scores)]
    spca.Wit.mean[j] = mean(dist(comp.set))
    #!# Slight change to naming stats, instead of (j-1) append pop ID
    names(spca.Wit.mean)[j] = paste('Spca.Dmean',pops.xy$pop[j], sep='_')
    spca.Wit.dsd[j] = sd(dist(comp.set))
    names(spca.Wit.dsd)[j] = paste('Spca.Dsd',pops.xy$pop[j], sep='_')
  }
  #between distances
  #!# Slight chane, combinations of the pop ID
  #comp = combn(nrow(pops.xy),2)
  comp = combn(pops.xy$pop,2)
  spca.Bet.mean = rep(NA,ncol(comp))
  names(spca.Bet.mean) = paste(comp[1,],comp[2,],sep='_')
  spca.Bet.dsd = spca.Bet.mean
  for (k in 1:ncol(comp)){
    #!# As above, slight change here to 4:ncol(Spca.scores) instead of 2:ncol(Spca.scores)
    comp.set = rbind(Spca.scores[Spca.scores[,1]==toString(comp[1,k]),4:ncol(Spca.scores)], Spca.scores[Spca.scores[,1]==toString(comp[2,k]),4:ncol(Spca.scores)])
    # take only the cross-population block of the full distance matrix
    spca.Bet.dist = as.matrix(dist(comp.set, upper=T))[1:(nrow(comp.set)/2),((nrow(comp.set)/2)+1):nrow(comp.set)]
    spca.Bet.mean[k] = mean(spca.Bet.dist)
    #!# Slight change in naming here too... same as abovce
    names(spca.Bet.mean)[k] = paste('Spca.Dmean',comp[1,k],comp[2,k],sep='_')
    spca.Bet.dsd[k] = sd(spca.Bet.dist)
    names(spca.Bet.dsd)[k] = paste('SPCA.Dsd',comp[1,k],comp[2,k],sep='_')
  }
  spca.stats = c(spca.Wit.mean, spca.Wit.dsd, spca.Bet.mean, spca.Bet.dsd, myGtest$pvalue, myLtest$pvalue)
  names(spca.stats)[(length(spca.stats)-1):length(spca.stats)] = c('Gtest.pv','Ltest.pv')
  return(spca.stats)
}
#########################################################################################################################




#########################################################################################################################
moran_SSS = function(NSS,xy){  
  #function to calculate Moran's I coefficient as spatial summary statistic
  #NSS: vector of non-spatial summary stats used as input
  #xy: matrix of coordinates of sampled populations
  #Returns a named scalar 'Moran.Beta' (NaN when Moran.I fails)
  
  suppressMessages(require(ape)); suppressMessages(require(gdata))
  
  # inverse-distance weight matrix for Moran.I
  inv.dist.mat = as.matrix(1/dist(xy))
  moran.run = tryCatch.W.E(Moran.I(NSS,inv.dist.mat))
  # an error condition carries a $message; a successful result does not
  if (is.null(moran.run$value$message)){
    moran = moran.run$value$observed	        #for details on the function check its help site
  } else{
    print(moran.run$warning)
    moran = NaN
  }
  names(moran) = 'Moran.Beta'
  return(moran)
}
#########################################################################################################################





#########################################################################################################################
semivar_SSS = function(NSS, xy, pdf='F'){
  #function to calculate spatial summary statistics based on the semivariogram
  #NSS: vector of non-spatial summary stats used as input
  #xy: matrix of coordinates of sampled populations
  #pdf: whether to generate or not a pdf of the empirical variogram and fitted model
  #Returns a named list: Var.Beta, Var.Nugget, Var.Sill, Var.Range (all NaN on failure)
  
  suppressMessages(require(geoR))
  do.pdf = as.logical(pdf)
  
  #bin the pairwise distances into ~sqrt(npairs) classes for the empirical variogram
  xy.dist = dist(xy)
  bins = seq(0,max(xy.dist),by=max(xy.dist)/round(sqrt(length(xy.dist))))
  
  vario.TC = tryCatch.W.E(variog(coord=xy, data=NSS, breaks=bins))
  vario.fit = tryCatch.W.E(variofit(vario.TC$value,fix.nugget=F))
  #an error condition in $value carries a $message; also bail out when the fit
  #has no spatial component to report
  if (!is.null(vario.TC$value$message) || is.null(summary(vario.fit$value)$spatial.component[1])){
    print(vario.TC$warning)
    vario.B = NaN
    vario.N = NaN
    vario.S = NaN
    vario.R = NaN
  } else {
    vario.B = vario.TC$value$beta.ols	#for details on the function check its help site
    vario.N = summary(vario.fit$value)$nugget.component   #tausq = nugget
    vario.S = summary(vario.fit$value)$spatial.component[1] #sigmasq = sill
    vario.R = summary(vario.fit$value)$spatial.component[2]   #phi = range
    if (do.pdf){
      pdf('Variogram_plot.pdf')
      plot(vario.TC$value)
      #FIX: was lines(vario.fit) -- vario.fit is the tryCatch.W.E wrapper list,
      #not the fitted variomodel; the fitted model is in $value
      lines(vario.fit$value)
      dev.off()
    }}
  vario.stats = list('Var.Beta'=vario.B, 'Var.Nugget'=vario.N, 'Var.Sill'=vario.S, 'Var.Range'=vario.R)
  return(vario.stats)
}
#########################################################################################################################




#########################################################################################################################
monmonier_SSS = function(NSS, xy, boundLength, pdf='F'){
  #function to calculate spatial summary statistics based on a monmonier function
  #NSS: vector of non-spatial summary stats used as input
  #xy: matrix of coordinates of sampled populations (must have columns 'X' and 'Y')
  #boundLength: boundary length passed to optimize.monmonier as bd.length
  #pdf: whether to generate or not a pdf of the monmonier boundary
  #Returns a named list of 8 statistics describing the best boundary (NaN on failure)
  
  suppressMessages(require(adegenet)); suppressMessages(require(tripack))
  do.pdf = as.logical(pdf)
  
  #this step introduces minor displacements to sample coordinates which are required when populations are lined-up in 1-dimension because there is no triangulation possible in that case. The minor distortion do not significantly change the neighbor calculation
  if (sd(xy[,1])!=0 & sd(xy[,2])!=0){
    trimesh.TC = tryCatch.W.E(summary(tri.mesh(xy[,'X'],xy[,'Y']))$na)
    #FIX: the original tested trimesh.TC$message and trimesh.TC$values, fields
    #that never exist on a tryCatch.W.E result (its fields are $value/$warning),
    #so the fallback branch was always taken.  Use the computed arc count unless
    #tri.mesh actually failed.
    if (inherits(trimesh.TC$value, 'error') || !is.null(trimesh.TC$warning)){
      n.tries = nrow(xy)-1
    } else {
      n.tries = trimesh.TC$value	#number of total possible arcs in the Delauny triangulation, used to inform the monmonier function of all possible start points to try
    }}
  if (sd(xy[,1]) == 0){
    xy[,1] = xy[,1] + seq(0.001,(0.001*nrow(xy)), by=0.001)
    n.tries = nrow(xy)-1
  }
  if (sd(xy[,2]) == 0){
    xy[,2] = xy[,2] + seq(0.001,(0.001*nrow(xy)), by=0.001)
    n.tries = nrow(xy)-1
  }
  CN = chooseCN(xy, type=1, plot.nb=F)	#This function builds a connection network using in this case a Delauny triangulation
  
  monmonier.TC = tryCatch.W.E(optimize.monmonier(xy, NSS, CN, threshold=0, ntry=n.tries, display.graph=F, bd.length=boundLength))	#for more details on the function check its help site   ###NOTE THAT I TAKE THIS OUT: bd.length=10 of the arguments!!!!
  monmonier = monmonier.TC$value
  #an error condition carries a $message; report NaN for every statistic then
  if (!is.null(monmonier$message)){
    print(monmonier.TC$warning)
    mon.steps = NaN
    mon.Tdist = NaN
    mon.AVGdist = NaN
    mon.SDdist = NaN
    mon.AVGx = NaN
    mon.SDx = NaN
    mon.AVGy = NaN
    mon.SDy = NaN
  } else {
    mon.lengths = c(monmonier$run1$dir1$values,monmonier$run1$dir2$values)	#stores the lengths of the vectors of the best break path		
    mon.xy = rbind(monmonier$run1$dir1$path,monmonier$run1$dir2$path)	#stores the coordinates of the best break path
    mon.steps = length(mon.lengths)
    mon.Tdist = sum(mon.lengths)
    mon.AVGdist = mean(mon.lengths)
    mon.SDdist = sd(mon.lengths)
    mon.AVGx = mean(mon.xy[,'x'])
    mon.SDx = sd(mon.xy[,'x'])
    mon.AVGy = mean(mon.xy[,'y'])
    mon.SDy = sd(mon.xy[,'y'])
    #FIX: was `if (do.pdf == 'yes')` -- do.pdf is logical, so the comparison was
    #always FALSE and the PDF was never produced; sibling semivar_SSS uses if (do.pdf)
    if (do.pdf){
      pdf('Monmonier_plot.pdf')
      plot(monmonier)
      dev.off()
    }}
  mon.stats = list('Mon.Nsteps'=mon.steps, 'Mon.Totdist'=mon.Tdist, 'Mon.Avgdist'=mon.AVGdist, 'Mon.Sddist'=mon.SDdist, 'Mon.AvgX'=mon.AVGx, 'Mon.SdX'=mon.SDx, 'Mon.AvgY'=mon.AVGy, 'Mon.SdY'=mon.SDy)
  return(mon.stats)
}
#########################################################################################################################




#########################################################################################################################
mantel_SSS = function(NSS,xy){
  #function to calculate Mantel correlogram values as spatial summary statistics
  #NSS: vector of non-spatial summary stats used as input
  #xy: matrix of coordinates of sampled populations
  #Returns a named list: Man.AvgCor, Man.SdCor, Man.MaxCor, Man.MaxLag (NaN on failure)
  
  suppressMessages(require(vegan))
  
  correlog.TC = tryCatch.W.E(mantel.correlog(NSS,XY=xy))
  correlog = correlog.TC$value
  # an error condition carries a $message; a successful result does not
  if (is.null(correlog$message)){
    lag.cors = correlog$mantel.res[,'Mantel.cor']   # Mantel correlation per distance class
    MantCor.x = mean(lag.cors, na.rm=T)
    MantCor.sd = sd(lag.cors, na.rm=T)
    MantCor.max = max(lag.cors, na.rm=T)
    # distance-class index of the (first) maximum correlation
    MantCor.ml = as.numeric(which(lag.cors==MantCor.max)[1])
  } else {
    print(correlog.TC$warning)
    MantCor.x = NaN
    MantCor.sd = NaN
    MantCor.max = NaN
    MantCor.ml = NaN
  }
  mant.stats = list('Man.AvgCor'=MantCor.x, 'Man.SdCor'=MantCor.sd, 'Man.MaxCor'=MantCor.max, 'Man.MaxLag'=MantCor.ml)
  return(mant.stats)
}
#########################################################################################################################
# stranda/holoSimCell documentation built on Aug. 4, 2023, 1:12 p.m.