R/additional_stats_functions.R

Defines functions graph_theory psiCalc gssa_raggedness data_reformat maj_min_fix

#Additional functions to calculate spatial statistics for holostats
#functions all run in Holostats R script

#Fix major (0) and minor (1) allele designations from simulated data
  maj_min_fix <- function(x){
    if(mean(x) > 0.5){
      x <- -1*(x-1)
    }
    x
  }


#Change SNP table into Alvarado-Serrano format
data_reformat <- function(raw_data){
  st <- raw_data$deme
  ones <- rep(1:1,each=length(st))
  act_snps <- raw_data[,-c(1,2)] 
  act_snps <- apply(act_snps,2,as.character)
  act_snps <- apply(act_snps,2,as.numeric)
  act_snps <- apply(act_snps,2,maj_min_fix)
  SNPs_reformat <- as.data.frame(cbind(st,ones,act_snps))
  
  SNPs_reformat[,-1] <- apply(SNPs_reformat[,-1],2,as.character)
  SNPs_reformat[,-1] <- apply(SNPs_reformat[,-1],2,as.numeric)
  SNPs_reformat

}

#Calculate GSSA and raggedness index (Alvarado-Serrano & Hickerson 2018)
gssa_raggedness <- function(data=NULL,dist_mat=NULL){
  split <- vector("list", length(unique(data$st)))
  for(p in 1:length(unique(data$st))) {
    split[[p]] <- data[data$st == unique(data$st)[p],]
    names(split)[p] <- unique(split[[p]]$st)
  }
  maID <- function(x) {
    as.numeric(names(which(table(x) == min(table(x)))))[1]
  }
  ma_vec <- apply(data[,-c(1,2)], 2, maID)
  
  getMAFreq <- function(x=NULL, ma=NULL) {
    length(which(x == ma))
  }
  
  ma_freq_mat = matrix(data = NA, nrow = length(ma_vec), ncol = length(split))
  for(pop in 1:length(split)) {
    ma_freq_mat[,pop] = mapply(getMAFreq, split[[pop]][,-c(1,2)], ma_vec)
  }
  
  gssa_vecs = vector("list",length(split))
  
  for(pop in 1:length(split)) {    
    tmp = rep(0, sum(sapply(ma_freq_mat[ma_freq_mat[,pop] > 1,pop],choose,k=2)))
    gssa_vecs[[pop]] = append(gssa_vecs[[pop]],tmp)
    
    other = c(1:length(gssa_vecs))
    other = other[-pop]
    for(op in other) {
      times = sum(ma_freq_mat[,pop]*ma_freq_mat[,op])
      gssa_vecs[[pop]] = append(gssa_vecs[[pop]], rep(dist_mat[pop,op],times))
      rm(times)	
    }
  }
  
  breakpts = seq(range(dist_mat)[1],range(dist_mat)[2],length=nclass.Sturges(dist_mat))

 
  
  index <- as.data.frame(rep(0,nrow(dist_mat)))
  colnames(index) <- "HRi"  
  index$gssa_mean <- NA
  index$gssa_var <- NA 
  i <- 1
  for(i in 1:nrow(dist_mat)){
    hist_snps <- hist(gssa_vecs[[i]],breaks = breakpts,plot = FALSE)
    snps_bins <- hist_snps$counts
    dist_mat_i <- dist_mat[i,]
    hist_dist <- hist(dist_mat_i,breaks = breakpts,plot = FALSE)
    dist_bins <- hist_dist$counts
    
    gen_bin_corr <- abs(lm(as.numeric(snps_bins)~as.numeric(dist_bins))$residuals)
    
    non_zero_bins <- as.numeric(which(dist_bins!=0.0))
    snps_bin1 <- gen_bin_corr[non_zero_bins]
    snps_bin1 <- snps_bin1/sum(snps_bin1)    
    comb <- combn(length(snps_bin1),2)
    paired <- combn(snps_bin1,2)
    paired <- paired[,which(colDiffs(comb)==1)]
    diffs2 <- (colDiffs(paired))^2
    ragged <- sum(diffs2)
    index$HRi[i] <- ragged
    
    index$gssa_mean[i] <- mean(gssa_vecs[[i]])
    index$gssa_var[i] <- var(gssa_vecs[[i]])
  }
  index
}

#Function to calculate directionality index - psi - Peter & Slatkin (2013) 
psiCalc = function(data, samplen=20) {
  combos <- combn(colnames(data),2)
  
  combo_names <- apply(combos,2,FUN=function(x){paste0(x[1],".",x[2])})
 
  psi = c()
  pair <- 1
  for(pair in 1:length(combos[1,])) {
    tmp_locMAF <- data[,c(combos[1,pair], combos[2,pair])]
    varloci <- which(rowSums(tmp_locMAF == 1) == 0 & rowSums(tmp_locMAF == 0) == 0)
    if(length(varloci) > 0) {
      tmp_locMAF <- tmp_locMAF[varloci,]
      if(length(varloci) == 1) {
        psi[pair] = tmp_locMAF[1]-tmp_locMAF[2]
      } else {
        psi[pair] = mean(tmp_locMAF[,1])-mean(tmp_locMAF[,2])
      }
    } else {
      psi[pair] = NA
    }
  }
  names(psi) <- paste0("Psi_",combo_names)
  psi
}


#Use popgraph package to calculate conditional genetic distance, betweenness centrality, closeness centrality
graph_theory <- function(data = data, stats = c("cGD", "betweenness", "closeness"), plot = FALSE) {
	indat <- apply(data[,-c(1,2)], 2, as.numeric)
	indat2 <- matrix(data = NA, nrow = nrow(indat)/2, ncol = ncol(indat))
	allele1row <- seq(1,nrow(indat),2)
	for(r in 1:nrow(indat2)){
		indat2[r,] <- indat[allele1row[r],]+indat[(allele1row[r]+1),]
	}
	pops <- as.character(data$st[allele1row])
	lat <- rep(NA, nrow(indat2))
	long <- rep(NA, nrow(indat2))
	colnames(indat2) <- paste0("loc",c(1:ncol(indat2)))
	writeme <- data.frame(Population = pops, Latitude = lat, Longitude = long, indat2)
	if (FALSE) 
	{
	  tmpfilename <- paste0("tmpsnp-",round(runif(1),5),".csv")
	  write.csv(writeme, tmpfilename, quote = FALSE, row.names = FALSE)
	  indat3 <- read_population(tmpfilename, type = "snp", locus.columns = c(4:ncol(writeme)))
	  file.remove(tmpfilename)
	} else {# use text connection
	  vec=  c(paste0(paste(names(writeme),collapse=","),"\n"),sapply(1:nrow(writeme),function(l){paste0(paste(writeme[l,],collapse=", "),"\n")}))
	  indat3 <- read_population(textConnection(vec), type = "snp", locus.columns = c(4:ncol(writeme)))
	}
	
  indat3$Population <- writeme$Population
	indat4 <- to_mv(indat3)
	pops <- indat3$Population
	

	myg <- popgraph(x = indat4, groups = pops)
		
	if(plot == TRUE) {
		plot(myg)
	}

	tmpout <- c()
	if("cGD" %in% stats) {
		cGD <- to_matrix(myg, mode = "shortest path")
		combo_names <- combn(rownames(cGD),2)
		combo_names <- apply(combo_names, 2, FUN = function(x){paste0("cGD-",x[1],".",x[2])})
		cGD_out <- cGD[lower.tri(cGD) == TRUE]
		cGD_out <- as.vector(cGD_out)
		names(cGD_out) <- combo_names
		tmpout <- append(tmpout, cGD_out)
	}

	if("betweenness" %in% stats) {
		bwness <- betweenness(myg)
		names(bwness) <- paste0("bwness-", names(bwness))
		tmpout <- append(tmpout, bwness)
	}	

	if("closeness" %in% stats) {
		cness <- closeness(myg)
		names(cness) <- paste0("cness-", names(cness))
		tmpout <- append(tmpout, cness)
	} 

	tmpout
}
stranda/holoSimCell documentation built on Aug. 4, 2023, 1:12 p.m.