R/GDM_Table_Funcs.R
In Gdm01: Create Generalised Dissimilarity Models

Documented in createsitepair gdm.formatsitepair

##
## This version is for the R Extension
##
## Edit 10:02 03/10/2013		1: altered call to use PACKAGE dll gdm
## Edit 10:26 08/10/2013                1: added wordsize function to determine if 32bit or 64bit OS. 
## Edit 08:00 22/11/2013                1: fixed complaints from R-Forge checking
## Edit 13:27-US-EST 24/2/14            1: Added formatGDMData and createSitePair functions -MDL
#


"gdm" <- 
function (data, geo=FALSE, splines=NULL, quantiles=NULL) 
{
	options(warn.FPU = FALSE)
##
##	sanity check on the data table
##
  if(class(data)!="data.frame"){
    warning("The input site-pair data table is not a data frame. Due to this the GDM results may not be correct.")
  }
  
	if ( ncol(data) < 6 )
	    stop("Not enough columns in data")
	if ( nrow(data) < 1 )
	    stop("Not enough rows in data")


##
##	check that the response data is [0..1]
##
	rtmp <- data[,1]
	if (length(rtmp[rtmp<0]) > 0)
 	    stop("Response data has negative values")
	if (length(rtmp[rtmp>1]) > 0)
	    stop("Response data has values greater than 1")


##
##	current data format is response,weights,X0,Y0,X1,Y1 before any predictor data (thus 6 leading columns)
##
	LEADING_COLUMNS <- 6
	if ( geo ) 
        {
	    nPreds <- ( ncol(data) - LEADING_COLUMNS ) / 2 + 1		
	}
	else 
        {
	    nPreds <- ( ncol(data) - LEADING_COLUMNS ) / 2
	}

	if ( nPreds < 1 )
	    stop("Data has NO predictors")


##
## 	setup the predictor name list
##
	if ( geo ) 
        {
	    if ( nPreds > 1 )
		predlist <- c("Geographic", names(data)[(LEADING_COLUMNS+1):(LEADING_COLUMNS+nPreds-1)])
	    else
		predlist <- c("Geographic")
	}
	else 
        {
  	    predlist <- names(data)[(LEADING_COLUMNS+1):(LEADING_COLUMNS+nPreds)]
	}


##
##	deal with the splines and quantiles
##
	if (is.null(quantiles))
	{
	    ##
	    ## generate quantiles internally from the data
	    ##
	    if ( is.null(splines) ) 
            {
		nSplines <- 3
		quantvec <- rep(0, nPreds * nSplines)
		splinvec <- rep(nSplines, nPreds)

		if ( geo ) 
                {
      		    ## get quantiles for the geographic distance
            	    v <- sqrt((data[,3]-data[,5])^2 + (data[,4]-data[,6])^2)
		    quantvec[1] <- min(v)
      		    quantvec[2] <- median(v)
            	    quantvec[3] <- max(v)

		    if ( nPreds > 1 ) 
                    {
		        ## get quantiles for the environmental predictors
			for (i in seq(from = 1, to = nPreds-1, by = 1)) 
                        {
            		    v <- c(data[,i+LEADING_COLUMNS], data[,i+LEADING_COLUMNS+nPreds-1])                 
      	            	    index = i * nSplines
		            quantvec[index+1] <- min(v)
		      	    quantvec[index+2] <- median(v)
      	      	      	    quantvec[index+3] <- max(v)
	      	      	}
		    }
    		}
		
		else 
                {
		    ## get quantiles for the environmental predictors after skipping geographic preds
	      	    for (i in seq(from = 1, to = nPreds, by = 1)) 
                    {
			v <- c(data[,i+LEADING_COLUMNS], data[,i+LEADING_COLUMNS+nPreds])                   
            	      	index = (i-1) * nSplines
	            	quantvec[index+1] <- min(v)
	      	        quantvec[index+2] <- median(v)
            	      	quantvec[index+3] <- max(v)
		    }
		}
	    }
		
	    else 
            {
		##
		##	otherwise check that the supplied splines vector has enough data and minumum spline values of 3
		##
		if ( length(splines) != nPreds ) 
                {
		    stop(paste("splines argument has", length(splines), "items but needs", nPreds, "items."))
		}

  		## count the total number of user defined splines to dimension the quantiles vector
		quantvec <- rep(0, sum(splines))
		splinvec <- splines

		if ( geo ) 
                {
		    if ( splines[1] < 3 )
			stop("Must greater than or equal to 3 splines per predictor")

                    ## get quantiles for the geographic distance
      	      	    v <- sqrt((data[,3]-data[,5])^2 + (data[,4]-data[,6])^2)
		    quantvec[1] <- min(v)		## 0% quantile
      	      	    quantvec[splines[1]] <- max(v)	## 100% quantile

		    quant_increment <- 1.0 / (splines[1]-1)
		    this_increment <- 1
		    for (i in seq(from = 2, to = (splines[1]-1), by = 1)) 
                    {  
		        ## mid % quantiles
			quantvec[i] <- quantile(v,quant_increment*this_increment)
			this_increment <- this_increment + 1
		    }

		    if ( nPreds > 1 ) 
                    {
			## get quantiles for the environmental predictors
			current_quant_index <- splines[1]
			for (i in seq(from = 1, to = nPreds-1, by = 1)) 
                        {
      	      		    num_splines <- splines[i+1]
			    if ( num_splines < 3 )
			        stop("Must greater than or equal to 3 splines per predictor")

			    v <- c(data[,i+LEADING_COLUMNS], data[,i+LEADING_COLUMNS+nPreds-1])                 
			    quantvec[current_quant_index+1] <- min(v)	            ## 0% quantile
	      		    quantvec[current_quant_index+num_splines] <- max(v)	    ## 100% quantile

			    quant_increment <- 1.0 / (num_splines-1)
			    this_increment <- 1
			    for (i in seq(from = 2, to = (num_splines-1), by = 1)) 
                            {  
				## mid % quantiles
				quantvec[current_quant_index+i] <- quantile(v,quant_increment*this_increment)
				this_increment <- this_increment + 1
			    }
			
			    current_quant_index <- current_quant_index + num_splines
			}
		    }
                }

		else 
                {
		    ## get quantiles for the environmental predictors
		    current_quant_index <- 0
		    for (i in seq(from = 1, to = nPreds, by = 1)) 
                    {
      	      	        num_splines <- splines[i]
			if ( num_splines < 3 )
			    stop("Must greater than or equal to 3 splines per predictor")

                        v <- c(data[,i+LEADING_COLUMNS], data[,i+LEADING_COLUMNS+nPreds])          
			quantvec[current_quant_index+1] <- min(v)	        ## 0% quantile
	      		quantvec[current_quant_index+num_splines] <- max(v)	## 100% quantile

			quant_increment <- 1.0 / (num_splines-1)
			this_increment <- 1
			for (i in seq(from = 2, to = (num_splines-1), by = 1)) 
                        {  
			    ## mid % quantiles
			    quantvec[current_quant_index+i] <- quantile(v,quant_increment*this_increment)
			    this_increment <- this_increment + 1
			}
			current_quant_index <- current_quant_index + num_splines
		    }
		}
	    }
	}

	else
	{
	    ##
	    ## user defined quantiles supplied as an argument
	    ##
	    if ( is.null(splines) ) 
            {
	        ## check that there are nPreds * 3 quantiles in the user defined vector
		if ( length(quantiles) != (nPreds * 3) ) 
                {
		    stop(paste("There should be", (nPreds * 3), "items in the quantiles argument, not", length(quantiles), "items."))
		}

		## now check that each of the three quantiles for each predictor are in ascending order
		for (i in seq(from = 1, to = nPreds, by = 1)) 
                {
                    index = i * 3
		    if ((quantiles[index-1] < quantiles[index-2] ) || 
                        (quantiles[index] < quantiles[index-2]) || 
                        (quantiles[index] < quantiles[index-1])) 
                    {
			stop(paste("Wrong quantiles for predictor:", predlist[i]))
		    }
		}

                nSplines <- 3
		quantvec <- quantiles
		splinvec <- rep(nSplines, nPreds)
	    }
		
	    else 
            {
		## check that there are sum(splines) quantiles in the user defined vector
		if ( length(quantiles) != sum(splines) ) 
                {
		    stop(paste("There should be", sum(splines), "items in the quantiles argument, not", length(quantiles), "items."))
		}
				
		## now check that each of the quantiles for each predictor are in ascending order
		index = 0
		for (i in seq(from = 1, to = nPreds, by = 1)) 
                {
		    for (j in seq(from = 2, to = splines[i], by = 1)) 
                    {
                        if (quantiles[index+j] < quantiles[index+j-1]) 
                        {
			    stop(paste("Wrong quantiles for predictor:", predlist[i]))
			}
		    }
		    index <- index + splines[i]
		}

		quantvec <- quantiles
		splinvec <- splines
	    }
	}

      	p1 <- 0
	p2 <- 0
    	p3 <- 0
    	p4 <- 0
    	p5 <- rep(0,times=length(quantvec))
      	p6 <- rep(0,times=nrow(data))
      	p7 <- rep(0,times=nrow(data))
      	p8 <- rep(0,times=nrow(data))
	
##
##  Call the dll function
##
	z <- .C( "GDM_FitFromTable",
                 paste(getwd()),
                 as.matrix(data),
                 as.integer(geo),
                 as.integer(nPreds), 
                 as.integer(nrow(data)), 
                 as.integer(ncol(data)),
                 as.integer(splinvec),
                 as.double(quantvec),                 
                 gdmdev = as.double(p1),
                 nulldev = as.double(p2),
                 expdev = as.double(p3),
                 intercept = as.double(p4),         
                 coeffs = as.double(p5),
                 response = as.double(p6),
                 preddata = as.double(p7),
                 ecodist = as.double(p8), 
                 PACKAGE = "gdm"
               )

##	call <- match.call()
        m <- match.call(expand.dots = F)
        
    	gdmOb = structure(list(dataname = m[[2]],
                       geo = geo,
                       sample = nrow(data),
                       gdmdeviance = z$gdmdev,
                       nulldeviance = z$nulldev,
                       explained = z$expdev,
                       intercept = z$intercept,
                       predictors = predlist,
                       coefficients = z$coeffs,
                       quantiles = quantvec,
                       splines = splinvec,
                       creationdate = date(),
                       observed = z$response,
                       predicted = z$preddata,
                       ecological = z$ecodist))         
  
  class(gdmOb) = c("gdm", "list")
  
  return(gdmOb)
}





"plot.gdm" <- 
function (model, plot.layout = c(2,2), plot.color = rgb(0.0,0.0,1.0), plot.linewidth=2.0) 
{
      	options(warn.FPU = FALSE)
      	PSAMPLE <- 200
      	preddata <- rep(0,times=PSAMPLE)
			
	##
	## establish what plot layout to use
	##  
	thisplot <- 0
	one_page_per_plot <- FALSE
	if ((plot.layout[1]==1) && (plot.layout[2]==1)) 
		one_page_per_plot <- TRUE
	else
	  par(mfrow=plot.layout)	


      	##
      	## apply the link function and plot.....
      	##
      	plot( model$ecological, 
              model$observed, 
              xlab="Predicted Ecological Distance", 
              ylab="Observed Compositional Dissimilarity", type="n" )

        points( model$ecological, model$observed, pch=20, cex=0.25, col=plot.color )

        overlayX <- seq( from=min(model$ecological), to=max(model$ecological), length=PSAMPLE )
        overlayY <- 1 - exp( - overlayX )
        lines( overlayX, overlayY, lwd=plot.linewidth ) 
	thisplot <- thisplot + 1


        ##
        ## use the raw data and plot.....
        ##
	if (one_page_per_plot)
        {
      		dev.new()
        	dev.next()
	}
        plot( model$predicted, 
              model$observed, 
              xlab="Predicted Compositional Dissimilarity", 
              ylab="Observed Compositional Dissimilarity", type="n" )

        points( model$predicted, model$observed, pch=20, cex=0.25, col=plot.color )

        overlayX <- overlayY <- seq( from=min(model$predicted), to=max(model$predicted), length=PSAMPLE )
        lines( overlayX, overlayY, lwd=plot.linewidth ) 
	thisplot <- thisplot + 1


        ##
        ## determine the max of all the predictor data
        ##
        preds <- length(model$predictors)
        predmax <- 0
        splineindex <- 1
        for ( i in 1:preds ) 
        {  
            ## only if the sum of the coefficients associated with this predictor is > 0.....
            numsplines <- model$splines[i]
            if ( sum(model$coefficients[splineindex:(splineindex+numsplines-1)]) > 0 ) 
            {
	        ## get predictor plot Y-data                            
                z <- .C( "GetPredictorPlotData", 
                         pdata = as.double(preddata),
                         as.integer(PSAMPLE),
                         as.double(model$coefficients[splineindex:(splineindex+numsplines-1)]),
                         as.double(model$quantiles[splineindex:(splineindex+numsplines-1)]),
                         as.integer(numsplines), 
                         PACKAGE = "gdm"
                       )
		                        
		v <- max(z$pdata)
                if (v > predmax ) predmax <- v
            }
            splineindex <- splineindex + numsplines
	}

	
        ##
        ## plot the predictors with non-zero sum of coefficients
        ##      
        splineindex <- 1
	for ( i in 1:preds ) 
        {  
            ## only if the sum of the coefficients associated with this predictor is > 0.....
            numsplines <- model$splines[i]
            if ( sum(model$coefficients[splineindex:(splineindex+numsplines-1)]) > 0 ) 
            {
		if (one_page_per_plot)
                {
      		    dev.new()
		    dev.next()
		}
		
   	        else 
                {
                    thisplot <- thisplot + 1
		    if (thisplot > (plot.layout[1] * plot.layout[2])) 
                    {	
			dev.new()                              				
                        dev.next()					
			thisplot <- 1
                        par(mfrow=plot.layout)	
		    }
		}
                  
		## get predictor plot Y-data    
                 z <- .C( "GetPredictorPlotData", 
                           pdata = as.double(preddata),
                           as.integer(PSAMPLE),
                           as.double(model$coefficients[splineindex:(splineindex+numsplines-1)]),
                           as.double(model$quantiles[splineindex:(splineindex+numsplines-1)]),
                           as.integer(numsplines),
                           PACKAGE = "gdm"
                        )

                        
                  plot( seq(from=model$quantiles[[(i*3)-2]],to=model$quantiles[[(i*3)]], length=PSAMPLE),
                  	z$pdata, 
                        xlab=model$predictors[i], 
                        ylab=paste("f(", model$predictors[i], ")", sep="" ), 
            	  	ylim=c(0,predmax), type="l" )
      	    }
	      	splineindex <- splineindex + numsplines
      	}       
}






"predict.gdm" <- 
function (model, data) 
{
    options(warn.FPU = FALSE)

#
#  Call the dll function
#
    predicted <- rep(0,times=nrow(data))
    z <- .C( "GDM_PredictFromTable",
             as.matrix(data),
             as.integer(model$geo),
             as.integer(length(model$predictors)), 
             as.integer(nrow(data)), 
             as.double(model$quantiles),
             as.integer(model$splines),
             as.double(c(model$intercept,model$coefficients)),
             preddata = as.double(predicted),
             PACKAGE = "gdm"
           )
    return(z$preddata)
}



"gdm.transform" <- 
function (model, data) 
{
    options(warn.FPU = FALSE)

#
#  Call the dll function
#
    transformed <- matrix(0,nrow(data),ncol(data))
    z <- .C( "GDM_TransformFromTable",
             as.integer(nrow(data)), 
             as.integer(ncol(data)),
             as.integer(model$geo),
             as.integer(length(model$predictors)), 
             as.integer(model$splines),             
             as.double(model$quantiles),             
             as.double(model$coefficients),
             as.matrix(data),
             trandata = as.double(transformed),
             PACKAGE = "gdm"
           )
    ##
    ## Convert transformed from a vector into a data frame before returning...
    ##	
    nRows <- nrow(data)
    nCols <- ncol(data)
    ##
    ## z$trandata is the transformed data vector created in GDM_TransformFromTable
    ##
    myVec <- z$trandata
    pos <- 1
    for (i in seq(from = 1, to = nCols, by = 1)) 
    {
        tmp <- myVec[seq(from=pos, to=pos+nRows-1)]
        transformed[,i] <- tmp
        pos <- pos + nRows
    }
    return(transformed)
}



"summary.gdm" <- 
function (model) 
{
        print( "", quote=F )    
        print( "", quote=F )    
        print( "GDM Modelling Summary", quote=F );
        print( paste( "Creation Date: ", model$creationdate ), quote=F );
        print( "", quote=F )    
##        call <- match.call()
        m <- match.call(expand.dots = F)
        print( paste( "Name: ", m[[2]] ), quote=F )
        print( "", quote=F )    
        print( paste( "Data: ", model$dataname ), quote=F )
        print( "", quote=F )    
        print( paste( "Samples: ", model$sample ), quote=F )
        print( "", quote=F )    
        print( paste( "Use Geographical Distance: ", model$geo ), quote=F )
        print( "", quote=F )    
        print( paste( "NULL Deviance: ", model$nulldeviance ), quote=F )
        print( paste( "GDM Deviance: ", model$gdmdeviance ), quote=F )  
        print( paste( "Deviance Explained: ", model$explained ), quote=F )
        print( "", quote=F )    
        print( paste( "Intercept: ", model$intercept ), quote=F )
        print( "", quote=F )    
        thiscoeff <- 1
        thisquant <- 1
        for ( i in 1:length(model$predictors) ) {
        print( paste( "Predictor ",i,": ",model$predictors[[i]], sep="" ), quote=F )            
        print( paste( "Splines: ",model$splines[[i]], sep="" ), quote=F )
        numsplines <- model$splines[[i]]
        for ( j in 1:numsplines ) {
                if ( j == 1 ) print( paste( "Min Quantile: ",model$quantiles[[thisquant]], sep="" ), quote=F )          
                else if ( j == numsplines ) print( paste( "Max Quantile: ",model$quantiles[[thisquant]], sep="" ), quote=F )
                else print( paste( round(100/(numsplines-1),digits=2),"% Quantile: ",model$quantiles[[thisquant]], sep="" ), quote=F )
                thisquant <- thisquant + 1
        }
        for ( j in 1:numsplines ) {
            print( paste( "Coefficient[",j,"]: ",model$coefficients[[thiscoeff]], sep="" ), quote=F )
            thiscoeff <- thiscoeff + 1
        }
        print( "", quote=F )                
    }   
}




##"gdm.getwordsize" <- 
##function()
##{
##    p1 <- 0
##    z <- .C( "GetWordSize", wsize = as.integer(p1), PACKAGE = "gdm" )
##    return(z$wsize)
##}

gdm.formatsitepair <- function(bioData, bioFormat, dist="bray", abundance=F, siteColumn=NULL, XColumn, YColumn, sppColumn=NULL, abundColumn=NULL, sppFilter=0, predData, distPreds=NULL, weightType="def", custWeightVect=NULL, maxPairs=NULL){
  ##Takes species data from a variety of commonly used formats and transforms it into the
  ##required format for GDM
  ##Input Variables:
  ##bioData = the input species data for the gdm
  ##bioFormat = the format of which the species data has been given to the function
  ##dist = the dissimularity distance, defined by the vegdist fuction in the vegan package
  ##abundance = rather the species data represents abundance or presense-absense
  ##XColumn = the x coordinates of the sample site
  ##YColumn = the y coordinates of the sample site
  ##siteColumn = the name of the column in the species data which holds the site identification, needed for table type 1
  ##abundColumn = the name of the column which holds the species data, needed for table type 2
  ##sppColumn = species code column, like name, needed for table type 2
  ##sppFilter = number of spp as mimimum 
  ##predData = environmental data to be tacked onto the species data
  ##distPreds = a list of prediction matrices
  ##weightType = how the weights of the site-pair table are calculated
  ##custWeightVect = custom weights vector
  ##Output Variables:
  ##outTable = the fully calculated site-pair table for GDM
  ###########################
  #bioData <- sitePair15
  #bioFormat <- 4
  #dist <- "bray"
  #abundance <- F
  #siteColumn <- NULL
  #XColumn <- "xCoord.S1"
  #YColumn <- "yCorrd.S1"
  #sppColumn <- NULL
  #sppFilter <- 0
  #abundColumn <- NULL
  #predData <- envDat
  #distPreds <- predMats
  #weightType <- "def"
  #custWeightVect <- NULL
  #maxPairs <- 360
  ###########################
  ##required libraries
  #require(raster)
  #require(reshape2)
  #require(vegan)
  #print(class(predData))
  ##if rows of bioData and predData do not match, exit function
  ##if bioFormat is not a number, exit function
  if(is.numeric(bioFormat)==FALSE){
    stop("bioFormat variable not a number")
  }
  ##if maxPairs is not a number, then exit function
  if(is.numeric(maxPairs)==FALSE & is.null(maxPairs)==FALSE){
    stop("maxPairs varialbe is not a number")
  }
  ##makes sure that sppFilter is a number, if not exit function
  if(is.numeric(sppFilter)==FALSE){
    stop("sppFilter variable not a number")
  }
  
  if(weightType!="def" & weightType!="cust" & weightType!="rich"){
    stop("weightType variable not acceptable")
  }
  
  if(bioFormat==3 & weightType=="rich"){
    stop("cannot calculate richness without knowing species data")
  }
  ##warns if distPreds are not matrices
  for(mat in distPreds){
    if(class(mat)!="matrix"){
      warning("Not all of the given dissimularity matricies are not of class 'matrix'. Therefore, IF the data is incorperated without error, it maybe incorrect.")
    }
  }
  ##checks input data format
  ##species data as site-species matrix
  if(bioFormat==1){
    ##processes site-species matrix
    siteNum <- which(colnames(bioData)==siteColumn)
    
    ##point locations
    x <- which(colnames(bioData)==XColumn)
    ##checks to see if coordinates are found with bioData, if not extracts them from envData
    ##coordinates need to be in bioData if envData is raster
    if(is.na(x[1])==T){
      x <- which(colnames(predData)==XColumn)
      y <- which(colnames(predData)==YColumn)
      locs <- predData[c(x,y)]
      sppDat <- bioData[,-c(siteNum)]
    }else{
      y <- which(colnames(bioData)==YColumn)
      locs <- bioData[c(x,y)]
      sppDat <- bioData[,-c(siteNum, x, y)]
    }
    
    ##filters out sites with low species count
    ##prep for filtering
    headDat <- bioData[,c(siteNum)]
    #sppDat[as.numeric(sppDat)>=1]=1
    #sppDat[as.numeric(sppDat)==0]=0
    sppDat[sppDat>=1]=1
    sppDat[sppDat==0]=0
    sppTotals <- cbind(as.data.frame(headDat), apply(sppDat, 1, function(m){sum(as.numeric(m))}))
    ##filters out data
    filterBioDat <- subset(sppTotals, sppTotals[colnames(sppTotals)[2]] >= sppFilter)
    spSiteCol <- filterBioDat["headDat"]
    colnames(spSiteCol) <- siteColumn
    ##reassembles data after filtering
    bioData <- merge(spSiteCol, bioData, by=siteColumn)
    
    ##checks unique sites against rasters
    if(class(predData)=="RasterStack" | class(predData)=="RasterLayer" | class(predData)=="RasterBrick"){
      #spSiteCol = bioData[siteColumn]
      siteRaster <- predData[[1]]
      ##raster based site
      #x = which(names(bioData)==XColumn)
      #y = which(names(bioData)==YColumn)
      #locs = bioData[c(x,y)]
      cellID <- as.data.frame(cellFromXY(siteRaster, locs))
      names(cellID)[names(cellID)=="cellFromXY(siteRaster, locs)"] <- "cellName"
      names(spSiteCol)[names(spSiteCol)==siteColumn] <- "siteY"
      
      ##checks size, and if needed correcting
      ##if cellID less than siteColumn
      if(length(unique(cellID$cellName))<length(unique(spSiteCol$siteY))){
        warning("less unique cells identified than unique sites in the site column, using cell location as sites")
      }else if(length(unique(cellID$cellName))>length(unique(spSiteCol$siteY))){
        ##if siteColumn less than cellID
        warning("more unique cells identified than unique sites in site column, defaulting sites to cells")
      }
      
      ##aggregates bioData data into classes by raster cells
      newDataTable <- bioData[-c(siteNum, x, y)]
      modDataTable <- cbind(cellID, newDataTable)
      #uniqueCells = unique(cellID$cellName)
      if(abundance==T){
        ##compress by cell
        byCell <- aggregate(modDataTable, modDataTable[1], FUN=mean)
        ##removes unneeded columns
        inDataTable <- byCell[-c(1, 2)]
      }else{
        ##compress by cell
        byCell <- aggregate(modDataTable, modDataTable[1], FUN=sum)
        ##transforms data to binary
        byCell <- byCell[-c(1, 2)]
        byCell[byCell>=1] <- 1
        inDataTable <- byCell
      }
      
    }else{
      predData <- merge(spSiteCol, predData, by=siteColumn)
      predSite <- which(names(predData)==siteColumn)
      predData <- predData[-predSite]
      #x = which(names(bioData)==XColumn)
      #y = which(names(bioData)==YColumn)
      inDataTable <- bioData[-c(siteNum, x, y)]
    }
    ##creates distance matrix
    if(abundance==F){
      distData <- vegdist(inDataTable, dist, binary=T)
    }else{
      distData <- vegdist(inDataTable, dist, binary=F)
    }
    
    ########################################################################
    ##species data as x,y,species list
  }else if(bioFormat==2){
    ##insert data if not available
    if(is.null(abundColumn)){
      warning("No abundance column specified, assuming all data is presence only")
      bioData["reallysupercoolawesomedata"] <- 1
      abundColumn <- "reallysupercoolawesomedata"
    }
    ##point locations
    x <- which(names(bioData)==XColumn)
    y <- which(names(bioData)==YColumn)
    locs <- bioData[c(x,y)]
    
    ##processes corrds, species list
    ##if environmental data is raster, checks size of data to makes sure there is not problems with end table binding
    if(class(predData)=="RasterStack" | class(predData)=="RasterLayer" | class(predData)=="RasterBrick"){
      ##raster based site
      ##site raster
      siteRaster <- predData[[1]]
      cellID <- as.data.frame(cellFromXY(siteRaster, locs))
      names(cellID)[names(cellID)=="cellFromXY(siteRaster, locs)"] <- "cellName"
      
      ##if siteColumn has been provided
      if(!is.null(siteColumn)){
        ##species site
        spSiteCol <- bioData[siteColumn]
        names(spSiteCol)[names(spSiteCol)==siteColumn] <- "siteY"
        ##checks size, and if needed correcting
        ##if cellID less than siteColumn
        if(length(unique(cellID$cellName))<length(unique(spSiteCol$siteY))){
          warning("less unique cells identified than unique sites in the site column, using cell location as sites")
        }else if(length(unique(cellID$cellName))>length(unique(spSiteCol$siteY))){
          ##if siteColumn less than cellID
          warning("more unique cells identified than unique sites in site column, defaulting sites to cells")
        }
        ##removes unneeded columns
        siteNum <- which(names(bioData)==siteColumn)
        newDataTable <- bioData[-c(siteNum, x, y)]
      }else{
        ##removes unneeded columns
        newDataTable <- bioData[-c(x, y)]
      }
      
      ##casts data into site-species matrix
      modDataTable <- cbind(cellID, newDataTable)
      names(modDataTable)[names(modDataTable)==sppColumn] <- "spcodeUltimateCoolness"
      castData <- dcast(modDataTable, cellName~spcodeUltimateCoolness, value.var=abundColumn)
      
      ##aggregate data by cellID, and filters data by low spp number
      if(abundance==T){
        ##compress by cell
        cellName <- which(names(castData)=="cellName")
        byCell <- aggregate(castData, castData[cellName], FUN=mean)
        byCell <- byCell[-2]
        
        ##filters species data
        sppDat <- byCell[-c(cellName)]
        headDat <- byCell[c(cellName)]
        sppDat[sppDat>=1] <- 1
        sppDat[is.na(sppDat)] <- 0
        sppTotals <- cbind(headDat, rowSums(sppDat))
        ##filters out data
        filterBioDat <- subset(sppTotals, sppTotals["rowSums(sppDat)"] >= sppFilter)
        spSiteCol <- filterBioDat[cellName]
        ##reassembles data after filtering
        cellXY <- xyFromCell(siteRaster, spSiteCol$cellName)
        spSiteCol <- cbind(spSiteCol, cellXY)
        bioData <- merge(spSiteCol, byCell, by=cellName)
        
        ##removes unneeded columns
        inDataTable <- byCell[-c(1)]
      }else{
        ##compress by cell
        cellName <- which(names(castData)=="cellName")
        byCell <- aggregate(castData, castData[cellName], FUN=sum)
        ##remove redunancy
        byCell <- byCell[-2]
        
        ##filters species data
        sppDat <- byCell[-c(cellName)]
        headDat <- byCell[c(cellName)]
        sppDat[sppDat>=1] <- 1
        sppTotals <- cbind(headDat, rowSums(sppDat))
        ##filters out data
        filterBioDat <- subset(sppTotals, sppTotals["rowSums(sppDat)"] >= sppFilter)
        spSiteCol <- filterBioDat[cellName]
        ##reassembles data after filtering
        cellXY <- xyFromCell(siteRaster, spSiteCol$cellName)
        spSiteCol <- cbind(spSiteCol, cellXY)
        bioData <- merge(spSiteCol, byCell, by=cellName)
        
        ##transforms data to binary
        byCell <- byCell[-c(1)]
        byCell[byCell>=1] <- 1
        inDataTable <- byCell
      } 
    }else{
      ##if environmental data is a table
      ##remove coords if needed
      newDataTable <- bioData[-c(x, y)]
      
      ##renames required columns
      names(newDataTable)[names(newDataTable)==siteColumn] <- "siteUltimateCoolness"
      names(newDataTable)[names(newDataTable)==sppColumn] <- "spcodeUltimateCoolness"
      castData <- dcast(newDataTable, siteUltimateCoolness~spcodeUltimateCoolness, value.var=abundColumn)
      
      ##filters species data
      siteName <- which(names(castData)=="siteUltimateCoolness")
      sppDat <- castData[-c(siteName)]
      headDat <- castData[c(siteName)]
      sppDat[sppDat>=1] <- 1
      sppTotals <- cbind(headDat, rowSums(sppDat))
      ##filters out data
      filterBioDat <- subset(sppTotals, sppTotals["rowSums(sppDat)"] >= sppFilter)
      spSiteCol <- filterBioDat[siteName]
      ##reassembles data after filtering
      bioSiteCol <- which(names(bioData)==siteColumn)
      holdData <- bioData[c(bioSiteCol,x,y)]
      holdData <- aggregate(holdData, holdData[1], FUN=mean)
      holdData <- holdData[-1]
      siteXY <- merge(spSiteCol, holdData, by.x="siteUltimateCoolness", by.y=siteColumn, all=F)
      bioData <- merge(siteXY, castData, by="siteUltimateCoolness")
      
      predData <- merge(spSiteCol, predData, by.x="siteUltimateCoolness", by.y=siteColumn)
      predSite <- which(names(predData)=="siteUltimateCoolness")
      predData <- predData[-predSite]
      
      inDataTable <- bioData[-c(1,2,3)]
    }
    XColumn <- "x"
    YColumn <- "y"
    ##creates distance matrix
    inDataTable[is.na(inDataTable)] <- 0
    inDataTable <- inDataTable[apply(inDataTable, 1, function(x){sum(x)>=sppFilter}),]
    if(abundance==F){
      distData <- vegdist(inDataTable, dist, binary=T)
    }else{
      distData <- vegdist(inDataTable, dist, binary=F)
    }
    ########################################################################
    
    ##species data as site-site distance matrix
  }else if(bioFormat==3){
    ##site-site distance already calculated
    #castData = bioData
    distData <- lower.tri(as.matrix(bioData), diag=FALSE)
    distData <- as.vector(bioData[distData])
    ########################################################################
    
    ##site pair table, already preped 
  }else if(bioFormat==4){
    ##site-pair distance value
    outTable <- bioData
    ########################################################################
    
  }else{
    ##return error, bioFormat argument out of bounds
    stop(paste("bioFormat varable of '", as.character(bioFormat), "' is not an accepted input value", sep=""))
  }
  ########################################################################
  
  ##With the dissim distance calculated, creates and fills the table in gdm format
  if(bioFormat!=4){
    ##creates base site-pair table
    outTable <- as.data.frame(createsitepair(dist=distData, spdata=bioData, envInfo=predData, dXCol=XColumn, dYCol=YColumn, siteCol=siteColumn, weightsType=weightType, custWeights=custWeightVect))
  }else{
    outTable <- bioData
  }
  
  ##first checks that the size of the dissimularity matrices, if any were provided
  ##then pastes any dissimularity matrices onto the created site-pair table
  if(length(distPreds)>0){
    baseMat <- distPreds[[1]]
    ##checks to size of each dissimularity matrix, to make sure they are all the same
    lapply(distPreds, function(mat, mat1){if(length(mat1)!=length(mat)){stop("The dimensions of your predictions matrices are not the same size.")}}, mat1=baseMat)
    
    ##checks the size of the dissimularity matrices against the size of distData
    baseMatDat <- lower.tri(as.matrix(baseMat),diag=FALSE)
    baseMatDat <- as.vector(baseMat[baseMatDat])
    if(nrow(outTable)!=length(baseMatDat)){
      stop("The dimensions of your prediction matrices do not match the created distance data from the provided biological data")
    }
      
    #outTableHold = outTable
      
    for(num in 1:length(distPreds)){
      #num <- 2
      ##isolate matrix
      matrixDat <- lower.tri(as.matrix(distPreds[[num]], diag=FALSE))
      sweetFreakenPredMatrix <- as.vector(distPreds[[num]][matrixDat])
      ##add matrix to table
      if(ncol(outTable)>6){
        ##break table up to insert matrix data columns
        baseSitePair <- outTable[,1:6]
        otherSitePair <- outTable[,7:ncol(outTable)]
        otherNames <- colnames(otherSitePair)
        s1SitePair <- as.data.frame(otherSitePair[,1:(ncol(otherSitePair)/2)])
        colnames(s1SitePair) <- otherNames[1:(ncol(otherSitePair)/2)]
        s2SitePair <- as.data.frame(otherSitePair[,(ncol(otherSitePair)/2+1):ncol(otherSitePair)])
        colnames(s2SitePair) <- otherNames[(ncol(otherSitePair)/2+1):ncol(otherSitePair)]
        ##formats data from dissimularity matrices 
        s1Zeros <- as.data.frame(rep(0,length(sweetFreakenPredMatrix)))
        colnames(s1Zeros) <- paste("s1.matrix_", num, sep="")
        s2Mat <- as.data.frame(sweetFreakenPredMatrix)
        colnames(s2Mat) <- paste("s2.matrix_", num, sep="")
        ##restructures the output table
        outTable <- cbind(baseSitePair, s1SitePair, s1Zeros, s2SitePair, s2Mat)
      }else{
        ##formats data from dissimularity matrices 
        s1Zeros <- as.data.frame(rep(0,length(sweetFreakenPredMatrix)))
        colnames(s1Zeros) <- paste("s1.matrix_", num, sep="")
        s2Mat <- as.data.frame(sweetFreakenPredMatrix)
        colnames(s2Mat) <- paste("s2.matrix_", num, sep="")
        ##restructures the output table
        outTable <- cbind(outTable, s1Zeros, s2Mat)
      }
    }
  }
  
  ##if maxPairs has a numeric value, then randomly samples the site-pair table for that number of site-pairs
  if(is.null(maxPairs)==FALSE){
    randRows <- sample(1:nrow(outTable), maxPairs)
    outTable <- outTable[c(randRows),]
  }
  
  ##return output table
  class(outTable) <- c("gdmData", "data.frame")
  return(outTable)
}
##########################################################################


##########################################################################
createsitepair <- function(dist, spdata, envInfo, dXCol, dYCol, siteCol, weightsType, custWeights){
  ##Used in formatGDMData to transform data from a site-site distance matrix into
  ##a site pair format
  ##Input Variables:
  ##dist = the distance object representing the pair-wise dissimularity of the species data
  ##envInfo = environmental data
  ##spdata = needed as may have X,Y coords
  ##dXCol = x coordinate
  ##dYCol = y coordinate
  ##siteCol = site column, taken from either the species or environmental tables
  ##predMatrices = Prediction matrices to be added to the site-pair table
  ##weightsType = the method of determining the site-pair weights
  ##custWeights = custom wieghts, as a vector, if given
  ##Output Variables:
  ##gdmTableFill = the complete site-pair table
  ###########################
  #dist = distData
  #spdata = bioData
  #envInfo = predData
  #dXCol = XColumn
  #dYCol = YColumn
  #siteCol = siteColumn
  #weightsType = weightType
  #custWeights = custWeightVect
  ###########################
  ##required libraries
  #require(raster)
  
  ##Create gdm ready table
  weightsType <- as.character(weightsType)
  distance <- as.vector(dist)
  ##calculates richness total, the sums of the two most populus sites
  if(weightsType[1]=="rich"){
    sppOnly <- spdata[-c(1,2,3)]
    sppSums <- rowSums(sppOnly)
    sppSiteSums <- cbind(spdata[1], sppSums)
    orderedSums <- sppSiteSums[order(-sppSiteSums[,2]),]
    richTotal <- orderedSums[1,2]+orderedSums[2,2]
  }
  
  ##Builds index needed for output gdm table format
  xCoord.S1 <- yCoord.S1 <- xCoord.S2 <- yCoord.S2 <- NULL
  s1 <- s2 <- NULL
  
  ##get coordinates for each site
  if(class(envInfo)=="RasterStack" | class(envInfo)=="RasterLayer" | class(envInfo)=="RasterBrick"){
    #print("in raster option")
    ##collects xy data from the original species table
    rlocs <- spdata[c(dXCol, dYCol)]
    getCell <- as.data.frame(cellFromXY(envInfo[[1]], rlocs))
    names(getCell)[names(getCell)=="cellFromXY(envInfo[[1]], rlocs)"] <- "site"
    
    ##get information from raster
    rastXY <- xyFromCell(envInfo[[1]], unique(getCell$site))
    exData <- as.data.frame(extract(envInfo, rastXY))
    if(nlayers(envInfo)==1){
      colnames(exData) <- names(envInfo)
    }
    creatRastDat <- cbind(rastXY, exData)
    
    ##Builds index needed for output gdm table format
    count <- seq((nrow(as.matrix(creatRastDat))-1),1,-1)
    s1 <- unlist(sapply(seq(length(count),1), function(y){c(s1, rep((max(count)-y)+1, times=y))}))
    s2 <- unlist(sapply(seq(length(count),1), function(y){c(s2, (max(count)-y+2):(max(count)+1))}))
    
    ##create gdm table and weights
    if(weightsType[1]=="def"){
      weights <- rep(1, times=length(distance))
    }else if(weightsType[1]=="cust"){
      weights <- custWeights
    }else{
      weights <- (sppSiteSums[s1, "sppSums"] + sppSiteSums[s2, "sppSums"]) / richTotal
    }
    gdmTable <- cbind(distance, weights)
    
    ##from environmental table, copy coordinates
    xCoord.S1 <- creatRastDat[s1, "x"]
    xCoord.S2 <- creatRastDat[s2, "x"]
    yCoord.S1 <- creatRastDat[s1, "y"]
    yCoord.S2 = creatRastDat[s2, "y"]
    
    if(length(xCoord.S1)!=nrow(gdmTable)){
      stop("error with number of records of distances and records of coordinates")
    }
    
    ##sets up output table
    gdmForm <- cbind(gdmTable, xCoord.S1, yCoord.S1, xCoord.S2, yCoord.S2)
    
    ##fills output table
    gdmTableFill <- as.data.frame(cbind(gdmForm, exData[s1,1:ncol(exData)], exData[s2,1:ncol(exData)]))
    names.s1 <- paste("s1.",names(as.data.frame(exData)[1:ncol(exData)]), sep="")
    names.s2 <- paste("s2.",names(as.data.frame(exData)[1:ncol(exData)]), sep="")
    names(gdmTableFill) <- c(names(gdmTableFill)[1:6], names.s1, names.s2)
    
  }else{
    #print("in table option")
    ##Builds index needed for output gdm table format
    count <- seq((nrow(as.matrix(envInfo))-1),1,-1)
    s1 <- unlist(sapply(seq(length(count),1), function(y){c(s1, rep((max(count)-y)+1, times=y))}))
    s2 <- unlist(sapply(seq(length(count),1), function(y){c(s2, (max(count)-y+2):(max(count)+1))}))
    
    if(length(s1)!=length(distance)){
      stop("error with number of records of distances and records of coordinates")
    }
    
    ##create gdm table and weights
    ##Error caused here, not sure why yet, indexing?????
    #if(weightsT[1]=="rich"){
    #  weights = (sppSiteSums[s1, "sppSums"] + sppSiteSums[s2, "sppSums"]) / richTotal
    #}else if(weightsT[1]=="cust"){
    #  weights = custWeights
    #}else{
    #  weights <- rep(1, times=length(distance))
    #}
    if(weightsType[1]=="def"){
      weights <- rep(1, times=length(distance))
    }else if(weightsType[1]=="cust"){
      weights <- custWeights
    }else{
      weights <- (sppSiteSums[s1, "sppSums"] + sppSiteSums[s2, "sppSums"]) / richTotal
    }
    gdmTable <- cbind(distance, weights)
    
    ##from environmental table, copy coordinates
    xCoord.S1 <- envInfo[s1, dXCol]
    xCoord.S2 <- envInfo[s2, dXCol]
    yCoord.S1 <- envInfo[s1, dYCol]
    yCoord.S2 = envInfo[s2, dYCol]
    
    ##sets up output table
    gdmForm <- cbind(gdmTable, xCoord.S1, yCoord.S1, xCoord.S2, yCoord.S2)
    xhold <- which(names(envInfo)==dXCol)
    yhold <- which(names(envInfo)==dYCol)
    sitehold <- which(names(envInfo)==siteCol)
    envInfo <- envInfo[-c(xhold, yhold, sitehold)]
    
    ##fills output table
    
    if(ncol(envInfo)>0){
      gdmTableFill <- cbind(gdmForm, envInfo[s1,1:ncol(envInfo)], envInfo[s2,1:ncol(envInfo)])
      names.s1 <- paste("s1.",names(envInfo[1:ncol(envInfo)]), sep="")
      names.s2 <- paste("s2.",names(envInfo[1:ncol(envInfo)]), sep="")
      colnames(gdmTableFill) <- c(colnames(gdmTableFill)[1:6], names.s1, names.s2)
    }else{
      gdmTableFill <- gdmForm
    }
  }
  
  ##returns results
  return(gdmTableFill)
}
##########################################################################