R/oxy-SSNtools.R

Defines functions GlobalFlatteningRatio LocalFlatteningRatio Kfullfillment NDScanMatrix NDScanManhattan NDScanKNearest NDScanRadius edgeScanMatrix edgeScanManhattan edgeScanKNearest edgeScanRadius data processEdge processNode

Documented in data edgeScanKNearest edgeScanManhattan edgeScanMatrix edgeScanRadius GlobalFlatteningRatio Kfullfillment LocalFlatteningRatio NDScanKNearest NDScanManhattan NDScanMatrix NDScanRadius processEdge processNode

#------------------------------------------#
#---------------Main Functions ------------#
#------------------------------------------#

#' @title processNode
#' @description process user_nodes to correct format for hotspot detection algorithm
#' @param data user_nodes, which should be a R dataframe that contains a column of node label, a column of node longitude, and a column of node latitude
#' @param label_name column name (string) that contains the node labels
#' @param lon_name column name (float) that contains the node longitude 
#' @param lat_name column name (float) that contains the node latitude
#' @param bipartite_name (optional) column name (binary) that contains which set the node is in if available. 1 represents the set that the heat is visualized on.
#' @return a list of named lists, in which the name for each list is the node label. The sublist contains all info of a node
#' @details DETAILS
#' @examples 
#' \dontrun{
#' nodes = processNode(user_nodes, 'label', 'lon', 'lat')
#' nodes[[1]] 
#' nodes[['YOUR NODE LABEL']][['lon']]
#' }
#' @rdname processNode
#' @export 
processNode = function(data, label_name, lon_name, lat_name, bipartite_name = NA) {
  #convert columns to the right data format
  names(data)[names(data) == label_name] <- "label"
  names(data)[names(data) == lon_name] <- "lon"
  names(data)[names(data) == lat_name] <- "lat"
  data$label = as.character(data$label)
  data$lon = as.numeric(as.character(data$lon))
  data$lat = as.numeric(as.character(data$lat))
  
  if(!is.na(bipartite_name)) {
    names(data)[names(data) == bipartite_name] <- "bipartite"
    data$bipartite = as.numeric(as.character(data$bipartite))
    #order so that bipartite = 1 is on top
    data = data[order(data$bipartite, decreasing=T),]
  }
  
  #convert data frame to a list of named lists
  # Apply the process_row function to each row of the data frame
  nodes <- lapply(split(data, seq(nrow(data))), processNodeHelper, bipartite_name)
  
  # Merge the lists using do.call
  nodes <- do.call(c, nodes)
  
  # Remove the prefixes added by the split() function
  names(nodes) <- gsub("^[0-9]+\\.", "", names(nodes))
  
  #old version with for loop
  #convert data frame to a list of named lists
  # data2 = as.list(data)
  # nodes = list()
  # 
  # for (i in 1:nrow(data)) { 
  #   temp = list()
  #   label = as.character(data2$label[i])
  #   if(is.na(bipartite_name)) {
  #     node = list('label' = label, 'lon' = data2$lon[i], 'lat'= data2$lat[i])
  #   } else {
  #     if(is.null(data2$bipartite[i])) {
  #       stop('Node bipartite value is not available. Please check if node table contains bipartite values and if the name of the bipartite column is provided in the processNode function')
  #     }
  #     node = list('label' = label, 'lon' = data2$lon[i], 'lat'= data2$lat[i], 'bipartite' = data2$bipartite[i])
  #   }
  #   
  #   temp[[label]] <- node #this is the only way to assign key by variable name
  #   nodes = append(nodes, temp)
  # }
  return(nodes)
}

#data: R dataframe 
#source_name: a string that indicates the column with the source node label 
#target_name: a string that indicates the column with the target node label
#' @title processEdge
#' @description convert a R dataframe with two columns (Source name and Target name) to correct formats for following algorithms
#' @param data user_edges, which should be a R dataframe that contains a column of source (string of node label) and a column of target (string of node label)
#' @param source_name column name (string) that contains the source node labels
#' @param target_name column name (string) that contains the target node labels
#' @param weight_name (optional) column name (float) that contains edge weight if available
#' @return a list of lists. Each sublist contains all info of an edge
#' @details DETAILS
#' @examples 
#' \dontrun{
#' edges = processEdge(user_edges, 'Source', 'Target')
#' }
#' @rdname processEdge
#' @export 
processEdge = function(data, source_name, target_name, weight_name = NA) {
  #convert columns to the right data format
  names(data)[names(data) == source_name] <- "Source"
  names(data)[names(data) == target_name] <- "Target"
  edges <- lapply(split(data, seq_len(nrow(data))), processEdgeHelper, source_name, target_name, weight_name)
  return(edges)
}

#' @title data
#' @description load a built-in dataset
#' @param data the name of the built-in dataset
#' @return the built-in dataset
#' @details DETAILS
#' #' @examples 
#' \dontrun{
#' data(NYCMafiaNodes) 
#'  }
data = function(data) {
  return(data)
}

#' @title EdgeScanRadius
#' @description edgeScan - generates heatmap of edges within maxRadius of every node in a graph
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param radius radius in the unit of coordinates of search window
#' @param min (optional) minimum number of nodes in the searching window
#' @param weighted (optional) boolean value of whether a weighted column has been included.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' edgeScanRadius(nodes, edges, 500) 
#'  }
#' @rdname edgeScanRadius
#' @export 
edgeScanRadius = function(nodes, edges, radius, min=3, weighted=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  
  labels = c()
  numedges = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    numNodesInRadius = numberNodesWithinRadius(nodes, nodes[[i]], radius, bipartite)
    
    if (numNodesInRadius < min) {
      labels = c(labels, nodes[[i]][['label']])
      numedges = c(numedges, NA)
    } else {
      numEdges = getNumEdgesInRange(nodes, edges, nodes[[i]], radius, weighted)
      labels = c(labels, nodes[[i]][['label']])
      numedges = c(numedges, numEdges)
    }
  }
  
  heat = data.frame('label' = labels, 'heat' = numedges)
  if(abs(nodes[[1]][['lat']]) <= 180) {
    warning("Distance may be calculated in the degree coordinates, which may need to be projected into other distance units")
  }
  
  source = c()
  target = c()
  weight = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    if(weighted) {weight = c(weight, edge[['Weight']])}
    if (euclidDistance(nodes[[edge[['Source']]]], nodes[[edge[['Target']]]]) < radius) {
      withinwindow = c(withinwindow, 1) 
    } else {
      withinwindow = c(withinwindow, 0) 
    }
  }
  
  if(weighted) {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'Weight' = weight, 'WithinWindow' = withinwindow)
  } else {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  }
  
  return(list(heat, edgeWithin))
}

#' @title edgeScanKNearest
#' @description calculate number of edges for each independent node in a graph in a range of k nearest nodes
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param k number of nodes in search window
#' @param weighted (optional) boolean value of whether a weighted column has been included.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details 
#' @examples 
#' \dontrun{
#' edgeScanKNearest(nodes, edges, 5)
#' }
#' @rdname edgeScanKNearest
#' @export 
edgeScanKNearest = function(nodes, edges, k, weighted=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  
  labels = c()
  numedges = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    kNearest = nearestNeighbors(nodes, nodes[[i]], k, bipartite) 
    rad = 0 
    for (node in kNearest) {
      if (euclidDistance(node, nodes[[i]]) > rad) {
        rad = euclidDistance(node, nodes[[i]])
      }
    }
    numEdges = getNumEdgesInRange(nodes, edges, nodes[[i]], rad, weighted)
    
    labels = c(labels, nodes[[i]][['label']])
    numedges = c(numedges, numEdges)
  }
  
  heat = data.frame('label' = labels, 'heat' = numedges)
  
  source = c()
  target = c()
  weight = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    if(weighted) {weight = c(weight, edge[['Weight']])}
    #bipartite assumes source node is the one with heat values
    if(bipartite) {
      if (edge[['Target']] %in% sapply(nearestNeighbors(nodes, nodes[[edge[['Source']]]], k, bipartite), function(x) x[['label']])) {
        withinwindow = c(withinwindow, 1) 
      } else {
        withinwindow = c(withinwindow, 0) 
      }
    } else {
      if (edge[['Target']] %in% sapply(nearestNeighbors(nodes, nodes[[edge[['Source']]]], k, bipartite), function(x) x[['label']]) | 
          edge[['Source']] %in% sapply(nearestNeighbors(nodes, nodes[[edge[['Target']]]], k, bipartite), function(x) x[['label']])) {
        withinwindow = c(withinwindow, 1) 
      } else {
        withinwindow = c(withinwindow, 0) 
      }
    }
    
  }
  
  if(weighted) {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'Weight' = weight, 'WithinWindow' = withinwindow)
  } else {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  }
  return(list(heat, edgeWithin))
}

#' @title edgeScanManhattan
#' @description calculate number of edges for each independent node in a graph in a range of manhattan distance
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param radius radius in the unit of coordinates of search window (Manhattan distance)
#' @param min (optional) minimum number of nodes in the searching window
#' @param weighted (optional) boolean value of whether a weighted column has been included.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#' }
#' @rdname edgeScanManhattan
#' @export 
edgeScanManhattan = function(nodes, edges, radius, min=3, weighted=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  
  labels = c()
  numedges = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    numNodesInManhattanDistance = numberNodesWithinManhattanDistance(nodes, nodes[[i]], radius, bipartite)
    
    if (numNodesInManhattanDistance < min) {
      labels = c(labels, nodes[[i]][['label']])
      numedges = c(numedges, NA)
    } else {
      numEdges = numberEdgesWithinManhattanDistance(nodes, edges, nodes[[i]], radius, weighted)
      labels = c(labels, nodes[[i]][['label']])
      numedges = c(numedges, numEdges)
    }
  }
  
  heat = data.frame('label' = labels, 'heat' = numedges)
  if(abs(nodes[[1]][['lat']]) <= 180) {
    warning("Distance may be calculated in the degree coordinates, which may need to be projected into other distance units")
  }
  
  source = c()
  target = c()
  weight = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    if(weighted) {weight = c(weight, edge[['Weight']])}
    if (ManhattanDistance(nodes[[edge[['Source']]]], nodes[[edge[['Target']]]]) < radius) {
      withinwindow = c(withinwindow, 1) 
    } else {
      withinwindow = c(withinwindow, 0) 
    }
  }
  
  if(weighted) {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'Weight' = weight, 'WithinWindow' = withinwindow)
  } else {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  }
  
  return(list(heat, edgeWithin))
}

#' @title edgeScanMatrix
#' @description calculate number of edges per node in a given threshold from a user-defined matrix.
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param thres threshold (e.g., distance, travel time) to calculate the number of edges, given the user-defined matrix
#' @param matrix a user-defined full matrix, including all pairs of nodes. The matrix's column and row names are consistent with nodes' labels. The cell values can be distance, travel time and so on. 
#' @param min (optional) minimum number of nodes in the searching window
#' @param weighted (optional) boolean value of whether a weighted column has been included.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#' }
#' @rdname edgeScanMatrix
#' @export 
edgeScanMatrix = function(nodes, edges, thres, matrix, min=3, weighted=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  if(!inherits(matrix, "matrix")) {
    stop('Your matrix input is not recognized as a matrix in R. Please check R matrix formats and make sure you have row and column names for the matrix.')
  }
  if(!is.null(nodes[[1]][['bipartite']]) & !bipartite) {
    stop('Your data has a bipartite column, but your bipartite argument is set to FALSE. Please set your bipartite argument to TRUE')
  }
  
  labels = c()
  numedges = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
    #calculate the number of nodes with bipartite == 1
    bipartite_num = sum(as.numeric(unlist(nodes)[grepl(pattern='bipartite', names(unlist(nodes)))]))
    bipartite_trans_matrix = matrix
    #assign node pairs in the same set with values of 0
    bipartite_trans_matrix[1:bipartite_num, 1:bipartite_num] <- NA
    bipartite_trans_matrix[(bipartite_num+1):length(nodes), (bipartite_num+1):length(nodes)] <- NA
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    NodesInMatrix = NodesWithinMatrixThres(nodes[[i]][['label']], thres, matrix) #NodesInMatrix includes POI and centroids
    if(bipartite) {
      CentroidsInMatrix = NodesWithinMatrixThres(nodes[[i]][['label']], thres, bipartite_trans_matrix) #CentroidsInMatrix only includes centroids
      numNodesInMatrix = length(CentroidsInMatrix)  
    } else {
      numNodesInMatrix = length(NodesInMatrix)  
    }
    if (numNodesInMatrix < min) {
      labels = c(labels, nodes[[i]][['label']])
      numedges = c(numedges, NA)
    } else {
      numEdges = getNumEdgesInMatrix(nodes[[i]][['label']], names(NodesInMatrix), edges, matrix, thres, weighted)
      labels = c(labels, nodes[[i]][['label']])
      numedges = c(numedges, numEdges)
    }
  }
  
  heat = data.frame('label' = labels, 'heat' = numedges)
  if(abs(nodes[[1]][['lat']]) <= 180) {
    warning("Distance may be calculated in the degree coordinates, which may need to be projected into other distance units")
  }
  
  source = c()
  target = c()
  weight = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    if(weighted) {weight = c(weight, edge[['Weight']])}
    
    if(bipartite) {
      nameWithinMatrix=names(NodesWithinMatrixThres(edge[['Source']], thres, bipartite_trans_matrix))
    } else {
      nameWithinMatrix=names(NodesWithinMatrixThres(edge[['Source']], thres, matrix))
    }
    
    if (edge[['Target']] %in% nameWithinMatrix) {
      withinwindow = c(withinwindow, 1) 
    } else {
      withinwindow = c(withinwindow, 0) 
    }
  }
  
  if(weighted) {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'Weight' = weight, 'WithinWindow' = withinwindow)
  } else {
    edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  }
  
  return(list(heat, edgeWithin))
}

# Below are the three implementations of this, using radius, K-nearest, and Manhattan distance metrics.
#' @title NDScanRadius
#' @description FUNCTION_DESCRIPTION
#' @param nodes PARAM_DESCRIPTION
#' @param edges PARAM_DESCRIPTION
#' @param radius PARAM_DESCRIPTION
#' @param min PARAM_DESCRIPTION
#' @param directed (optional) boolean value of whether network density should be calculated as a directed graph.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @rdname NDScanRadius
#' @export 
NDScanRadius = function(nodes, edges, radius, min=3, directed=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  
  labels = c()
  ndensity = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    numNodesInRadius = numberNodesWithinRadius(nodes, nodes[[i]], radius, bipartite)
    if (numNodesInRadius < min) {
      labels = c(labels, nodes[[i]][['label']])
      ndensity = c(ndensity, NA)
    } else {
      if(bipartite) {
        if(directed) {dir = 2} else {dir = 1}
        potential = numNodesInRadius * (numberNodesWithinRadius(nodes, nodes[[i]], radius, FALSE) - numNodesInRadius) * dir
      } else {
        if(directed) {dir = 1} else {dir = 2}
        potential = numNodesInRadius * (numNodesInRadius - 1)/dir
      }
      numEdges = getNumEdgesInRange(nodes, edges, nodes[[i]], radius, FALSE) #weighted = FALSE for network density
      nDensity = numEdges / potential 
      labels = c(labels, nodes[[i]][['label']])
      ndensity = c(ndensity, nDensity)
      if (nDensity > 1) {stop(paste0('Node', i, ' network density is greater than 1'))}
    }
  }
  
  heat = data.frame('label' = labels, 'heat' = ndensity)
  if(abs(nodes[[1]][['lat']]) <= 180) {
    warning("Distance may be calculated in the degree coordinates, which may need to be projected into other distance units")
  }
  
  source = c()
  target = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    if (euclidDistance(nodes[[edge[['Source']]]], nodes[[edge[['Target']]]]) < radius) {
      withinwindow = c(withinwindow, 1) 
    } else {
      withinwindow = c(withinwindow, 0) 
    }
  }
  
  edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  
  return(list(heat, edgeWithin))
}

#' @title NDScanKNearest
#' @description FUNCTION_DESCRIPTION
#' @param nodes PARAM_DESCRIPTION
#' @param edges PARAM_DESCRIPTION
#' @param k PARAM_DESCRIPTION
#' @param directed (optional) boolean value of whether network density should be calculated as a directed graph.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @rdname NDScanKNearest
#' @export 
NDScanKNearest = function(nodes, edges, k, directed=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  
  labels = c()
  ndensity = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    kNearest = nearestNeighbors(nodes, nodes[[i]], k, bipartite)
    rad = 0
    for (node in kNearest) {
      if (euclidDistance(nodes[[i]], node) > rad) {
        rad = euclidDistance(nodes[[i]], node)
      }
    }
    numNodesInRadius = numberNodesWithinRadius(nodes, nodes[[i]], rad, bipartite)
    if(bipartite) {
      if(directed) {dir = 2} else {dir = 1}
      potential = numNodesInRadius * (numberNodesWithinRadius(nodes, nodes[[i]], rad, FALSE) - numNodesInRadius) * dir
    } else {
      if(directed) {dir = 1} else {dir = 2}
      potential = numNodesInRadius * (numNodesInRadius - 1)/dir
    }
    numEdges = getNumEdgesInRange(nodes, edges, nodes[[i]], rad, FALSE) #weighted = FALSE for network density
    nDensity = numEdges / potential 
    labels = c(labels, nodes[[i]][['label']])
    ndensity = c(ndensity, nDensity)
    if (nDensity > 1) {stop(paste0('Node', i, ' network density is greater than 1'))}
  }
  
  heat = data.frame('label' = labels, 'heat' = ndensity)
  
  source = c()
  target = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    #bipartite assumes source node is the one with heat values
    if(bipartite) {
      if (edge[['Target']] %in% sapply(nearestNeighbors(nodes, nodes[[edge[['Source']]]], k, bipartite), function(x) x[['label']])) {
        withinwindow = c(withinwindow, 1) 
      } else {
        withinwindow = c(withinwindow, 0) 
      }
    } else {
      if (edge[['Target']] %in% sapply(nearestNeighbors(nodes, nodes[[edge[['Source']]]], k, bipartite), function(x) x[['label']]) | 
          edge[['Source']] %in% sapply(nearestNeighbors(nodes, nodes[[edge[['Target']]]], k, bipartite), function(x) x[['label']])) {
        withinwindow = c(withinwindow, 1) 
      } else {
        withinwindow = c(withinwindow, 0) 
      }
    }
    
  }
  
  edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  
  return(list(heat, edgeWithin))
}

#' @title NDScanManhattan
#' @description FUNCTION_DESCRIPTION
#' @param nodes PARAM_DESCRIPTION
#' @param edges PARAM_DESCRIPTION
#' @param radius PARAM_DESCRIPTION
#' @param min PARAM_DESCRIPTION
#' @param directed (optional) boolean value of whether network density should be calculated as a directed graph.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @rdname NDScanManhattan
#' @export 
NDScanManhattan = function(nodes, edges, radius, min=3, directed=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  
  labels = c()
  ndensity = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    numNodesInManhattanDistance = numberNodesWithinManhattanDistance(nodes, nodes[[i]], radius, bipartite)
    if (numNodesInManhattanDistance < min) {
      labels = c(labels, nodes[[i]][['label']])
      ndensity = c(ndensity, NA)
    } else {
      if(bipartite) {
        if(directed) {dir = 2} else {dir = 1}
        potential = numNodesInManhattanDistance * (numberNodesWithinManhattanDistance(nodes, nodes[[i]], radius, FALSE) - numNodesInManhattanDistance) * dir
      } else {
        if(directed) {dir = 1} else {dir = 2}
        potential = numNodesInManhattanDistance * (numNodesInManhattanDistance - 1)/dir
      }
      numEdges = numberEdgesWithinManhattanDistance(nodes, edges, nodes[[i]], radius, FALSE)
      nDensity = numEdges / potential 
      labels = c(labels, nodes[[i]][['label']])
      ndensity = c(ndensity, nDensity)
      if (nDensity > 1) {stop(paste0('Node', i, ' network density is greater than 1'))}
    }
  }
  
  heat = data.frame('label' = labels, 'heat' = ndensity)
  if(abs(nodes[[1]][['lat']]) <= 180) {
    warning("Distance may be calculated in the degree coordinates, which may need to be projected into other distance units")
  }
  source = c()
  target = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    if (ManhattanDistance(nodes[[edge[['Source']]]], nodes[[edge[['Target']]]]) < radius) {
      withinwindow = c(withinwindow, 1) 
    } else {
      withinwindow = c(withinwindow, 0) 
    }
  }
  
  edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  
  return(list(heat, edgeWithin))
}

#' @title NDScanMatrix
#' @description calculate network density per node in a given threshold (searching window) from a user-defined matrix.
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param thres threshold (e.g., distance, travel time) to calculate network density, given the user-defined matrix
#' @param matrix a user-defined full matrix, including all pairs of nodes. The matrix's column and row names are consistent with nodes' labels. The cell values can be distance, travel time and so on. 
#' @param min (optional) minimum number of nodes in the searching window
#' @param directed (optional) boolean value of whether the network is directed.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R datafrmae contains a column of node label, and a column of heat associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the edge is within the scanning window.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#' }
#' @rdname NDScanMatrix
#' @export 
NDScanMatrix = function(nodes, edges, thres, matrix, min=3, directed=FALSE, bipartite=FALSE) {
  if(!inherits(nodes, "list") | !inherits(edges, "list")) {
    stop('nodes or edges arguments only intake a list of lists. Please use processNode or processEdge functions to convert R dataframe to a list of lists')
  }
  if(!inherits(matrix, "matrix")) {
    stop('Your matrix input is not recognized as a matrix in R. Please check R matrix formats and make sure you have row and column names for the matrix.')
  }
  if(!is.null(nodes[[1]][['bipartite']]) & !bipartite) {
    stop('Your data has a bipartite column, but your bipartite argument is set to FALSE. Please set your bipartite argument to TRUE')
  }
  
  labels = c()
  ndensity = c()
  
  if(bipartite) {
    if (is.null(nodes[[1]][['bipartite']])) {
      stop('Node bipartite value is not available. Please check if node table contains a bipartite column and if the name of the bipartite column is provided in the processNode function')}
    #sort so that bipartite == 1 is on top.
    nodes = nodes[order(-sapply(nodes, function(x) x[['bipartite']]))] 
    stop = length(Filter(function(x) all((x <- x$bipartite == 1)), nodes))
    #calculate the number of nodes with bipartite == 1
    bipartite_num = sum(as.numeric(unlist(nodes)[grepl(pattern='bipartite', names(unlist(nodes)))]))
    bipartite_trans_matrix = matrix
    #assign node pairs in the same set with values of 0
    bipartite_trans_matrix[1:bipartite_num, 1:bipartite_num] <- NA
    bipartite_trans_matrix[(bipartite_num+1):length(nodes), (bipartite_num+1):length(nodes)] <- NA
  } else {
    stop = length(nodes)
  }
  
  for (i in seq(1, stop)) {
    NodesInMatrix = NodesWithinMatrixThres(nodes[[i]][['label']], thres, matrix) #NodesInMatrix includes POI and centroids
    if(bipartite) {
      CentroidsInMatrix = NodesWithinMatrixThres(nodes[[i]][['label']], thres, bipartite_trans_matrix) #CentroidsInMatrix only includes centroids
      numNodesInMatrix = length(CentroidsInMatrix)  
    } else {
      numNodesInMatrix = length(NodesInMatrix)  
    }
    if (numNodesInMatrix < min) {
      labels = c(labels, nodes[[i]][['label']])
      ndensity = c(ndensity, NA)
    } else {
      if(bipartite) {
        if(directed) {dir = 2} else {dir = 1}
        potential = numNodesInMatrix * (length(NodesInMatrix)+1 - numNodesInMatrix) * dir #plus 1 for including the self node
      } else {
        if(directed) {dir = 1} else {dir = 2}
        potential = numNodesInMatrix * (numNodesInMatrix - 1)/dir
      }
      numEdges = getNumEdgesInMatrix(nodes[[i]][['label']], names(NodesInMatrix), edges, matrix, thres, FALSE)
      nDensity = numEdges / potential 
      labels = c(labels, nodes[[i]][['label']])
      ndensity = c(ndensity, nDensity)
      if (nDensity > 1) {stop(paste0('Node', i, ' network density is greater than 1'))}
    }
  }
  
  heat = data.frame('label' = labels, 'heat' = ndensity)
  if(abs(nodes[[1]][['lat']]) <= 180) {
    warning("Distance may be calculated in the degree coordinates, which may need to be projected into other distance units")
  }
  
  source = c()
  target = c()
  withinwindow = c()
  
  for (edge in edges) {
    source = c(source, edge[['Source']])
    target = c(target, edge[['Target']])
    
    if(bipartite) {
      nameWithinMatrix=names(NodesWithinMatrixThres(edge[['Source']], thres, bipartite_trans_matrix))
    } else {
      nameWithinMatrix=names(NodesWithinMatrixThres(edge[['Source']], thres, matrix))
    }
    
    if (edge[['Target']] %in% nameWithinMatrix) {
      withinwindow = c(withinwindow, 1) 
    } else {
      withinwindow = c(withinwindow, 0) 
    }
  }
  
  edgeWithin = data.frame('Source' = source, 'Target' = target, 'WithinWindow' = withinwindow)
  
  return(list(heat, edgeWithin))
}


#' @title K-fullfillment
#' @description calculate K-fullfillment for nodes. K-fullfillment is defined as the number of a node’s k-nearest neighbors that it is connected to. k is equal to the node's degree. Nodes that are exclusively connected to their nearest neighbors will have a k-fulfillment value of 1
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param minK (optional) minimum k value (degree) for a node to have a meaningful K-fullfillment value. K=1 means the node only has one connection.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R dataframe contains a column of node label, and a column of K_fullfillment associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the source node is k-nearest neighbor to the target node and vice versa.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#' }
#' @rdname Kfullfillment
#' @export 
Kfullfillment = function(nodes, edges, minK=1, bipartite=FALSE) {
  
  if (bipartite) {
    #if bipartite, filter nodes to those that are in set 1
    nodes2 = nodes[sapply(nodes, function(node) node[['bipartite']] == 1)]
  } else {
    nodes2 = nodes
  }
  
  # add degree, connected_nodes, and Knn as attributes to each node
  nodes2 = lapply(nodes2, function(node) {
    return(Add_K_connected_nodes_knn_to_node(nodes, edges, node, bipartite))
  })
  
  # create K-fullfillment values
  kf = lapply(nodes2, function(node) {
    return(Kfullfillment_for_one_node(node, minK, bipartite))
  })
  
  if (bipartite) {
    #if bipartite, only need to check if Target is a k-nearest neighbor of the 'Source'
    edge_table = lapply(edges, function(edge) {
      source_knn = nodes2[[edge[['Source']]]][['Knn']]
      is_K_nearest_neighbor = as.integer(edge[['Target']] %in% source_knn)
      return(list(Source = edge[['Source']], Target = edge[['Target']], is_K_nearest_neighbor = is_K_nearest_neighbor))
    })
  } else {
    # The edge_table is created by iterating over each edge in the edges list and 
    # checking if the 'Source' node is a k-nearest neighbor of the 'Target' 
    # node or vice versa. If yes, the KNearestNeighbor value is set to 1; 
    # otherwise, it's set to 0.
    edge_table = lapply(edges, function(edge) {
      source_knn = nodes2[[edge[['Source']]]][['Knn']]
      target_knn = nodes2[[edge[['Target']]]][['Knn']]
      is_K_nearest_neighbor = as.integer(edge[['Target']] %in% source_knn | edge[['Source']] %in% target_knn)
      return(list(Source = edge[['Source']], Target = edge[['Target']], is_K_nearest_neighbor = is_K_nearest_neighbor))
    })
  }
  
  labels = lapply(nodes2, function(node) {
    return(node[['label']])
  })
  
  k_values = lapply(nodes2, function(node) {
    return(node[['K']])
  })
  
  node_table = data.frame('label' = unname(unlist(labels)), 'K' = unname(unlist(k_values)), 
                          'K_fullfillment' = unname(unlist(kf)))
  
  edge_table = do.call(rbind.data.frame, edge_table)
  
  return(list(node_table, edge_table))
}

#' @title Local Network Flattening Ration 
#' @description calculate local network flattening ratio for nodes. 
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param minK (optional) minimum k value (degree) for a node to have a meaningful local flattening ratio value. K=1 means the node only has one connection.
#' @param bipartite (optional) boolean value of whether the data is a bipartite network
#' @return a list of two dataframes. The first R dataframe contains a column of node label, and a column of Local_flattening_ratio associated with the node. The second R dataframe contains the edge pairs and a boolean column indicating whether the source node is k-nearest neighbor to the target node and vice versa.
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#' }
#' @rdname LocalFlatteningRatio
#' @export 
LocalFlatteningRatio = function(nodes, edges, minK=1, bipartite=FALSE) {
  if (bipartite) {
    #if bipartite, filter nodes to those that are in set 1
    nodes2 = nodes[sapply(nodes, function(node) node[['bipartite']] == 1)]
  } else {
    nodes2 = nodes
  }
  
  # add degree, connected_nodes, and Knn as attributes to each node
  nodes2 = lapply(nodes2, function(node) {
    return(Add_K_connected_nodes_knn_to_node(nodes, edges, node, bipartite))
  })
  
  # create Local Flattening Ratio values
  lfr = lapply(nodes2, function(node) {
    return(LocalFlatteningRatio_for_one_node(nodes, node, minK, bipartite))
  })
  
  if (bipartite) {
    #if bipartite, only need to check if Target is a k-nearest neighbor of the 'Source'
    edge_table = lapply(edges, function(edge) {
      source_knn = nodes2[[edge[['Source']]]][['Knn']]
      is_K_nearest_neighbor = as.integer(edge[['Target']] %in% source_knn)
      return(list(Source = edge[['Source']], Target = edge[['Target']], is_K_nearest_neighbor = is_K_nearest_neighbor))
    })
  } else {
    # The edge_table is created by iterating over each edge in the edges list and 
    # checking if the 'Source' node is a k-nearest neighbor of the 'Target' 
    # node or vice versa. If yes, the KNearestNeighbor value is set to 1; 
    # otherwise, it's set to 0.
    edge_table = lapply(edges, function(edge) {
      source_knn = nodes2[[edge[['Source']]]][['Knn']]
      target_knn = nodes2[[edge[['Target']]]][['Knn']]
      is_K_nearest_neighbor = as.integer(edge[['Target']] %in% source_knn | edge[['Source']] %in% target_knn)
      return(list(Source = edge[['Source']], Target = edge[['Target']], is_K_nearest_neighbor = is_K_nearest_neighbor))
    })
  }
  
  labels = lapply(nodes2, function(node) {
    return(node[['label']])
  })
  
  k_values = lapply(nodes2, function(node) {
    return(node[['K']])
  })
  
  node_table = data.frame('label' = unname(unlist(labels)), 'K' = unname(unlist(k_values)), 
                          'Local_flattening_ratio' = unname(unlist(lfr)))
  
  edge_table = do.call(rbind.data.frame, edge_table)
  
  return(list(node_table, edge_table))
}

#' @title Global Network Flattening Ration 
#' @description calculate global network flattening ratio for nodes. 
#' @param nodes nodes of graph (a list of named lists)
#' @param edges edges of graph (a list of lists)
#' @param iter number of iterations to shuffle the order of nodes for various configuration of G_bar
#' @return a numeric value that is the ratio of sum of distance between G_bar and G
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#' }
#' @rdname GlobalFlatteningRatio
#' @export 
GlobalFlatteningRatio = function(nodes, edges, iter) {
  # add K and Knn to each node
  nodes2 = lapply(nodes, function(node) {
    return(Add_K_connected_nodes_knn_to_node(nodes, edges, node, FALSE))
  })
  
  # generate iteration number of node orders 
  node_orders <- list()
  for (i in 1:iter) {
    node_orders[[i]] <- sample(names(nodes2))
  }
  
  # Precompute the distance matrix
  nodes_labels <- names(nodes2)
  n <- length(nodes_labels)
  distance_matrix <- matrix(NA, nrow = n, ncol = n, dimnames = list(nodes_labels, nodes_labels))
  
  for (i in seq_len(n)) {
    #skip diagonal values 
    for (j in seq_len(n)[-i]) {
      distance <- euclidDistance(nodes2[[i]], nodes2[[j]]) 
      # update both upper and lower side of the matrix since the network is undirected
      distance_matrix[i, j] <- distance
      distance_matrix[j, i] <- distance
    }
  }
  
  # Precompute the degree constraint matrix
  degree_constraint_matrix <- sapply(nodes2, function(x) x$K)
  
  # calculate average distance of G_bar under iterations. 
  avg_G_bar_sum = mean(sapply(node_orders, function(order) G_bar_sum_distances(order, nodes2, distance_matrix, degree_constraint_matrix)))
  G_sum = Sum_connected_nodes_distances(nodes2, distance_matrix)
  return(avg_G_bar_sum/G_sum)
}
friendlycities-gatech/SSNtools documentation built on Sept. 13, 2023, 10:40 a.m.