R/get_layers_values.R

Defines functions get_groups_based_on_annotation get_new_colors_based_on_groups get_groups_based_on_clustering get_gene_ID_based_on_tooltip get_groups_based_on_gene_ID get_colors_based_on_gene_ID get_colors_based_on_size get_colors_based_on_strains get_group_names_based_on_n_nodes get_group_names_based_on_tooltip

Documented in get_colors_based_on_gene_ID get_colors_based_on_size get_colors_based_on_strains get_gene_ID_based_on_tooltip get_group_names_based_on_n_nodes get_group_names_based_on_tooltip get_groups_based_on_annotation get_groups_based_on_clustering get_groups_based_on_gene_ID get_new_colors_based_on_groups

############################################################################################
####################### get layers values based on several info  ####################################
############################################################################################

#' get group names based on annotation
#' 
#'function to get the group names based on the annotation of the nodes (tooltip column)
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @return Object of class "dataframe", which reports the old names associated with the new names
#' @export
#' @examples 
#' \dontrun{get_group_names_based_on_tooltip(data)}
#' @import ff
get_group_names_based_on_tooltip<-function(data){
  all_groups<-unique(as.character(data$nodes$group[])) # All groups
  # analyze tooltip of each group
  new_groups_names<-vapply(1:length(all_groups),function(i){
    current_group_name<-all_groups[i]
    inds<-which(as.character(data$nodes$group[])==current_group_name)
    tooltips_current_group<-as.character(data$nodes$title[])[inds]
    n_nodes<-nrow(data$nodes[inds,])
    if(length(inds)==1){
      n_nodes<-1
    }
    list_words<-lapply(1:length(tooltips_current_group),function(j){
      name<-tooltips_current_group[j]
      splitted_name<-strsplit(name," ")
      return(splitted_name)
    })
    vector_words<-unlist(list_words)
    # clean the vector of words, removing not meaningful words
    vector_words<-vector_words[vector_words != ""] # remove empty characters
    vec_words_to_remove<-c("family|protein|subunit|transporter|regulator|transcriptional|domain-containing|of|type")
    inds<-grep(vec_words_to_remove,vector_words)
    vector_words<-vector_words[-inds] 
    # measure frequency of the words in the current group
    table_counts<-table(vector_words)
    table_counts<-sort(table_counts,decreasing = T)
    selected_keywords<-names(table_counts)
    # the final name of the group contains the 4 most common keywords
    final_name<-paste0(selected_keywords[1:4],collapse = "_")
    # we replace commas with dots in the final name (commas are reserved characters)
    final_name<-gsub(",",".",final_name)
    final_name<-sprintf("%s_n%s_group%d",final_name,n_nodes,i)
    return(final_name)
  },character(1))
  df_new_groups_names<-as.data.frame(cbind(all_groups,new_groups_names))
  # check correctness of the operation
  output<- length(all_groups) == nrow(df_new_groups_names)
  if(output!=T){
    stop("Number of predicted group names different from the number of original group")
  }
  return(df_new_groups_names)
}

#' get the group names based on the number of nodes
#' 
#'function to get the group names based on the number of nodes in each group.
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @return Object of class "dataframe", which reports the old names associated with the new names
#' @export
#' @examples \dontrun{get_group_names_based_on_n_nodes(data)}
#' @import ff
get_group_names_based_on_n_nodes<-function(data){
  all_groups<-unique(as.character(data$nodes$group[])) # All groups
  # analyze each group
  new_groups_names<-vapply(1:length(all_groups),function(i){
    current_group_name<-all_groups[i]
    inds<-which(as.character(data$nodes$group[])==current_group_name)
    n_nodes<-nrow(data$nodes[inds,])
    if(length(inds)==1){
      n_nodes<-1
    }
    final_name<-sprintf("n%d_group%d",n_nodes,i)
    return(final_name)
  },character(1))
  df_new_groups_names<-as.data.frame(cbind(all_groups,new_groups_names))
  # check correctness of the operation
  output<- length(all_groups) == nrow(df_new_groups_names)
  if(output!=T){
    stop("length(all_groups) == nrow(df_new_groups_names)")
  }
  return(df_new_groups_names)
}



#' get new colors based on the strains
#' 
#' function to predict the colors based on the strains occurrence of genes 
#' i.e.,show nodes/genes associated with the selected strains (i.e., that gene is present in that strains)

#' @param input_binary_df Object of class "dataframe", containing presence/absence of genes. Mandatory argument.
#' @param data visnetwork dataframe generated by the geNet algorithm. Mandatory argument.
#' @param strains_names the vector containing the names of the strains to visualize. Mandatory argument.
#' @param col_nodes Color of the nodes matched with the strains. Default to "black".
#' @return Object of class "dataframe", which reports the new colors for each node of the network
#' @export
#' @examples 
#' \dontrun{get_colors_based_on_strains(input_binary_df,data,strains_names=c("strain_x"),col_nodes="black")}
#' @import ff
get_colors_based_on_strains<-function(input_binary_df,data,strains_names,col_nodes="black"){
  splitted_strains_names<-strsplit(strains_names,",")[[1]]
  string<-paste0(splitted_strains_names,sep = "$",collapse = "|")
  inds<-grep(string,row.names(input_binary_df))
  if(length(inds)!=0){
    stop("selected strains not found")
  }
  input_binary_df_selected_strains<-input_binary_df[inds,]
  print("------ presence indices extraction------" )
  list_all_inds<-lapply(1:nrow(input_binary_df_selected_strains),function(i){
    vec_i<-input_binary_df_selected_strains[i,]
    inds_1<-which(vec_i==1)
    return(inds_1)
  })
  common_inds<-Reduce(intersect, list_all_inds)
  if(length(common_inds)==0){
    stop("the selected strains don't have any common nodes")
  }
  id_reference_common<-colnames(input_binary_df_selected_strains)[common_inds]
  out<-as.character(data$nodes$id[]) %in% id_reference_common  
  inds<-which(out==T)
  genes_id_selected<-as.character(data$nodes$id[])[inds]
  vec_colors<-rep(col_nodes,length(genes_id_selected))
  data_new_colors<-as.data.frame(cbind(genes_id_selected,vec_colors))
  return(data_new_colors)
}
#' get_colors_based_on_size
#' 
#' function to get the colors based on the size of the nodes
#' 
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @param size_thr size threshold. Mandatory argument.
#' @param direction values greater/less  than (or equal) the selected threshold
#' @param col_nodes color of the nodes that match the specified condition. Default to "black".
#' @return data_new_colors: object of class "dataframe", which reports the new colors for each node of the network
#' @export
#' @examples 
#' \dontrun{get_colors_based_on_size(data,size_thr=10,direction="less",col_nodes="black")}
#' @import ff
get_colors_based_on_size<-function(data,size_thr,direction="less",col_nodes="black"){
  if(direction=="less"){
    inds<-which(data$nodes$size[]<=size_thr)
  }else if(direction=="greater"){
    inds<-which(data$nodes$size[]>=size_thr)
  }else{
    stop("direction argument not valid, please specify a correct direction argument: greater,less")
  }
  genes_id_selected<-as.character(data$nodes$id[])[inds]
  vec_colors<-rep(col_nodes,length(genes_id_selected))
  data_new_colors<-as.data.frame(cbind(genes_id_selected,vec_colors))
  return(data_new_colors)
}
#' get_colors_based_on_gene_ID
#' 
#' function to get the colors based on the ID of the nodes
#' 
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @param vec_gene_ids vector of genes IDs to color. Mandatory argument.
#' @param col_nodes color of the nodes that match the specified IDs. Default to "black".
#' @return Object of class "dataframe", which reports the new colors for each node of the network
#' @export
#' @examples 
#' \dontrun{get_colors_based_on_gene_ID(data,vec_gene_ids="None",col_nodes="black")}
#' @import ff
get_colors_based_on_gene_ID<-function(data,vec_gene_ids="None",col_nodes="black"){
  string<-paste0(vec_gene_ids,collapse = "|")
  inds<-grep(string,as.character(data$nodes$id[]))
  if(length(inds)==0){
    stop("selected IDs not found in the visnetwork data object")
  }
  genes_id_selected<-as.character(data$nodes$id[])[inds]
  vec_colors<-rep(col_nodes,length(genes_id_selected))
  data_new_colors<-as.data.frame(cbind(genes_id_selected,vec_colors))
  return(data_new_colors)
}

#' get_groups_based_on_gene_ID
#' 
#' function to get the groups based on the ID of the nodes
#' 
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @param vec_gene_ids vector of genes IDs to which the groups must be matched. Mandatory argument.
#' @return Object of class "dataframe", which reports the groups for the specified genes ID.
#' @export
#' @examples 
#' \dontrun{get_groups_based_on_gene_ID(data,vec_gene_ids=c("group_200","group_253"))
#' # Get the groups for the nodes IDs "group_200" and "group_253"
#' }
#' @import ff
get_groups_based_on_gene_ID<-function(data,vec_gene_ids){
  string<-paste0(vec_gene_ids,collapse = "|")
  inds<-grep(string,as.character(data$nodes$id[]))
  if(length(inds)==0){
    warning("selected id not found")
    return(NULL)
  }
  groups<-as.character(data$nodes$group[])[inds]
  gene_ids<-as.character(data$nodes$id[])[inds]
  df<-as.data.frame(cbind(gene_ids,groups))
  return(df)
}
#' get_gene_ID_based_on_tooltip
#' 
#' function to get the gene IDs based on the annotation (tooltip column)
#' 
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @param vec_tooltip vector of annotation to match with the gene IDs. Mandatory argument.
#' @return Object of class "dataframe", which reports the genes ID for the specified annotation keywords.
#' @export
#' @examples 
#' \dontrun{get_gene_ID_based_on_tooltip(data,vec_tooltip=c("protein","secretion"))
#' # Get the nodes IDs whose annotation reports the word "protein" and "secretion"
#' }
#' @import ff
get_gene_ID_based_on_tooltip<-function(data,vec_tooltip){
  string<-paste0(vec_tooltip,collapse = "|")
  inds<-grep(string,as.character(data$nodes$title[]))
  if(length(inds)==0){
    stop("selected tooltip not found")
  }
  gene_ids<-as.character(data$nodes$id[])[inds]
  tooltip<-as.character(data$nodes$title[])[inds]
  df<-as.data.frame(cbind(gene_ids,tooltip))
  return(df)
}


#' get_groups_based_on_clustering
#' 
#' function to get the groups based on a clustering algorithm
#' Note: the color of the nodes is based on the group (using the function get_new_colors_based_on_groups())
#' to evaluate the clustering method we use the modularity measure.
#' The modularity of a graph with respect to some division (or vertex types) measures how good the division is, 
#' or how separated are the different vertex types from each other.
#' @param igraph_network igraph network generated by the gen_igraph_network() function. Mandatory argument.
#' @param method clustering method:
#' * louvain: predict clusters using the Louvain algorithm. 
#' The "modularity" is the objective function to maximize. Small clusters may be hidden. It detects only the bigger clusters.
#' * infomap: predict clusters using the Infomap algorithm.
#' The "map equation" is the objective function to minimize. Compared to the Louvain algorithm, it is able to identify smaller clusters (subclusters)
#' * fastgreedy: predicts clusters using the Fastgreedy algorithm
#' * walktrap: predicts clusters using the Walktrap algorithm
#' 
#' Default to "infomap" to detect smaller clusters. The Louvain algorithm is faster than the Infomap algorithm in large networks.
#' @return Object of class "dataframe", which reports the genes ID associated with the new groups
#' @export
#' @examples 
#' \dontrun{get_groups_based_on_clustering(igraph_network,method="infomap")}
#' @import ff igraph
get_groups_based_on_clustering<-function(igraph_network,method="infomap"){
  if(method=="louvain"){
    print("----- generating louvain partitions-----")
    lc <- cluster_louvain(igraph_network)
    partition<-as.vector(membership(lc))
    nodes_id<-vertex_attr(igraph_network,name="name")
    # tot_n_clusters<-length(unique(partition))
    df_groups<-data.frame(nodes_id,partition)
  }else if(method=="infomap"){
    print("--- generating infomap partitions")
    lc <- cluster_infomap(igraph_network)
    partition<-as.vector(membership(lc))
    nodes_id<-vertex_attr(igraph_network,name="name")
    # tot_n_clusters<-length(unique(partition))
    df_groups<-data.frame(nodes_id,partition)
  }else if(method=="fastgreedy"){
    print("--- generating fastgreedy partitions ----")
    lc <- cluster_fast_greedy(igraph_network)
    partition<-as.vector(membership(lc))
  }else if(method=="walktrap"){
    print("--- generating walktrap partitions ------")
    lc <- cluster_walktrap(igraph_network)
    partition<-as.vector(membership(lc))
    nodes_id<-vertex_attr(igraph_network,name="name")
    # tot_n_clusters<-length(unique(partition))
    df_groups<-data.frame(nodes_id,partition)
  }else{
    stop("input clustering method not valid")
  }
  print("---- generate colors based on groups -------")
  gc()
  df_groups<-get_new_colors_based_on_groups(df_groups=df_groups)
  gc()
  print("Done")
  return(df_groups)
}

#' get_new_colors_based_on_groups
#' 
#' function to get the colors based on the groups
#' @param df_groups object of class "dataframe", which reports the genes ID associated with the groups. Mandatory argument.
#' @return Object of class "dataframe", which reports the gene IDs and the groups associated with the colors
#' @export
#' @examples 
#' \dontrun{get_new_colors_based_on_groups(df_groups)}
#' @import RColorBrewer grDevices
get_new_colors_based_on_groups<-function(df_groups){
  numb_clusters<-length(unique(df_groups[,2]))
  # we generate vec of unique colors
  qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
  col_vector <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
  col_vector <- colorRampPalette(col_vector)(numb_clusters)
  # extract random colors
  unique_groups<-unique(df_groups[,2])
  rand_int<-sample.int(length(col_vector),length(unique_groups))
  colors_selected<-col_vector[rand_int]
  df_temp<-as.data.frame(cbind(unique_groups,colors_selected))
  df_temp$unique_groups<-as.character(df_temp$unique_groups)
  final_col_vec<-rep("a",nrow(df_groups))
  gc()
  # assign colors to the clusters
  for(i in 1:nrow(df_temp)){
    current_group<-df_temp$unique_groups[i]
    current_color<-df_temp$colors_selected[i]
    inds<-which(df_groups[,2]==current_group)
    final_col_vec[inds]<-current_color
  }
  df_groups_colors<-cbind(df_groups,final_col_vec)
  return(df_groups_colors)
}
#' get groups based on the annotation
#' 
#' function to get the groups based on the annotation (tooltip column)
#' @param data List of two ffdf objects (format generated by the geNet() function )
#' @param select_annotation vector of annotation words to match with the groups
#' @return Object of class "vector", which reports the groups selected
#' @export
#' @examples 
#' \dontrun{get_groups_based_on_tooltip(data,select_tooltip=c("protein","secretion"))
#' # Get the groups where the annotations words "protein" and "secretion" appear.
#' }
#' @import ff
get_groups_based_on_annotation<-function(data,select_annotation){
  string<-paste0(select_annotation,collapse = "|")
  inds<-grep(string,as.character(data$nodes$title[]))
  if(length(inds)==0){
    groups_selected<-"No groups with the selected annotation"
  }else{
    groups_selected<-unique(as.character(data$nodes$group[])[inds])
  }
  return(groups_selected)
}
haneylab/geNet documentation built on Oct. 4, 2020, 8:40 a.m.