# R/AbForests_RemoveNets.R
# In Platypus: Single-Cell Immune Repertoire and Gene Expression Analysis
#
# Documented in AbForests_RemoveNets

#' Filter sub-repertoires with less than N unique sequences or with less than C unique cells

#' @description RemoveNets takes the output of SubRepertoires and performs the filtering of the 5 sub-repertoires. In particular, from these 5 sub-repertoires, networks with number of nodes or number of unique sequences below a specified threshold are eliminated.
#' @param list a list of 5 sub-lists of data.frames. Each sub-list corresponds to the set of networks, in which a majority isotype is specified. list[[1]] or list$list_IGHG contains the networks, in data.frame format, with more IGG isotypes, list[[2]] or list$list_IGHA contains the networks, in data.frame format, with more IGA isotypes, list[[3]] or list$list_IGHM contains the networks, in data.frame format, with more IGM isotypes, list[[4]] or list$list_IGAG contains the networks, in data.frame format, with a tie in IGA and IGG isotypes and list[[5]] or list$list_other contains the networks, in data.frame format, with other isotypes apart from the aforementioned combinations. In each sub-list, each data.frame represents a clone lineage and contains 2 columns, one that describes the antibody sequences and the other which type of information (isotype) is considered in the analysis. This list of data.frames has been generated by SubRepertoires function based on the initial data input and user's preferences.
#' @param opt a string with options "isotype" and "cluster". The option "isotype" is utilized when the user desires to do an isotype analysis, while the selection of "cluster" denotes that an analysis based on transcriptome is requested.
#' @param distance_mat a custom integer distance matrix, or NULL for using the default distance matrix (calculated based on the Levenshtein distance, which counts the number of mutations between sequences). Given the phylogenetic tree, a custom-made distance matrix can be produced by PlyloToMatrix function.
#' @param tie_flag  a string, with options 'rand', 'full', 'close_to_germ', 'far_from_germ', 'close_path_to_germ', 'far_path_from_germ','most_expanded' and 'least_expanded' for removing edges when equal distance (tie) in distance matrix.
#' 'rand' means random pruning in one of nodes, 'full' means keeping all nodes, close_to_germ means pruning of node(s) farthest from germline (based on number of intermediate nodes), 'far_from_germ' means pruning of node(s) closest to germline (based on number of intermediate nodes),
#' 'close_path_to_germ' means pruning of node(s) farthest from germline (based on edge path length), 'far_path_from_germ' means pruning of node(s) closest to germline (based on edge path length), 'most_expanded' means pruning of node(s) with the lowest B cell count (clonal frequency) and 'least_expanded' means pruning of node(s) with the highest B cell count (clonal frequency). In cases of subsequent ties, a random node is selected.
#' @param weight  logical variable. When its value is FALSE, then the weights of outgoing edges from Germline node are set to 1. When its value is TRUE, the weights are set to the difference between the number of mutations among sequences in germline and connected nodes(value in the corresponding distance matrix) and the absolute value of the difference between the sequence lengths of germline and corresponding connected nodes. In both cases, weights of remaining edges are extracted from the distance matrix.
#' @param N the threshold of unique sequences below which networks are removed.
#' @param C the threshold of unique cells below which networks are removed.
#' @param random.seed a random seed, specified by the user, when random sampling of sequences happens in each of the cases described in tie_flag argument.
#' @param alg_opt a string denoting the version of the edge selection algorithm used in the construction of networks. "two-step" selects the advanced (two-step) version; any other value selects the naive version.
#' @return  list a nested list of 5 sub-lists of data.frames. Each sub-list corresponds to the reduced set of networks according to thresholds N and C, in which a majority isotype is specified. list[[1]] or list$list_IGHG contains the networks, in data.frame format, with more IGG isotypes, list[[2]] or list$list_IGHA contains the networks, in data.frame format, with more IGA isotypes, list[[3]] or list$list_IGHM contains the networks, in data.frame format, with more IGM isotypes, list[[4]] or list$list_IGAG contains the networks, in data.frame format, with a tie in IGA and IGG isotypes and list[[5]] or list$list_other contains the networks, in data.frame format, with other isotypes apart from the aforementioned combinations.
#' @export
#' @seealso SubRepertoires, SubRepertoiresByUniqueSeq, PlyloToMatrix, AntibodyForest
#' @examples
#' \dontrun{
#' AbForests_RemoveNets(list,opt="cluster",distance_mat=NULL,
#' tie_flag='close_to_germ',weight=TRUE,N=4,C=NULL,random.seed=165,
#' alg_opt="two-step")
#'}


AbForests_RemoveNets <- function(list, opt, distance_mat, tie_flag, weight, N, C, random.seed, alg_opt) {

  # Decide whether one clone-lineage data.frame `k` survives the filters.
  # Returns `k` unchanged when it is kept and NULL when it is dropped; the
  # NULLs (and any other zero-length entries) are stripped from every
  # sub-list after the nested lapply() below.
  keep_network <- function(k) {
    # Empty entries are dropped.
    if (length(k) == 0) {
      return(NULL)
    }

    # Cell threshold: compare the number of entries in `Seq` against C.
    # This check is cheap, so it runs before any network is built.
    if (!is.null(C) && length(k$Seq) < C) {
      return(NULL)
    }

    # The network is only needed to count nodes for the N threshold, so
    # skip building it entirely when N is not supplied.
    if (is.null(N)) {
      return(k)
    }

    # Second input of the preprocessing step: the isotype column for an
    # isotype analysis, otherwise the cluster column.
    feature <- if (opt == "isotype") k$isotype else k$cluster
    pre_data <- .PREPROCESSING(k$Seq, feature, distance_mat, NULL)

    # Build the edge/node lists with the requested edge selection algorithm.
    # The helpers are defined elsewhere in the package.
    if (alg_opt == "two-step") {
      edgeList <- .FIND_EDGES_v1(pre_data$dist_mat, pre_data$countData)
      edgeList <- .DELETE_MULTIPLE_PARENTS(
        edgeList$node_list, edgeList$edges_final, pre_data$dist_mat,
        pre_data$countData, tie_flag, weight, random.seed
      )
    } else {
      # Row(s) of countData in which any cell contains the text "Germline".
      germline.index <- which(
        apply(pre_data$countData, 1, function(r) any(grepl("Germline", r)))
      )
      adj_mat <- .ADJ_MAT(pre_data$dist_mat)
      edgeList <- .EDGES(pre_data$dist_mat, adj_mat, germline.index, weight, random.seed)
    }

    # One row of "Ratio" columns per node of the network.
    ratio_cols <- grep("Ratio", colnames(pre_data$countData), value = TRUE)
    node_values <- lapply(
      edgeList$node_list,
      function(x) unlist(pre_data$countData[x, ratio_cols])
    )
    node_table <- do.call("rbind", node_values)

    # rbind() yields NULL when there is nothing to bind (no nodes or no
    # "Ratio" columns). nrow(NULL) is NULL and `if (NULL < N)` would stop
    # with "argument is of length zero", so count that case as zero nodes.
    n_nodes <- if (is.null(node_table)) 0L else nrow(node_table)
    if (n_nodes < N) {
      return(NULL)
    }

    k
  }

  # Apply the filter to every data.frame of every sub-list, then drop the
  # zero-length results so only the retained networks remain.
  list <- lapply(list, lapply, keep_network)
  list <- lapply(list, function(x) Filter(length, x))

  # When the nesting depth is not above 3, wrap each sub-list in one more
  # list level. `list(z)` still resolves to base::list because R skips
  # non-function bindings when looking up a function being called.
  # NOTE(review): presumably this keeps the nesting shape downstream code
  # expects when no data.frame survives - confirm against callers.
  # NOTE(review): purrr::vec_depth() is deprecated since purrr 1.0.0 in
  # favour of purrr::pluck_depth(); switch once the package's minimum
  # purrr version allows it.
  if (!(purrr::vec_depth(list) > 3)) {
    list <- lapply(list, function(z) list(z))
  }

  return(list)
}