R/main.R

Defines functions get_final_module get_Pvalue_S get_candidate_module get_mutual_module get_local_network MI get_mut_status

Documented in get_candidate_module get_final_module get_local_network get_mut_status get_mutual_module get_Pvalue_S MI

#' @title Converts MAF file into mutation matrix.
#' @description The function 'get_mut_status' uses to convert MAF file into mutation matrix.
#' @param mutvariant A nx3 data frame of patients' somatic mutation data,the first line is gene symbol,the second line is sample ID and the third line is mutation classification.
#' @param nonsynonymous Logical, tell if extract the non-synonymous somatic mutations (nonsense mutation, missense mutation, frame-shif indels, splice site, nonstop mutation, translation start site, inframe indels).
#' @return A binary mutations matrix, in which 1 represents that a particular gene has mutated in a particular sample, and 0 represents that gene has no mutation in a particular sample.
#' @importFrom utils read.delim
#' @export
#' @examples
#' \donttest{maf<-system.file("extdata","maffile.maf",package = "ProgModule")
#' maf_data<-read.delim(maf)
#' mutvariant<-maf_data[,c("Hugo_Symbol","Tumor_Sample_Barcode","Variant_Classification")]
#' #perform the function `get_mut_status`.
#' mut_status.example<-get_mut_status(mutvariant,nonsynonymous = TRUE)}

get_mut_status<-function(mutvariant,nonsynonymous = TRUE){
  if(nonsynonymous){
    mafmut<-mutvariant[which(
      mutvariant$Variant_Classification == "Missense_Mutation" |
        mutvariant$Variant_Classification == "Frame_Shift_Del" |
        mutvariant$Variant_Classification == "Frame_Shift_Ins" |
        mutvariant$Variant_Classification == "In_Frame_Del" |
        mutvariant$Variant_Classification == "Nonsense_Mutation" |
        mutvariant$Variant_Classification == "In_Frame_Ins" |
        mutvariant$Variant_Classification == "Splice_Site" |
        mutvariant$Variant_Classification == "Nonstop_Mutation" |
        mutvariant$Variant_Classification == "Translation_Start_Site"
    ),]
  }else{
    mafmut<-mutvariant
  }
  mut_status<-matrix(data=0,nrow=length(unique(mafmut[,1])),ncol=length(unique(mafmut[,2])))
  colnames(mut_status)<-unique(mafmut[,2])
  rownames(mut_status)<-unique(mafmut[,1])
  for(i in 1:dim(mut_status)[2]){
    un_sname<-table(mafmut[which(mafmut[,2]%in%colnames(mut_status)[i]),1])
    mut_status[match(names(un_sname),rownames(mut_status)),i]<-un_sname
  }
  mut_status[mut_status>1]<-1
  return(mut_status)
}

#' @title Calculate mutual information.
#' @description The function `MI` is used to calculate the mutual information score between samples' survival status and mutation status.
#' @param mylist1 Is input a list of samples' survival status.
#' @param mylist2 Is input a list of samples' mutation status.
#' @return The mutual information score
#' @importFrom infotheo entropy
#' @importFrom utils read.delim
#' @export
#' @examples
#' \donttest{#load the data
#' data(mut_status)
#' sur<-system.file("extdata","sur.csv",package = "ProgModule")
#' sur<-read.delim(sur,sep=",",header=TRUE,row.names = 1)
#' #perform the function 'MI'
#' mut_matrix<-MI(mylist1 = as.numeric(mut_status[1,]),mylist2 = sur[,1])}
MI<-function(mylist1,mylist2){
  return(entropy(mylist1)+entropy(mylist2)-entropy(cbind(mylist1,mylist2)))
}



#' @title Extract the local networks from the PPI network.
#' @description The function `get_local_network` is used to search local network of each gene by breadth-first algorithm.
#' @param network The PPI network.
#' @param freq_matrix The mutations matrix,generated by `get_mut_status`.
#' @param max.size The size of maximum connected local network,default is 500.
#' @return local nerwork.
#' @importFrom igraph get.vertex.attribute
#' @importFrom igraph bfs
#' @export
#' @examples
#' \donttest{#load the data
#' data(mut_status)
#' data(subnet)
#' #perform the function `get_local_network`.
#' localnetwork.example<-get_local_network(network=subnet,freq_matrix=mut_status,max.size=500)}

get_local_network<-function(network,freq_matrix,max.size=500){
  node<-get.vertex.attribute(network)[[1]]
  inter<-intersect(node,rownames(freq_matrix))
  node<-inter
  freq_matrix<-freq_matrix[inter,]
  re<-list()
  for(i in 1:length(node)){
    ne<-bfs(network,node[i],order = TRUE,  unreachable = FALSE,dist=TRUE,mode= "all")
    m=0
    l=0
    r1<-c()
    while(l<length(ne$dist)){
      l<-l+length(which(ne$dist==m))
      if(l>max.size){
        break
      }
      r1<-c(r1,names(which(ne$dist==m)))
      m=m+1
    }
    re[[i]]<-r1
    names(re)[[i]]<-node[i]
  }
  return(re)

}

#' @title Extract the mutually exclusive module.
#' @description The function `get_mutual_module` is used to determine if neighbor genes should be added to the module by calculating the score.
#' @param module The Original modular gene set.
#' @param net The local network extracted from PPI network.
#' @param freq_matrix The mutations matrix,generated by `get_mut_status`.
#' @param sur A nx2 data frame of samples' survival data,the first line is samples' survival event and the second line is samples' overall survival.
#' @param module_sig A label for whether the module is a risk factor or a protective factor for survival.
#' @param univarCox_result The result of Cox univariate analysis,generated by `get_univarCox_result`.
#' @param rate The rate of increase in score,default is 0.05.
#' @return The mutually exclusive module.
#' @importFrom igraph neighbors
#' @importFrom utils read.delim
#' @export
#' @examples
#' \donttest{#load the data
#' data(mut_status)
#' sur<-system.file("extdata","sur.csv",package = "ProgModule")
#' sur<-read.delim(sur,sep=",",header=TRUE,row.names = 1)
#' data(net)
#' data(module)
#' data(univarCox_result)
#' #perform the function `get_mutual_module`.
#' mutuallyexclusivemodule.example<-get_mutual_module(module,net,freq_matrix=mut_status,sur,
#' module_sig="risk",univarCox_result,rate=0.05)}

get_mutual_module<-function(module,net,freq_matrix,sur,module_sig,univarCox_result,rate){
  inter<-intersect(colnames(freq_matrix),rownames(sur))
  if(length(inter)==0){
    stop("Plese input the same patients' mutation and survival data!")
  }else{
    freq_matrix<-freq_matrix[,inter]
    sur<-sur[inter,]
  }
  nodes<-get.vertex.attribute(net)$name
  freq_matrix<-freq_matrix[intersect(rownames(freq_matrix),nodes),]
  sample_mut<-colSums(freq_matrix)
  if(length(module)==1){
    MI_score<-MI(mylist1 = as.numeric(freq_matrix[module,]),mylist2 = sur[,1])
    CovEs<-length(intersect(names(which(sample_mut==1)),names(freq_matrix[module,])[which(freq_matrix[module,]==1)]))/length(which(freq_matrix[module,]==1))
    S_score<-MI_score*CovEs*(length(which(freq_matrix[module,]!=0))/length(freq_matrix[module,]))
  }else{
    CovEs<-c()
    module_matrix<-freq_matrix[module,]
    for(i in 1:length(module)){
      CovE1<-length(intersect(names(which(sample_mut==1)),names(freq_matrix[module[i],])[which(freq_matrix[module[i],]==1)]))/length(which(freq_matrix[module[i],]==1))
      CovEs<-c(CovEs,CovE1)
    }
    module_mut<-colSums(module_matrix)
    module_mut[which(module_mut>0)]<-1
    MI_score<-MI(mylist1 = as.numeric(module_mut),mylist2 = sur[,1])
    CovE_score<-mean(CovEs)
    S_score<-MI_score*CovE_score*(length(which(colSums(freq_matrix[module,])!=0))/dim(freq_matrix)[2])
  }
  if(S_score==0){
    final_s_score<-0
    diff_s_score<-0
    diff_s_score_rate<-0
  }else{
    neig<-names(neighbors(net,module))
    neig<-setdiff(neig,module)

    if(length(neig)!=0){
      univarCox_result_tem<-univarCox_result[match(neig,names(univarCox_result))]
      if(module_sig=="favor"){
        neig<-intersect(neig,names(univarCox_result_tem[which(univarCox_result_tem<1)]))
      }
      if(module_sig=="risk"){
        neig<-intersect(neig,names(univarCox_result_tem[which(univarCox_result_tem>1)]))

      }
    }

    if(length(neig)!=0){
      S_score1<-c()
      for(i in 1:length(neig)){
        new_module_mut<-colSums(freq_matrix[c(module,neig[i]),])
        new_module_mut[which(new_module_mut>0)]<-1
        m1<-MI(mylist1 = as.numeric(new_module_mut),mylist2 = sur[,1])
        cove1<-length(intersect(names(which(sample_mut==1)),names(freq_matrix[neig[i],])[which(freq_matrix[neig[i],]==1)]))/length(which(freq_matrix[neig[i],]==1))
        S_score1[i]<-m1*mean(c(CovEs,cove1))*(length(which(colSums(freq_matrix[c(module,neig[i]),])!=0))/dim(freq_matrix)[2])
      }
      if((max(S_score1)-S_score)/S_score>rate){
        module<-c(module,neig[which.max(S_score1)])
        diff_s_score<-max(S_score1)-S_score
        diff_s_score_rate<-(max(S_score1)-S_score)/S_score
        final_s_score<-max(S_score1)
      }else{
        final_s_score<-S_score
        diff_s_score<-0
        diff_s_score_rate<-0
      }
    }else{
      final_s_score<-S_score
      diff_s_score<-0
      diff_s_score_rate<-0
    }}
  return(list(module=module,final_s_score=final_s_score,diff_s_score=diff_s_score,diff_s_score_rate=diff_s_score_rate))
}

#' @title Get candidate module.
#' @description The function `get_candidate_module` is used to search candidate module of each local network using greedy algorithm.
#' @param local_network The local networks,generated by `get_local_network`.
#' @param network The maximum connected subnet,extracted by mapping all mutated genes to the PPI network.
#' @param freq_matrix The mutations matrix,generated by `get_mut_status`.
#' @param sur A nx2 data frame of samples' survival data,the first line is samples' survival event and the second line is samples' overall survival.
#' @param seed The canonical drivers from NCG database, which use as the starting node of the greedy algorithm.
#' @param max.size The maximum size of the module,default is 200.
#' @param rate The rate of increase in score,default is 0.05.
#' @return candidate module.
#' @importFrom igraph delete_vertices
#' @importFrom igraph decompose
#' @importFrom igraph simplify
#' @importFrom utils read.delim
#' @importFrom utils read.table
#' @export
#' @examples
#' \donttest{#load the data
#' data(local_network)
#' data(mut_status)
#' data(subnet)
#' canonical_drivers<-system.file("extdata","canonical_drivers.txt",package = "ProgModule")
#' seed_gene<-read.table(canonical_drivers,header=FALSE)
#' sur<-system.file("extdata","sur.csv",package = "ProgModule")
#' sur<-read.delim(sur,sep=",",header=TRUE,row.names=1)
#' #perform the function `get_candidate_module`.
#' candidatemodule.example<-get_candidate_module(local_network=local_network,network=subnet,
#' freq_matrix=mut_status,sur=sur,seed=seed_gene[,1],max.size=200,rate=0.05)}

get_candidate_module<-function(local_network,network,freq_matrix,sur,seed,max.size,rate){
  if(!is.null(seed)){
    local_network<-local_network[unlist(lapply(seed,function(x){which(names(local_network)==x)}))]
  }
  univarCox_result<-suppressWarnings(get_univarCox_result(freq_matrix=freq_matrix,sur))
  if(sum(is.na(univarCox_result))!=0){
    univarCox_result<-univarCox_result[-which(univarCox_result%in%NA)]
  }
  module_set<-list()
  final_score<-c()
  final_diff_s_score<-c()
  final_diff_s_score_rate<-c()
  for(i in 1:length(local_network)){
    module<-names(local_network[i])
    net<-delete_vertices(network, setdiff(names(network[[]]),local_network[[i]]))#local_network[[i]]$name
    net<-simplify(decompose(net)[[1]])
    diff_s_score_rate<-1
    final_s_score<-0
    diff_s_score<-0
    if(length(which(names(univarCox_result)%in%module))!=0){
      if(univarCox_result[which(names(univarCox_result)%in%module)]!=1){
        if(univarCox_result[which(names(univarCox_result)%in%module)]<1){module_sig="favor"}
        if(univarCox_result[which(names(univarCox_result)%in%module)]>1){module_sig="risk"}
        while(diff_s_score_rate > rate & length(module)<max.size & length(net)>1){
          get_module_res<-get_mutual_module(module=module,net=net,freq_matrix=freq_matrix,sur=sur,module_sig=module_sig,univarCox_result=univarCox_result,rate)
          if(length(module)==length(get_module_res$module)){
            module<-get_module_res$module
            final_s_score<- get_module_res$final_s_score
            diff_s_score<-get_module_res$diff_s_score
            diff_s_score_rate<-get_module_res$diff_s_score_rate
          }else{
            module<-unique(c(module,get_module_res$module))
            final_s_score<- get_module_res$final_s_score
            diff_s_score<-get_module_res$diff_s_score
            diff_s_score_rate<-get_module_res$diff_s_score_rate
          }
        }
      }
    }
    module_set[[i]]<-module
    final_score[i]<-final_s_score
    final_diff_s_score[i]<-diff_s_score
    final_diff_s_score_rate [i]<-diff_s_score_rate
  }
  return(list(module_set=module_set,final_score=final_score,final_diff_s_score=final_diff_s_score,final_diff_s_score_rate=final_diff_s_score_rate))
}

#' @title Get perturbed p-value.
#' @description The function `get_Pvalue_S` is used to calculate the perturbed p-value.
#' @param local_network The local network gene sets,generated by `get_local_network`.
#' @param freq_matrix The mutations matrix,generated by `get_mut_status`.
#' @param sur A nx2 data frame of samples' survival data,the first line is samples' survival event and the second line is samples' overall survival.
#' @param module The candidate module,generated by `get_candidate_module`.
#' @param perms The perturbation number,default is 1000.
#' @return Perturbed p-value.
#' @importFrom utils read.delim
#' @export
#' @examples
#' \donttest{#load the data
#' data(local_network)
#' data(mut_status)
#' data(candidate_module)
#' sur<-system.file("extdata","sur.csv",package = "ProgModule")
#' sur<-read.delim(sur,sep=",",header=TRUE,row.names = 1)
#' #perform the function `get_Pvalue_S`.
#' turbulence.example<-get_Pvalue_S(module=candidate_module,freq_matrix=mut_status,
#' sur=sur,perms=100,local_network)}

get_Pvalue_S<-function(module,freq_matrix,sur,perms=1000,local_network){
  pvalue<-c()
  for(i in 1:length(module$module_set)){
    lo<-which(names(local_network)%in%module$module_set[[i]][1])
    if(length(local_network[[lo]])==1){
      sample_mut<-freq_matrix[local_network[[lo]],]
    }else{
      sample_mut<-colSums(freq_matrix[local_network[[lo]],])
    }
    freq_matrix_part<-freq_matrix[module$module_set[[i]],]
    S_score<-c()
    for(j in 1:perms){
      a<-freq_matrix_part
      if(length(module$module_set[[i]])==1){a<-as.data.frame(a)}
      colnames(a)<-sample(colnames(freq_matrix_part))
      inter<-intersect(colnames(a),rownames(sur))
      a<-a[,inter]
      sur<-sur[inter,]
      if(dim(a)[1]==1){
        MI_score<-MI(mylist1 = as.numeric(a),mylist2 = sur[,1])
        CovE_score<-length(intersect(names(which(sample_mut==1)),names(a)[which(a==1)]))/length(which(a==1))
        res<-MI_score*CovE_score*(length(which(a!=0))/length(a))
      }else{
        CovEs<-c()
        for(n in 1:dim(a)[1]){
          CovE1<-length(intersect(names(which(sample_mut==1)),names(a[n,])[which(a[n,]==1)]))/length(which(a[n,]==1))
          CovEs<-c(CovEs,CovE1)
        }
        module_mut<-colSums(a)
        module_mut[which(module_mut>0)]<-1
        MI_score<-MI(mylist1 = as.numeric(module_mut),mylist2 = sur[,1])
        CovE_score<-mean(CovEs)
        res<-MI_score*CovE_score*(length(which(colSums(a)!=0))/dim(a)[2])
      }
      S_score[j]<-res
    }
    if(length(which(S_score>module$final_score[[i]]))==0){
      pvalue[i]<-1e-16
    }else{
      pvalue[i]<-length(which(S_score>module$final_score[[i]]))/perms
    }
    names(pvalue)[i]<-module$module_set[[i]][1]
  }
  return(pvalue)
}

#' @title Get final module.
#' @description The function `get_final_module` is used to identify the final module.
#' @param index A index file of PPI networks,downloaded from http://compbio-research.cs.brown.edu/pancancer/hotnet2/.
#' @param edge The edge lists of PPI networks,downloaded from http://compbio-research.cs.brown.edu/pancancer/hotnet2/.
#' @param mut_status The mutations matrix,generated by `get_mut_status`.
#' @param sur A nx2 data frame of samples' survival data,the first line is samples' survival event and the second line is samples' overall survival.
#' @param seed The canonical drivers from NCG database, which use as the starting node of the greedy algorithm.
#' @param cutoff The perturbed p-value cutoff point,default is 0.05.
#' @param max.size.candidate The maximum size of the candidate module,default is 200.
#' @param max.size.local The size of maximum connected local network,default is 500.
#' @param rate The rate of increase in score,default is 0.05.
#' @param perm The perturbation number,default is 1000.
#' @return The final module.
#' @importFrom igraph graph_from_data_frame
#' @importFrom igraph delete_vertices
#' @importFrom igraph decompose
#' @importFrom igraph simplify
#' @importFrom igraph V
#' @importFrom stats p.adjust
#' @importFrom utils read.delim
#' @importFrom utils read.table
#' @export
#' @examples
#' \donttest{#load the data
#' indexdata<-system.file("extdata","hint+hi2012_index_file.txt",package="ProgModule")
#' index<-read.table(indexdata,sep="\t",header=FALSE)
#' edgedata<-system.file("extdata","hint+hi2012_edge_file.txt",package="ProgModule")
#' edge<-read.table(edgedata,sep="\t",header=FALSE)
#' data(mut_status)
#' sur<-system.file("extdata","sur.csv",package ="ProgModule")
#' sur<-read.delim(sur,sep=",",header=TRUE,row.names=1)
#' canonical_drivers<-system.file("extdata","canonical_drivers.txt",package="ProgModule")
#' seed_gene<-read.table(canonical_drivers,header=FALSE)
#' #perform the function `get_final_module`.
#' finalmodule.example<-get_final_module(index,edge,mut_status,sur,seed=seed_gene,
#' cutoff=0.05,max.size.local=500,max.size.candidate=200,rate=0.05,perm=100)}

get_final_module<-function(index,edge,mut_status,sur,seed,cutoff=0.05,max.size.local=500,max.size.candidate=200,rate=0.05,perm=1000){
  edge<-edge[,1:2]
  edge[,1]<-index[match(edge[,1],index[,1]),2]
  edge[,2]<-index[match(edge[,2],index[,1]),2]
  if(sum(is.na(edge[,1]))!=0){
    edge<-edge[-which(edge[,1]%in%NA),]
  }
  if(sum(is.na(edge[,2]))!=0){
    edge<-edge[-which(edge[,2]%in%NA),]
  }
  net <- igraph::graph_from_data_frame(d=edge,directed =FALSE)
  mut_status<- mut_status[rowSums(mut_status)>0,]
  subnet<-delete_vertices(net,setdiff(names(net[[]]),rownames(mut_status)))
  components <- igraph::clusters(subnet)
  biggest_cluster_id <- which.max(components$csize)
  vert_ids <- V(subnet)[components$membership == biggest_cluster_id]
  subnet<-igraph::induced_subgraph(subnet, vert_ids)
  #save(subnet,file="subnet.Rdata")
  local_network<-get_local_network(network=subnet,freq_matrix=mut_status,max.size=max.size.local)
  #save(local_network,file="local_network.Rdata")
  gene<-intersect(seed[,1],names(local_network))
  if(length(gene)!=0){
    candidate_module<-get_candidate_module(local_network,network=subnet,freq_matrix=mut_status,sur,seed=gene,max.size=max.size.candidate,rate=rate)
    #save(candidate_module,file="candidate_module.Rdata")
    turbulence<-get_Pvalue_S(freq_matrix=mut_status,sur,module=candidate_module,perms=perm,local_network)
    #save(turbulence,file="turbulence.Rdata")
  }
  final_module<-candidate_module[["module_set"]][which(p.adjust(turbulence,method = "fdr")<cutoff)]
  final_module<-final_module[which(sapply(final_module,function(x) length(x)>1)==TRUE)]
  return(final_module)
}

#' @title Get univarCox result.
#' @description The function `get_univarCox_result` is used to calculate the result of Cox univariate analysis.
#' @param freq_matrix The mutations matrix,generated by `get_mut_status`.
#' @param sur A nx2 data frame of samples' survival data,the first line is samples' survival event and the second line is samples' overall survival.
#' @return The result of Cox univariate analysis.
#' @importFrom survival coxph
#' @importFrom utils read.delim
#' @importFrom stats as.formula
#' @export
#' @examples
#' \donttest{#load the data
#' data(mut_status)
#' sur<-system.file("extdata","sur.csv",package ="ProgModule")
#' sur<-read.delim(sur,sep=",",header=TRUE,row.names=1)
#' #perform the function `get_univarCox_result`.
#' univarCoxresult.example<-get_univarCox_result(freq_matrix=mut_status,sur)}

get_univarCox_result<-function (freq_matrix,sur)
{
  DE_path_sur<-cbind(t(freq_matrix),sur)
  path_name <- cbind(colnames(DE_path_sur), paste0("a", 1:length(colnames(DE_path_sur))))
  colnames(DE_path_sur)[1:(dim(DE_path_sur)[2] - 2)] <- path_name[1:(dim(path_name)[1]-2),2]
  covariates <- colnames(DE_path_sur)[1:(length(DE_path_sur[1,]) - 2)]
  univ_formulas <- sapply(covariates, function(x) as.formula(paste("Surv(survival, event) ~",x)))
  univ_models <- lapply(univ_formulas, function(x) {
    coxph(x, data = DE_path_sur)
  })
  univ_results <- sapply(univ_models, function(x) {
    x <- summary(x)
    HR <- signif(x$coef[2], digits = 2)
    return(HR)
  })
  result<-univ_results
  names(result) <- path_name[match(names(result), path_name[,2]), 1]
  return(result)
}

Try the ProgModule package in your browser

Any scripts or data that you put into this service are public.

ProgModule documentation built on May 29, 2024, 9:16 a.m.