R/querries.R

Defines functions profiles2Genebags selectListSize aggregateProfiles

Documented in aggregateProfiles profiles2Genebags selectListSize

#' \code{projectOntoMGS} 
#' @title projectOntoMGS
#' @export
#' @description This function takes a list of genes and projects it to a mgs catalogue
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param genebag : vector of gene_ids
#' @param list.mgs : this is the structured MGS information formatted as a list of geneids each corresponding to an MGS
#' @param res.filt.mode : the filtering method, either 'perc' or 'size', default will be size.
#'        perc : genes projected onto mgs are only kept if they exceed a threshold percentage of the mgs size 
#'        size : genes projected onto mgs are only kept if they exceed a threshold number of genes
#' @param res.filt.threshold : the threshold used to retain an mgs according to the filtering mode, either 'perc' or 'size'
#'        if 'perc' : the minimal percentage of genes projected onto mgs (number of genes projected / mgs size) needed to select a mgs 
#'        if 'size' : the minimal number of genes projected onto mgs
#' @param not_projected : whether a last genebag containing not projected genes is to be added to the list of selected mgs
#' @return a list of selected mgs with the geneids according to the projected genes
projectOntoMGS <- function (genebag, list.mgs, res.filt.mode = "size", res.filt.threshold = 50, not_projected = TRUE) {
  res <- list()
  if (res.filt.mode != "perc" & res.filt.mode != "size") {
    stop("Choose 'perc' or 'size' as a filtering mode of the result")
  }
  else {
    for (m in 1:length(list.mgs)) {
      tmp <- genebag[(genebag %in% list.mgs[[m]])]
      if (res.filt.mode == "size") {
        if (length(tmp) >= res.filt.threshold) {
          res[[names(list.mgs)[m]]] <- tmp
        }
      }
      else {
        if (length(tmp) >= (length(list.mgs[[m]]) * res.filt.threshold/100)) {
          res[[names(list.mgs)[m]]] <- tmp
        }
      }
    }
    if (not_projected == TRUE) {
      if(length(unlist(res))==0){
        res[["not_projected"]] <- genebag
      } else {
        res[["not_projected"]] <- genebag[-which(genebag %in% unlist(res))]       
      }
    }
    return(res)
  }
}

#' \code{extractProfiles} 
#' @title extractProfiles
#' @export
#' @description This function extracts the profiles from a gene profile matrix of a group of genes or a list 
#' of groups of genes. It can also restrict the size of the result
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param genebag : vector or list of gene_ids
#' @param data : raw count or frequency matrix with genes_ids as rawnames
#' @param size.max : default 15000, maximum number of profiles to be extracted
#' @param size.min : default 1, the minimal size threshold above which a group of genes is selected.
#' This is used to extract multiple profiles and filtering the list with a minimal number of genes
#' @param silent : default TRUE, detailling and following computation progress
#' @return a matrix or a list of profile matrixes
extractProfiles <- function (genebag, data, size.max = 15000, size.min = 1, silent = TRUE) 
{
  if (!is.list(genebag)) 
  {
    print("Profile extraction from a single gene vector")
    if (length(genebag) < size.max) 
    {
      ind <- match(genebag, rownames(data))
      if (sum(is.na(ind))>0) 
      {
        warning(paste("There ",sum(is.na(ind)),"are genes from the genebag not found in the data !"))
        ind <- ind[!is.na(ind)]
      }
      res <- data[ind, ]
    }else 
    {
      ind <- match(genebag[1:size.max], rownames(data))
      if (sum(is.na(ind))>0) 
      {
        warning(paste("There ",sum(is.na(ind)),"are genes from the genebag not found in the data !"))
        ind <- ind[!is.na(ind)]
      }
      res <- data[ind, ]
    }
  }
  else 
  {
    print("Multiple profile extraction")
    res <- list()
    for (i in 1:length(genebag)) 
    {
      if (i%%10 == 0 & silent == FALSE) print(i)
      if (length(genebag[[i]]) < size.max) 
      {
        ind <- match(genebag[[i]], rownames(data))
        if (sum(is.na(ind))>0) 
        {
          warning(paste("There ",sum(is.na(ind)),"are genes from the genebag not found in the data !"))
          ind <- ind[!is.na(ind)]
        }
        res[[i]] <- data[ind, ]
      } else 
      {
        ind <- match(genebag[[i]][1:size.max], rownames(data))
        if (sum(is.na(ind))>0) 
        {
          warning(paste("There ",sum(is.na(ind)),"are genes from the genebag not found in the data !"))
          ind <- ind[!is.na(ind)]
        }
        res[[i]] <- data[ind, ]
      }
    }
    names(res) <- names(genebag)
    res <- res[as.numeric(summary(genebag)[, "Length"]) >= size.min]
  }
  return(res)
}


#' \code{aggregateProfiles} 
#' @title aggregateProfiles
#' @export
#' @description This function takes a list of profile matrixes and returns an aggregated big matrix.
#' The individual matrixes can be filtered in size so that the first X rows are used for each of them.
#' This function is used to prepare the data and plot different MGS as barcodes.
#' @author Emmanuelle Le Chatelier
#' @param list.profiles : list of matrix profiles
#' @param max.size : the maximum number of rows to be selected in the final agregated matrix.
#' default   max.size=25.
#' @param min.size : this is the minimum number of rows rows to be selected in the final aggregated matrix. 
#' If a group has less it will be discarded. By default min.size=max.size.
#' @return an aggregated profile matrix.
aggregateProfiles <- function(list.profiles, max.size = 25, min.size = max.size)
{
  if(!is.list(list.profiles))
  {
    stop("Error! The object is not a list")
  }
  # list of matrices to be returned
  res <- c()
  for(i in 1:length(list.profiles))
  {
    if(nrow(list.profiles[[i]]) >= min.size)
    {
      res <- rbind(res, list.profiles[[i]][1:min(max.size, nrow(list.profiles[[i]])),])
    }
  }
  return(res)
}

#' \code{computeFilteredVectors}
#' @export 
#' @title computeFilteredVectors
#' @description filters and computes verctors based on gene profiles from a single matrix or a 
#' list of matrix profiles
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param profile : list of (or unique) matrix profiles
#' @param type : vectorisation method, vectors are calculated from the mean, median or sum of a given list of genes;
#' default type="mean" otherwise it will be "median" or  "sum"
#' @param filt : filtering threshold in % , rate of positive value in an individual under which matrix is cleaned 
#' and sparse values put to 0 default filt= 0, no filtering
#' @param debug : default is FALSE, when TRUE information on advancement is printed
#' @return a filtered vector or a matrix of filtered vectors
computeFilteredVectors <- function (profile, type = "mean", filt = 0, debug = FALSE) {
  if (is.list(profile)) {
    res <- matrix(data = NA, ncol = ncol(profile[[1]]), nrow = length(profile))
    for (i in 1:length(profile)) {
      if(debug) if (i%%100 == 0) { print(i) }
      if (type == "mean") { # mean
        res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, mean)
      } else {
        if (type == "median") { # median
          res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, median)
        }else { #sum
          res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, sum)
        }
      }
    }
    rownames(res) <- names(profile)
    colnames(res) <- colnames(profile[[1]])
  }
  else {
    if (type == "mean") { # mean
      res <- apply(filterMat(as.matrix(profile), filt = filt), 2, mean)
    }else {
      if(type== "median"){ # median
        res <- apply(filterMat(as.matrix(profile), filt = filt), 2, median)
      }else { # sum
        res <- apply(filterMat(as.matrix(profile), filt = filt), 2, sum)
      }
    }
  }
  return(res)
}

#' \code{buildMgsFinal} 
#' @title buildMgsFinal
#' @export
#' @description This function will take a vector of genes (to be transformed into a list of genebags) 
#'      or a list of genebags and will extract the profiles. Next genes well be ordered by connectivity
#'      which is to be computed for each group and the 50 most connected are selected to consitute the 
#'      marker genes. These will be then used to compute the mean vectors. A final object containing all 
#'      this information along with taxonomical annotation will be returned
#' @author Edi Prifti
#' @param genebag : a vector of genes to be projected onto mgs or a list of genebags, default = NULL.
#' @param mgs.cat : MGS catalogue to be used
#' @param mgs.taxo : taxonomy table for the MGS catalogue
#' @param profiles :  the data profile matrix to extract the profiles
#' @param conn : if TRUE the connectivity of a group is to be computed and ordered, default = TRUE.
#' @param silent : print detailled information on progress, default = FALSE.
#' @param filt : filtering based on percentage of prevalence to avoid noise for no signal samples by computeFilteredVectors.
#' @param save : Save files after each step
#' @return a list containing the final elements such as the 50 most connected genes, the mean vectors etc
buildMgsFinal <- function (genebag = NULL, mgs.cat, mgs.taxo, profiles, conn = TRUE, silent = TRUE, filt = 10, save = FALSE) 
{
  if(!is.list(genebag)) 
  {
    if (!silent) 
    {
      print("           Genebag is not a list, projecting onto the MGS catalogue ...")
    }
    genebag.list <- projectOntoMGS(genebag = genebag, list.mgs = mgs.cat, res.filt.mode = "size", res.filt.threshold = 50)
  }else {
    genebag.list <- genebag
  }
  
  genebag.list.dat <- extractProfiles(genebag.list, profiles, silent = silent)
  
  if (!silent) 
  {
    print("           Profiles have been extracted for each MGS ...")
  }
  
  if (save) 
  {
    saveRDS(genebag.list.dat, file = "genebag.list.dat.rda", compress = FALSE)
  }
  
  if (!silent & save) {
    print("           Profiles file has been saved in the working directory ...")
  }
  
  genebag.list.ordered <- list()
  genebag.list.50 <- list()
  
  if (!silent & conn) 
  {
    print("           Profiles will be reordered for each MGS using connectivity ...")
  }
  
  for (i in 1:length(genebag.list)) 
  {
    if (i %% 100 == 0 & !silent) 
      print(i)
    
    dat <- genebag.list.dat[[i]]
    
    if (is.null(dim(dat))) {
      nr <- 1
    }
    else {
      nr <- nrow(dat)
    }
    
    if(nr==0)
    {
      next()
    }
    
    if (conn == TRUE) {
      dat <- filterMat(as.matrix(dat), filt = filt)
      con <- connectivity(prof = dat, soft = TRUE, method = "spearman")
      dat <- dat[order(con, decreasing = T), ]
    }
    
    genebag.list.ordered[[i]] <- dat
    
    size.max <- min(50, nr)
    if (nr == 1) 
    {
      genebag.list.50[[i]] <- genebag.list.ordered[[i]]
    }else 
    {
      genebag.list.50[[i]] <- genebag.list.ordered[[i]][1:size.max, ]
    }
  }
  
  
  if (!silent & conn) 
  {
    print("           Profiles re-ordering, filtering has been finished ...")
  }
  if (!silent & !conn) 
  {
    print("           Profiles filtering has been finished ...")
  }
  names(genebag.list.ordered) <- names(genebag.list.dat)
  names(genebag.list.50) <- names(genebag.list.dat)
  if (save & conn) 
  {
    saveRDS(genebag.list.ordered, file = "genebag.list.ordered.rda", compress = FALSE)
  }
  if (save) 
  {
    saveRDS(genebag.list.50, file = "genebag.list.50.rda", compress = FALSE)
  }
  
  without.data <- unlist(lapply(lapply(genebag.list.50,nrow),is.null))
  
  res <- list()
  res$genebags <- genebag.list[!without.data]
  res$genebags.all <- mgs.cat[names(genebag.list)[!without.data]]
  res$mgs.profiles <- genebag.list.ordered[!without.data]
  res$mgs.50 <- genebag.list.50[!without.data]
  res$mean_vectors <- computeFilteredVectors(profile = res$mgs.50, type = "mean", filt = filt)
  if (!silent) 
  {
    print("           The filtered tracer vectors have been created successfully ...")
  }
  res$taxonomy <- as.data.frame(mgs.taxo[names(res$genebags),])
  name <- as.character(res$taxonomy$species)
  name <- paste(names(res$genebags), name)
  name <- gsub(" NA", "", name)
  res$taxonomy$name <- name
  if (!silent) 
  {
    print("           All operations are finished ...")
  }
  return(res)
}


#' \code{connectivity} 
#' @title connectivity
#' @description This function computes the intra-row correlation and applied a threshold to compute the connections of each row.
#' A connectivity vector is returned.
#' @author Emmanuelle le Chatelier & Edi Prifti
#' @param prof : a profile matrix. This data is used to compute correlations.
#' @param method : the correlation method, default = pearson.
#' @param th : default 0, if >0 than this threshold will be applied to compute a hard threholded connectivity.
#' @param soft : default = FALSE, if TRUE and when th > 0, the connectivity is computed as the soft threholded, sum of correlations 
#' above the threshold
#' @return a vector of connectivity
#' @note this connectivity does not use a hard thresholding but is based on a total correlation score
connectivity <- function (prof, method = "pearson", th = 0, soft = FALSE)  
{
  if (th < 0 | th >= 1) {
    stop("The threhold should be positive and smaller than 1.")
  }
  else {
    data.cor <- Hmisc::rcorr(t(prof), type = method)$r
    if (soft) {
      data.cor[data.cor <= th] <- 0                 #
      con <- colSums(data.cor, na.rm = TRUE)        #
    }
    else {
      con <- colSums(data.cor > th, na.rm = TRUE)   #
    }
  }
  return(con)
}


#' \code{selectListSize} 
#' @title selectListSize
#' @export
#' @description This function will sextract a part of a list based on the length of its components. Typically
#' a genebag list can be used. This function will work for uniclass lists of vectors and data frames.
#' @author Edi Prifti
#' @param l : a list of vectors or matrixes.
#' @param min.size : default = 0 and this has no action. This is the lower threshold for the element's size.
#' @param max.size : default = 0 and this has no action. This is the upper threshold for the element's size.
#' @param names : default = FALSE the function will return a subset of the list, if TRUE the function will 
#' return only the names of the elements of the list as identifiers.
#' @return the trimmed subset of the list.
selectListSize <- function(l, min.size=0, max.size=0, names=FALSE){
  if(min.size==0 & max.size==0){stop("nothing to be done")}
  if(min.size > max.size & max.size!=0){stop("minimum value should be lower than maximum value")}
  if(min.size!=0 | max.size!=0){
    type <- table(unlist(lapply(l, class)))
    if (length(type)==1){
      if(names(type)=="matrix" | names(type)=="data.frame"){
        l.size <- as.numeric(lapply(l, nrow))
      }else {l.size <- as.numeric(lapply(l, length))}
    }else {stop("multi class list case. not handled")}
    
    if(min.size!=0 & max.size!=0) {
      l.ind <- which(l.size >= min.size & l.size <= max.size)
    }else if(max.size!=0){
      l.ind <- which(l.size <= max.size)
    }else if(min.size!=0){
      l.ind <- which(l.size >= min.size)
    }
    if(names){
      return(names(l[l.ind]))
    }else{
      return(l[l.ind])
    }
  }
}


#' \code{profiles2Genebags} 
#' @title profiles2Genebags
#' @description This function will extract from a list of group profiles a list of identifiers of the matrix.
#' @author Edi Prifti
#' @param profiles : a list of dataframes
#' @return a list of genebags
profiles2Genebags <- function(profiles){
    if(!is.list(profiles)){
        stop("Error ! A list of profiles is needed.")
    }else{
      if(dim(profiles[[1]])<2){
        stop("Error ! the elements of the list should be data frames or matrixes.")
      }else{
        return(lapply(profiles, rownames))
      }
    }
}

#' End of section and file
eprifti/momr documentation built on Sept. 27, 2022, 3:36 a.m.