R/querries.R

Defines functions profiles2Genebags selectListSize connectivity buildMgsFinal computeFilteredVectors aggregateProfiles extractProfiles projectOntoMGS

Documented in aggregateProfiles buildMgsFinal computeFilteredVectors connectivity extractProfiles profiles2Genebags projectOntoMGS selectListSize

#' \code{projectOntoMGS} 
#' @title projectOntoMGS
#' @description This function takes a list of genes and projects it to a mgs catalogue
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param genebag : vector of gene_ids
#' @param list.mgs : this is the structured MGS information formatted as a list of geneids each corresponding to an MGS
#' @param res.filt.mode : the filtering method, either 'perc' or 'size', default will be size.
#'        perc : genes projected onto mgs are only kept if they exceed a threshold percentage of the mgs size 
#'        size : genes projected onto mgs are only kept if they exceed a threshold number of genes
#' @param res.filt.threshold : the threshold used to retain an mgs according to the filtering mode, either 'perc' or 'size'
#'        if 'perc' : the minimal percentage of genes projected onto mgs (number of genes projected / mgs size) needed to select a mgs 
#'        if 'size' : the minimal number of genes projected onto mgs
#' @param not_projected : whether a last genebag containing not projected genes is to be added to the list of selected mgs
#' @return a list of selected mgs with the geneids according to the projected genes
projectOntoMGS <- function (genebag, list.mgs, res.filt.mode = "size", res.filt.threshold = 50, not_projected = TRUE) {
  res <- list()
  if (res.filt.mode != "perc" & res.filt.mode != "size") {
    stop("Choose 'perc' or 'size' as a filtering mode of the result")
  }
  else {
    for (m in 1:length(list.mgs)) {
      tmp <- genebag[(genebag %in% list.mgs[[m]])]
      if (res.filt.mode == "size") {
        if (length(tmp) >= res.filt.threshold) {
          res[[names(list.mgs)[m]]] <- tmp
        }
      }
      else {
        if (length(tmp) >= (length(list.mgs[[m]]) * res.filt.threshold/100)) {
          res[[names(list.mgs)[m]]] <- tmp
        }
      }
    }
    if (not_projected == TRUE) {
      if(length(unlist(res))==0){
        res[["not_projected"]] <- genebag
      } else {
        res[["not_projected"]] <- genebag[-which(genebag %in% unlist(res))]       
      }
    }
    return(res)
  }
}

#' \code{extractProfiles} 
#' @title extractProfiles
#' @description This function extracts the profiles from a gene profile matrix of a group of genes or a list 
#' of groups of genes. It can also restrict the size of the result
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param genebag : vector or list of gene_ids
#' @param data : raw count or frequency matrix with genes_ids as rawnames
#' @param size.max : default 15000, maximum number of profiles to be extracted
#' @param size.min : default 1, the minimal size threshold above which a group of genes is selected.
#' This is used to extract multiple profiles and filtering the list with a minimal number of genes
#' @param silent : default TRUE, detailling and following computation progress
#' @return a matrix or a list of profile matrixes
extractProfiles <- function(genebag, data, size.max = 15000, size.min = 1, silent=TRUE){
  if(!is.list(genebag)){
    print("Profile extraction from a single gene vector")
    if(length(genebag) < size.max){
      res <- data[match(genebag, rownames(data)),]
    }else{
      res <- data[match(genebag[1:size.max], rownames(data)),]
    }
  }else{
    print("Multiple profile extraction")
    # list of matrixes to be returned
    res <- list()
    for(i in 1:length(genebag)){
      if(i%%10==0 & silent==FALSE){print(i)}
      if(length(genebag[[i]]) < size.max){
        res[[i]] <- data[match(genebag[[i]], rownames(data)),]
      }else{
        res[[i]] <- data[match(genebag[[i]][1:size.max], rownames(data)),]
      }
    }
    names(res) <- names(genebag)
    res <- res[as.numeric(summary(genebag)[,"Length"])>=size.min]
  }
  return(res)
}

#' \code{aggregateProfiles} 
#' @title aggregateProfiles
#' @description This function takes a list of profile matrixes and returns an aggregated big matrix.
#' The individual matrixes can be filtered in size so that the first X rows are used for each of them.
#' This function is used to prepare the data and plot different MGS as barcodes.
#' @author Emmanuelle Le Chatelier
#' @param list.profiles : list of matrix profiles
#' @param max.size : the maximum number of rows to be selected in the final agregated matrix.
#' default   max.size=25.
#' @param min.size : this is the minimum number of rows rows to be selected in the final aggregated matrix. 
#' If a group has less it will be discarded. By default min.size=max.size.
#' @return an aggregated profile matrix.
aggregateProfiles <- function(list.profiles, max.size = 25, min.size = max.size){
  if(!is.list(list.profiles)){
    stop("Error! The object is not a list")
  }
  # list of matrices to be returned
  res <- c()
  for(i in 1:length(list.profiles)){
    if(nrow(list.profiles[[i]]) >= min.size){
      res <- rbind(res, list.profiles[[i]][1:min(max.size, nrow(list.profiles[[i]])),])
    }
  }
  return(res)
}

#' \code{computeFilteredVectors} 
#' @title computeFilteredVectors
#' @description filters and computes verctors based on gene profiles from a single matrix or a 
#' list of matrix profiles
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param profile : list of (or unique) matrix profiles
#' @param type : vectorisation method, vectors are calculated from the mean, median or sum of a given list of genes;
#' default type="mean" otherwise it will be "median" or  "sum"
#' @param filt : filtering threshold in % , rate of positive value in an individual under which matrix is cleaned 
#' and sparse values put to 0 default filt= 0, no filtering
#' @param debug : default is FALSE, when TRUE information on advancement is printed
#' @return a filtered vector or a matrix of filtered vectors
computeFilteredVectors <- function (profile, type = "mean", filt = 0, debug = FALSE) {
  if (is.list(profile)) {
    res <- matrix(data = NA, ncol = ncol(profile[[1]]), nrow = length(profile))
    for (i in 1:length(profile)) {
      if(debug) if (i%%100 == 0) { print(i) }
      if (type == "mean") { # mean
        res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, mean)
      } else {
        if (type == "median") { # median
          res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, median)
        }else { #sum
          res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, sum)
        }
      }
    }
    rownames(res) <- names(profile)
    colnames(res) <- colnames(profile[[1]])
  }
  else {
    if (type == "mean") { # mean
      res <- apply(filterMat(as.matrix(profile), filt = filt), 2, mean)
    }else {
      if(type== "median"){ # median
        res <- apply(filterMat(as.matrix(profile), filt = filt), 2, median)
      }else { # sum
        res <- apply(filterMat(as.matrix(profile), filt = filt), 2, sum)
      }
    }
  }
  return(res)
}

#' \code{buildMgsFinal} 
#' @title buildMgsFinal
#' @@date December 20th 2013
#' @description This function will take a vector of genes (to be transformed into a list of genebags) 
#'      or a list of genebags and will extract the profiles. Next genes well be ordered by connectivity
#'      which is to be computed for each group and the 50 most connected are selected to consitute the 
#'      marker genes. These will be then used to compute the mean vectors. A final object containing all 
#'      this information along with taxonomical annotation will be returned
#' @author Edi Prifti
#' @param genebag : a vector of genes to be projected onto mgs or a list of genebags, default = NULL.
#' @param mgs.cat : MGS catalogue to be used
#' @param mgs.taxo : taxonomy table for the MGS catalogue
#' @param profiles :  the data profile matrix to extract the profiles
#' @param conn : if TRUE the connectivity of a group is to be computed and ordered, default = TRUE.
#' @param silent : print detailled information on progress, default = FALSE.
#' @param filt : filtering based on percentage of prevalence to avoid noise for no signal samples by 
#' computeFilteredVectors.
#' @return a list containing the final elements such as the 50 most connected genes, the mean vectors etc
buildMgsFinal <- function (genebag = NULL, mgs.cat, mgs.taxo, profiles, conn = TRUE, silent = TRUE, filt=20) {
  # if a list of genes than we need to project it onto the mgs.cat
  if (!is.list(genebag)) {
    genebag.list <- projectOntoMGS(genebag = genebag, list.mgs = mgs.cat, res.filt.mode = "size", res.filt.threshold = 50)
    #NOTE: another possibility is to trim the mgs.cat using selectListSize for an upper and lower trim and then projectOntoMGS
  } else { genebag.list <- genebag }
  # extract the profiles for the list of mgs
  genebag.list.dat <- extractProfiles(genebag.list, profiles, silent=silent)
  genebag.list.ordered <- list()
  genebag.list.filtered <- list()
  genebag.list.50 <- list()
  for (i in 1:length(genebag.list)) {
    if (i%%10 == 0 & !silent) {print(i)} # inform on where we are
    dat <- genebag.list.dat[[i]]
    if (conn == TRUE) { # order by connectivity
      con <- connectivity(dat) # connectivity as sum of correlations using default settings.
      dat <- dat[order(con, decreasing = T), ]
    }
    genebag.list.ordered[[i]] <- dat
    genebag.list.filtered[[i]] <- filterMat(as.matrix(dat), filt=filt)
    genebag.list.50[[i]] <- genebag.list.filtered[[i]][1:50, ]
  }
  names(genebag.list.ordered) <- names(genebag.list.dat)
  names(genebag.list.filtered) <- names(genebag.list.dat)
  names(genebag.list.50) <- names(genebag.list.dat)
  # build the resulting object
  res <- list()
  res$genebags <- genebag.list
  res$genebags.all <- mgs.cat[names(genebag.list)]
  res$mgs.profiles <- genebag.list.ordered
  res$mgs.50 <- genebag.list.50
  res$mean_vectors <- computeFilteredVectors(profile = res$mgs.50, type="mean",filt=0) # the filtering was done on the whole MGS
  res$taxonomy <- as.data.frame(mgs.taxo[names(res$genebags), ])
  name <- as.character(res$taxonomy$species)
  name <- paste(names(res$genebags), name)
  name <- gsub(" NA", "", name)
  res$taxonomy$name <- name
  return(res)
}


#' \code{connectivity} 
#' @title connectivity
#' @description This function computes the intra-row correlation and applied a threshold to compute the connections of each row.
#' A connectivity vector is returned.
#' @author Emmanuelle le Chatelier & Edi Prifti
#' @param prof : a profile matrix. This data is used to compute correlations.
#' @param method : the correlation method, default = pearson.
#' @param th : default 0, if >0 than this threshold will be applied to compute a hard threholded connectivity.
#' @param soft : default = FALSE, if TRUE and when th > 0, the connectivity is computed as the soft threholded, sum of correlations 
#' above the threshold
#' @return a vector of connectivity
#' @note this connectivity does not use a hard thresholding but is based on a total correlation score
connectivity <- function (prof, method = "pearson", th = 0, soft=FALSE) {
  if(th < 0 | th >= 1 ) {
    stop("The threhold should be positive and smaller than 1.")
  }else{
    # compute the correlation matrix
    data.cor <- Hmisc::rcorr(t(prof), type = method)$r
    if(soft){ # if real correlations
      if(th==0){
        con <- colSums(data.cor , na.rm = TRUE)
      }else{ # don't take into account values lower than the threshold
        tmp <- data.cor; tmp[tmp<=th] <- 0
        con <- colSums(tmp, na.rm=TRUE)
      }
    }else{ # hard threholding
      con <- colSums((data.cor > th) + 0, na.rm=TRUE)
    }
  }
  return(con)
}


#' \code{selectListSize} 
#' @title selectListSize
#' @description This function will sextract a part of a list based on the length of its components. Typically
#' a genebag list can be used. This function will work for uniclass lists of vectors and data frames.
#' @author Edi Prifti
#' @param l : a list of vectors or matrixes.
#' @param min.size : default = 0 and this has no action. This is the lower threshold for the element's size.
#' @param max.size : default = 0 and this has no action. This is the upper threshold for the element's size.
#' @param names : default = FALSE the function will return a subset of the list, if TRUE the function will 
#' return only the names of the elements of the list as identifiers.
#' @return the trimmed subset of the list.
selectListSize <- function(l, min.size=0, max.size=0, names=FALSE){
  if(min.size==0 & max.size==0){stop("nothing to be done")}
  if(min.size > max.size & max.size!=0){stop("minimum value should be lower than maximum value")}
  if(min.size!=0 | max.size!=0){
    type <- table(unlist(lapply(l, class)))
    if (length(type)==1){
      if(names(type)=="matrix" | names(type)=="data.frame"){
        l.size <- as.numeric(lapply(l, nrow))
      }else {l.size <- as.numeric(lapply(l, length))}
    }else {stop("multi class list case. not handled")}
    
    if(min.size!=0 & max.size!=0) {
      l.ind <- which(l.size >= min.size & l.size <= max.size)
    }else if(max.size!=0){
      l.ind <- which(l.size <= max.size)
    }else if(min.size!=0){
      l.ind <- which(l.size >= min.size)
    }
    if(names){
      return(names(l[l.ind]))
    }else{
      return(l[l.ind])
    }
  }
}


#' \code{profiles2Genebags} 
#' @title profiles2Genebags
#' @description This function will extract from a list of group profiles a list of identifiers of the matrix.
#' @author Edi Prifti
#' @param profiles : a list of dataframes
#' @return a list of genebags
profiles2Genebags <- function(profiles){
    if(!is.list(profiles)){
        stop("Error ! A list of profiles is needed.")
    }else{
      if(dim(profiles[[1]])<2){
        stop("Error ! the elements of the list should be data frames or matrixes.")
      }else{
        return(lapply(profiles, rownames))
      }
    }
}

#' End of section and file

Try the momr package in your browser

Any scripts or data that you put into this service are public.

momr documentation built on May 29, 2017, 5:47 p.m.