#' \code{projectOntoMGS}
#' @title projectOntoMGS
#' @export
#' @description This function takes a list of genes and projects it to a mgs catalogue
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param genebag : vector of gene_ids
#' @param list.mgs : this is the structured MGS information formatted as a list of geneids each corresponding to an MGS
#' @param res.filt.mode : the filtering method, either 'perc' or 'size', default will be size.
#' perc : genes projected onto mgs are only kept if they exceed a threshold percentage of the mgs size
#' size : genes projected onto mgs are only kept if they exceed a threshold number of genes
#' @param res.filt.threshold : the threshold used to retain an mgs according to the filtering mode, either 'perc' or 'size'
#' if 'perc' : the minimal percentage of genes projected onto mgs (number of genes projected / mgs size) needed to select a mgs
#' if 'size' : the minimal number of genes projected onto mgs
#' @param not_projected : whether a last genebag containing not projected genes is to be added to the list of selected mgs
#' @return a list of selected mgs with the geneids according to the projected genes
projectOntoMGS <- function (genebag,
list.mgs,
res.filt.mode = "size",
res.filt.threshold = 50,
not_projected = TRUE) {
res <- list()
if (res.filt.mode != "perc" & res.filt.mode != "size") {
stop("Choose 'perc' or 'size' as a filtering mode of the result")
}
else {
for (m in 1:length(list.mgs)) {
tmp <- genebag[(genebag %in% list.mgs[[m]])]
if (res.filt.mode == "size") {
if (length(tmp) >= res.filt.threshold) {
res[[names(list.mgs)[m]]] <- tmp
}
}
else {
if (length(tmp) >= (length(list.mgs[[m]]) * res.filt.threshold / 100)) {
res[[names(list.mgs)[m]]] <- tmp
}
}
}
if (not_projected == TRUE) {
if (length(unlist(res)) == 0) {
res[["not_projected"]] <- genebag
} else {
res[["not_projected"]] <- genebag[-which(genebag %in% unlist(res))]
}
}
return(res)
}
}
#' \code{extractProfiles}
#' @title extractProfiles
#' @export
#' @description This function extracts the profiles from a gene profile matrix of a group of genes or a list
#' of groups of genes. It can also restrict the size of the result
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param genebag : vector or list of gene_ids
#' @param data : raw count or frequency matrix with genes_ids as rawnames
#' @param size.max : default 15000, maximum number of profiles to be extracted
#' @param size.min : default 1, the minimal size threshold above which a group of genes is selected.
#' This is used to extract multiple profiles and filtering the list with a minimal number of genes
#' @param silent : default TRUE, detailling and following computation progress
#' @return a matrix or a list of profile matrixes
extractProfiles <- function (genebag,
data,
size.max = 15000,
size.min = 1,
silent = TRUE)
{
if (!is.list(genebag))
{
print("Profile extraction from a single gene vector")
if (length(genebag) < size.max)
{
ind <- match(genebag, rownames(data))
if (sum(is.na(ind)) > 0)
{
warning(paste(
"There ",
sum(is.na(ind)),
"are genes from the genebag not found in the data !"
))
ind <- ind[!is.na(ind)]
}
res <- data[ind, ]
} else
{
ind <- match(genebag[1:size.max], rownames(data))
if (sum(is.na(ind)) > 0)
{
warning(paste(
"There ",
sum(is.na(ind)),
"are genes from the genebag not found in the data !"
))
ind <- ind[!is.na(ind)]
}
res <- data[ind, ]
}
}
else
{
print("Multiple profile extraction")
res <- list()
for (i in 1:length(genebag))
{
if (i %% 10 == 0 & silent == FALSE)
print(i)
if (length(genebag[[i]]) < size.max)
{
ind <- match(genebag[[i]], rownames(data))
if (sum(is.na(ind)) > 0)
{
warning(paste(
"There ",
sum(is.na(ind)),
"are genes from the genebag not found in the data !"
))
ind <- ind[!is.na(ind)]
}
res[[i]] <- data[ind, ]
} else
{
ind <- match(genebag[[i]][1:size.max], rownames(data))
if (sum(is.na(ind)) > 0)
{
warning(paste(
"There ",
sum(is.na(ind)),
"are genes from the genebag not found in the data !"
))
ind <- ind[!is.na(ind)]
}
res[[i]] <- data[ind, ]
}
}
names(res) <- names(genebag)
res <- res[as.numeric(summary(genebag)[, "Length"]) >= size.min]
}
return(res)
}
#' \code{aggregateProfiles}
#' @title aggregateProfiles
#' @export
#' @description This function takes a list of profile matrixes and returns an aggregated big matrix.
#' The individual matrixes can be filtered in size so that the first X rows are used for each of them.
#' This function is used to prepare the data and plot different MGS as barcodes.
#' @author Emmanuelle Le Chatelier
#' @param list.profiles : list of matrix profiles
#' @param max.size : the maximum number of rows to be selected in the final agregated matrix.
#' default max.size=25.
#' @param min.size : this is the minimum number of rows rows to be selected in the final aggregated matrix.
#' If a group has less it will be discarded. By default min.size=max.size.
#' @return an aggregated profile matrix.
aggregateProfiles <- function(list.profiles,
max.size = 25,
min.size = max.size)
{
if (!is.list(list.profiles))
{
stop("Error! The object is not a list")
}
# list of matrices to be returned
res <- c()
for (i in 1:length(list.profiles))
{
if (nrow(list.profiles[[i]]) >= min.size)
{
res <- rbind(res, list.profiles[[i]][1:min(max.size, nrow(list.profiles[[i]])), ])
}
}
return(res)
}
#' \code{computeFilteredVectors}
#' @export
#' @title computeFilteredVectors
#' @description filters and computes verctors based on gene profiles from a single matrix or a
#' list of matrix profiles
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param profile : list of (or unique) matrix profiles
#' @param type : vectorisation method, vectors are calculated from the mean, median or sum of a given list of genes;
#' default type="mean" otherwise it will be "median" or "sum"
#' @param filt : filtering threshold in % , rate of positive value in an individual under which matrix is cleaned
#' and sparse values put to 0 default filt= 0, no filtering
#' @param debug : default is FALSE, when TRUE information on advancement is printed
#' @return a filtered vector or a matrix of filtered vectors
computeFilteredVectors <- function (profile,
type = "mean",
filt = 0,
debug = FALSE) {
if (is.list(profile)) {
res <- matrix(
data = NA,
ncol = ncol(profile[[1]]),
nrow = length(profile)
)
for (i in 1:length(profile)) {
if (debug)
if (i %% 100 == 0) {
print(i)
}
if (type == "mean") {
# mean
res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, mean)
} else {
if (type == "median") {
# median
res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, median)
} else {
#sum
res[i, ] <- apply(filterMat(as.matrix(profile[[i]]), filt = filt), 2, sum)
}
}
}
rownames(res) <- names(profile)
colnames(res) <- colnames(profile[[1]])
}
else {
if (type == "mean") {
# mean
res <- apply(filterMat(as.matrix(profile), filt = filt), 2, mean)
} else {
if (type == "median") {
# median
res <- apply(filterMat(as.matrix(profile), filt = filt), 2, median)
} else {
# sum
res <- apply(filterMat(as.matrix(profile), filt = filt), 2, sum)
}
}
}
return(res)
}
#' \code{buildMgsFinal}
#' @title buildMgsFinal
#' @export
#' @description This function will take a vector of genes (to be transformed into a list of genebags)
#' or a list of genebags and will extract the profiles. Next genes well be ordered by connectivity
#' which is to be computed for each group and the 50 most connected are selected to consitute the
#' marker genes. These will be then used to compute the mean vectors. A final object containing all
#' this information along with taxonomical annotation will be returned
#' @author Edi Prifti
#' @param genebag : a vector of genes to be projected onto mgs or a list of genebags, default = NULL.
#' @param mgs.cat : MGS catalogue to be used
#' @param mgs.taxo : taxonomy table for the MGS catalogue
#' @param profiles : the data profile matrix to extract the profiles
#' @param conn : if TRUE the connectivity of a group is to be computed and ordered, default = TRUE.
#' @param silent : print detailled information on progress, default = FALSE.
#' @param filt : filtering based on percentage of prevalence to avoid noise for no signal samples by computeFilteredVectors.
#' @param save : Save files after each step
#' @return a list containing the final elements such as the 50 most connected genes, the mean vectors etc
buildMgsFinal <- function (genebag = NULL,
mgs.cat,
mgs.taxo,
profiles,
conn = TRUE,
silent = TRUE,
filt = 10,
save = FALSE)
{
if (!is.list(genebag))
{
if (!silent)
{
print(" Genebag is not a list, projecting onto the MGS catalogue ...")
}
genebag.list <- projectOntoMGS(
genebag = genebag,
list.mgs = mgs.cat,
res.filt.mode = "size",
res.filt.threshold = 50
)
} else {
genebag.list <- genebag
}
genebag.list.dat <- extractProfiles(genebag.list, profiles, silent = silent)
if (!silent)
{
print(" Profiles have been extracted for each MGS ...")
}
if (save)
{
saveRDS(genebag.list.dat, file = "genebag.list.dat.rda", compress = FALSE)
}
if (!silent & save) {
print(" Profiles file has been saved in the working directory ...")
}
genebag.list.ordered <- list()
genebag.list.50 <- list()
if (!silent & conn)
{
print(" Profiles will be reordered for each MGS using connectivity ...")
}
for (i in 1:length(genebag.list))
{
if (i %% 100 == 0 & !silent)
print(i)
dat <- genebag.list.dat[[i]]
if (is.null(dim(dat))) {
nr <- 1
}
else {
nr <- nrow(dat)
}
if (nr == 0)
{
next()
}
if (conn == TRUE) {
dat <- filterMat(as.matrix(dat), filt = filt)
con <- connectivity(prof = dat,
soft = TRUE,
method = "spearman")
dat <- dat[order(con, decreasing = T), ]
}
genebag.list.ordered[[i]] <- dat
size.max <- min(50, nr)
if (nr == 1)
{
genebag.list.50[[i]] <- genebag.list.ordered[[i]]
} else
{
genebag.list.50[[i]] <- genebag.list.ordered[[i]][1:size.max, ]
}
}
if (!silent & conn)
{
print(" Profiles re-ordering, filtering has been finished ...")
}
if (!silent & !conn)
{
print(" Profiles filtering has been finished ...")
}
names(genebag.list.ordered) <- names(genebag.list.dat)
names(genebag.list.50) <- names(genebag.list.dat)
if (save & conn)
{
saveRDS(genebag.list.ordered,
file = "genebag.list.ordered.rda",
compress = FALSE)
}
if (save)
{
saveRDS(genebag.list.50, file = "genebag.list.50.rda", compress = FALSE)
}
without.data <- unlist(lapply(lapply(genebag.list.50, nrow), is.null))
res <- list()
res$genebags <- genebag.list[!without.data]
res$genebags.all <- mgs.cat[names(genebag.list)[!without.data]]
res$mgs.profiles <- genebag.list.ordered[!without.data]
res$mgs.50 <- genebag.list.50[!without.data]
res$mean_vectors <- computeFilteredVectors(profile = res$mgs.50,
type = "mean",
filt = filt)
if (!silent)
{
print(" The filtered tracer vectors have been created successfully ...")
}
res$taxonomy <- as.data.frame(mgs.taxo[names(res$genebags), ])
name <- as.character(res$taxonomy$species)
name <- paste(names(res$genebags), name)
name <- gsub(" NA", "", name)
res$taxonomy$name <- name
if (!silent)
{
print(" All operations are finished ...")
}
return(res)
}
#' \code{connectivity}
#' @title connectivity
#' @description This function computes the intra-row correlation and applied a threshold to compute the connections of each row.
#' A connectivity vector is returned.
#' @author Emmanuelle le Chatelier & Edi Prifti
#' @param prof : a profile matrix. This data is used to compute correlations.
#' @param method : the correlation method, default = pearson.
#' @param th : default 0, if >0 than this threshold will be applied to compute a hard threholded connectivity.
#' @param soft : default = FALSE, if TRUE and when th > 0, the connectivity is computed as the soft threholded, sum of correlations
#' above the threshold
#' @return a vector of connectivity
#' @note this connectivity does not use a hard thresholding but is based on a total correlation score
connectivity <- function (prof,
method = "pearson",
th = 0,
soft = FALSE)
{
if (th < 0 | th >= 1) {
stop("The threhold should be positive and smaller than 1.")
}
else {
data.cor <- Hmisc::rcorr(t(prof), type = method)$r
if (soft) {
data.cor[data.cor <= th] <- 0 #
con <- colSums(data.cor, na.rm = TRUE) #
}
else {
con <- colSums(data.cor > th, na.rm = TRUE) #
}
}
return(con)
}
#' \code{selectListSize}
#' @title selectListSize
#' @export
#' @description This function will sextract a part of a list based on the length of its components. Typically
#' a genebag list can be used. This function will work for uniclass lists of vectors and data frames.
#' @author Edi Prifti
#' @param l : a list of vectors or matrixes.
#' @param min.size : default = 0 and this has no action. This is the lower threshold for the element's size.
#' @param max.size : default = 0 and this has no action. This is the upper threshold for the element's size.
#' @param names : default = FALSE the function will return a subset of the list, if TRUE the function will
#' return only the names of the elements of the list as identifiers.
#' @return the trimmed subset of the list.
selectListSize <- function(l,
min.size = 0,
max.size = 0,
names = FALSE) {
if (min.size == 0 & max.size == 0) {
stop("nothing to be done")
}
if (min.size > max.size &
max.size != 0) {
stop("minimum value should be lower than maximum value")
}
if (min.size != 0 | max.size != 0) {
type <- table(unlist(lapply(l, class)))
if (length(type) == 1) {
if (names(type) == "matrix" | names(type) == "data.frame") {
l.size <- as.numeric(lapply(l, nrow))
} else {
l.size <- as.numeric(lapply(l, length))
}
} else {
stop("multi class list case. not handled")
}
if (min.size != 0 & max.size != 0) {
l.ind <- which(l.size >= min.size & l.size <= max.size)
} else if (max.size != 0) {
l.ind <- which(l.size <= max.size)
} else if (min.size != 0) {
l.ind <- which(l.size >= min.size)
}
if (names) {
return(names(l[l.ind]))
} else{
return(l[l.ind])
}
}
}
#' \code{profiles2Genebags}
#' @title profiles2Genebags
#' @description This function will extract from a list of group profiles a list of identifiers of the matrix.
#' @author Edi Prifti
#' @param profiles : a list of dataframes
#' @return a list of genebags
profiles2Genebags <- function(profiles) {
if (!is.list(profiles)) {
stop("Error ! A list of profiles is needed.")
} else{
if (dim(profiles[[1]]) < 2) {
stop("Error ! the elements of the list should be data frames or matrixes.")
} else{
return(lapply(profiles, rownames))
}
}
}
#' End of section and file
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.