R/datasets.R

Defines functions gene.clustering gene.correlation gene.histogram ground.truth dataset.split select.cells select.genes filter.dataset export.graph export.dataset export.geneset import.graph import.dataset import.geneset download.graph download.dataset download.geneset list.graphs list.datasets list.genesets

Documented in dataset.split download.dataset download.geneset download.graph export.dataset export.geneset export.graph filter.dataset gene.clustering gene.correlation gene.histogram ground.truth import.dataset import.geneset import.graph list.datasets list.genesets list.graphs select.cells select.genes

HOST = 'https://gnlearn-api.herokuapp.com'

#' List Genesets Available Via RESTful API
#'
#' This function allows you to list genesets available via RESTful API.
#' @param sp.scientific Species (Scientific Name) (regex) (optional).
#' @param sp.common Species (Common Name) (regex) (optional).
#' @param dataset Dataset Name (regex) (optional).
#' @keywords genes api
#' @export
#' @examples
#' genesets <- list.genesets()
#' genesets <- list.genesets(sp.common='human')
#' genesets <- list.genesets(sp.common='mouse')

list.genesets <- function(sp.scientific=NULL, sp.common=NULL, dataset=NULL, host=HOST) {
    uri <- file.path(host, 'genesets')
    df <- jsonlite::fromJSON(uri)
    if (!is.null(sp.scientific)) {
        regex <- sp.scientific
        df <- df[with(df, grepl(regex, sp.scientific, ignore.case=TRUE)), ]
    }
    if (!is.null(sp.common)) {
        regex <- sp.common
        df <- df[with(df, grepl(regex, sp.common, ignore.case=TRUE)), ]
    }
    if (!is.null(dataset)) {
        regex <- dataset
        df <- df[with(df, grepl(regex, dataset, ignore.case=TRUE)), ]
    }
    return(df)
}

#' List Datasets Available Via RESTful API
#'
#' This function allows you to list datasets available via RESTful API.
#' @param sp.scientific Species (Scientific Name) (regex) (optional).
#' @param sp.common Species (Common Name) (regex) (optional).
#' @param bio.layer Biological Information Layer (regex) (optional).
#' @param seq.protocol Sequencing Protocol (regex) (optional).
#' @param cell.identity Biological Cell Identity (regex) (optional).
#' @param min.genes Minimum Number of Genes (optional).
#' @param max.genes Maximum Number of Genes (optional).
#' @param min.cells Minimum Number of Cells (optional).
#' @param max.cells Maximum Number of Cells (optional).
#' @keywords datasets api
#' @export
#' @examples
#' datasets <- list.datasets()
#' datasets <- list.datasets(sp.common='human')
#' datasets <- list.datasets(sp.common='mouse')

list.datasets <- function(sp.scientific=NULL, sp.common=NULL, bio.layer=NULL, seq.protocol=NULL, cell.identity=NULL,
                          min.genes=0, max.genes=Inf, min.cells=0, max.cells=Inf, host=HOST) {
    uri <- file.path(host, 'datasets')
    df <- jsonlite::fromJSON(uri)
    if (!is.null(sp.scientific)) {
        regex <- sp.scientific
        df <- df[with(df, grepl(regex, sp.scientific, ignore.case=TRUE)), ]
    }
    if (!is.null(sp.common)) {
        regex <- sp.common
        df <- df[with(df, grepl(regex, sp.common, ignore.case=TRUE)), ]
    }
    if (!is.null(bio.layer)) {
        regex <- bio.layer
        df <- df[with(df, grepl(regex, bio.layer, ignore.case=TRUE)), ]
    }
    if (!is.null(seq.protocol)) {
        regex <- seq.protocol
        df <- df[with(df, grepl(regex, seq.protocol, ignore.case=TRUE)), ]
    }
    if (!is.null(cell.identity)) {
        regex <- cell.identity
        df <- df[with(df, grepl(regex, cell.identity, ignore.case=TRUE)), ]
    }
    df <- df[df$n.genes >= min.genes & df$n.genes <= max.genes, ]
    df <- df[df$n.cells >= min.cells & df$n.cells <= max.cells, ]
    return(df)
}

#' List Graphs Available Via RESTful API
#'
#' This function allows you to list graphs available via RESTful API.
#' @param sp.scientific Species (Scientific Name) (regex) (optional).
#' @param sp.common Species (Common Name) (regex) (optional).
#' @param dataset Dataset used as input to learn the graph (Dataset Download Code) (optional).
#' @param bio.layer Biological Information Layer (regex) (optional).
#' @param cell.identity Biological Cell Identity (regex) (optional).
#' @param algorithm Algorithm used to learn the graph (regex) (optional).
#' @param min.nodes Minimum Number of Nodes (optional).
#' @param max.nodes Maximum Number of Nodes (optional).
#' @param min.edges Minimum Number of Edges (optional).
#' @param max.edges Maximum Number of Edges (optional).
#' @keywords graphs api
#' @export
#' @examples
#' graphs <- list.graphs()
#' graphs <- list.graphs(sp.common='human')
#' graphs <- list.graphs(sp.common='mouse')

list.graphs <- function(sp.scientific=NULL, sp.common=NULL, dataset=NULL, bio.layer=NULL, cell.identity=NULL, algorithm=NULL,
                        min.nodes=0, max.nodes=Inf, min.edges=0, max.edges=Inf, host=HOST) {
    uri <- file.path(host, 'graphs')
    df <- jsonlite::fromJSON(uri)
    if (!is.null(sp.scientific)) {
        regex <- sp.scientific
        df <- df[with(df, grepl(regex, sp.scientific, ignore.case=TRUE)), ]
    }
    if (!is.null(sp.common)) {
        regex <- sp.common
        df <- df[with(df, grepl(regex, sp.common, ignore.case=TRUE)), ]
    }
    if (!is.null(dataset)) {
        df <- df[df$dataset == dataset, ]
    }
    if (!is.null(bio.layer)) {
        regex <- bio.layer
        df <- df[with(df, grepl(regex, bio.layer, ignore.case=TRUE)), ]
    }
    if (!is.null(cell.identity)) {
        regex <- cell.identity
        df <- df[with(df, grepl(regex, cell.identity, ignore.case=TRUE)), ]
    }
    if (!is.null(algorithm)) {
        regex <- algorithm
        df <- df[with(df, grepl(regex, algorithm, ignore.case=TRUE)), ]
    }
    df <- df[df$n.nodes >= min.nodes & df$n.nodes <= max.nodes, ]
    df <- df[df$n.edges >= min.edges & df$n.edges <= max.edges, ]
    return(df)
}

#' Download A Geneset Via RESTful API
#'
#' This function allows you to download a geneset available via RESTful API.
#' @param code Download Code (indicated by list.genesets() output).
#' @keywords genes api
#' @export
#' @examples
#' df <- download.geneset(1)

download.geneset <- function(code, host=HOST) {
    uri <- file.path(host, 'genesets', code)
    df <- jsonlite::fromJSON(uri)
    url <- df$url
    if (!is.null(url)) {
        tmp <- fs::file_temp(ext='.txt')
        utils::download.file(url, destfile=tmp, quiet=TRUE, mode='wt')
        df <- read.table(tmp, sep='\t', header=TRUE, check.names=FALSE)
        return(df)
    } else {
        return(NULL)
    }
}

#' Download A Dataset Via RESTful API
#'
#' This function allows you to download a dataset available via RESTful API.
#' @param code Download Code (indicated by list.datasets() output).
#' @param log Whether or not to apply log(x+1) (optional). Default: TRUE
#' @keywords datasets api
#' @export
#' @examples
#' df <- download.dataset (1)

download.dataset  <- function(code, log=TRUE, host=HOST) {
    uri <- file.path(host, 'datasets', code)
    df <- jsonlite::fromJSON(uri)
    url <- df$raw.dataset
    if (!is.null(url)) {
        tmp <- fs::file_temp(ext='.gz')
        utils::download.file(url, destfile=tmp, quiet=TRUE, mode='wb')
        tmp <- gzfile(tmp, 'rt')
        df <- read.table(tmp, sep='\t', header=TRUE, check.names=FALSE)
        df <- as.data.frame(lapply(df, as.double))
        df <- drop.all.zeros(df)
        if (log) {
            df <- log(df+1)
        }
        return(df)
    } else {
        return(NULL)
    }
}

#' Download A Graph Via RESTful API
#'
#' This function allows you to download a graph available via RESTful API.
#' @param code Download Code (indicated by list.graphs() output).
#' @keywords graphs api
#' @export
#' @examples
#' g <- download.graph(1)

download.graph <- function(code, to=c('igraph', 'adjacency', 'edges', 'graph', 'bnlearn'), host=HOST) {
    to <- match.arg(to)
    uri <- file.path(host, 'graphs', code)
    df <- jsonlite::fromJSON(uri)
    url <- df$url
    if (!is.null(url)) {
        tmp <- fs::file_temp(ext='.rds')
        utils::download.file(url, destfile=tmp, quiet=TRUE, mode='wb')
        g <- readRDS(tmp)
        g$average <- convert.format(g$average, from='adjacency', to=to)
        R <- length(g$replicates)
        for (i in 1:R) {
            g$replicates[[i]] <- convert.format(g$replicates[[i]], from='adjacency', to=to)
        }
        return(g)
    } else {
        return(NULL)
    }
}

#' Import A Local Geneset
#'
#' This function allows you to import a local geneset.
#' @param path Path to local geneset.
#' @param sep Separator / delimiter character (optional). Default: TAB
#' @param header Whether or not the file has a header (optional). Default: TRUE
#' @param index Whether or not the file has an index column (optional). Default: FALSE
#' @keywords genesets
#' @export
#' @examples
#' df <- import.geneset('mygeneset.tsv')

import.geneset <- function(path, sep='\t', header=TRUE, index=FALSE) {
    ext <- tools::file_ext(path)
    if (ext == 'gz') {
        file <- gzfile(path, 'rt')
    } else if (ext %in% c('bz', 'bz2')) {
        file <- bzfile(path, 'rt')
    } else if (ext == 'xz') {
        file <- xzfile(path, 'rt')
    } else {
        file <- file(path, 'rt')
    }
    if (index) {
        df <- read.table(file, sep=sep, header=header, row.names=1, check.names=FALSE)
    } else {
        df <- read.table(file, sep=sep, header=header, check.names=FALSE)
    }
    df <- as.data.frame(df)
    close(file)
    return(df)
}

#' Import A Local Dataset
#'
#' This function allows you to import a local dataset.
#' @param path Path to local dataset.
#' @param log Whether or not to apply log(x+1) (optional). Default: FALSE
#' @param sep Separator / delimiter character (optional). Default: TAB
#' @param header Whether or not the file has a header (optional). Default: TRUE
#' @param index Whether or not the file has an index column (optional). Default: FALSE
#' @param transpose Whether or not to transpose the data matrix (optional). IMPORTANT NOTE: gnlearn works with cell x gene matrices. Default: FALSE
#' @keywords datasets
#' @export
#' @examples
#' df <- import.dataset('mydataset.tsv')

import.dataset <- function(path, log=FALSE, sep='\t', header=TRUE, index=FALSE, transpose=FALSE) {
    ext <- tools::file_ext(path)
    if (ext == 'gz') {
        file <- gzfile(path, 'rt')
    } else if (ext %in% c('bz', 'bz2')) {
        file <- bzfile(path, 'rt')
    } else if (ext == 'xz') {
        file <- xzfile(path, 'rt')
    } else {
        file <- file(path, 'rt')
    }
    if (index) {
        df <- read.table(file, sep=sep, header=header, row.names=1, check.names=FALSE)
    } else {
        df <- read.table(file, sep=sep, header=header, check.names=FALSE)
    }
    df <- as.data.frame(lapply(df, as.double))
    df <- drop.all.zeros(df)
    if (transpose) {
        df <- as.data.frame(t(as.matrix(df)))
    }
    if (log) {
        df <- log(df+1)
    }
    close(file)
    return(df)
}

#' Import A Local Graph
#'
#' This function allows you to import a local graph.
#' @param path Path to local graph.
#' @param sep Separator / delimiter character (optional). Default: TAB
#' @param header Whether or not the file has a header (optional). Default: TRUE
#' @param index Whether or not the file has an index column (optional). Default: TRUE
#' @param from Input format (optional): 'adjacency' or 'edges'. Default: 'adjacency'
#' @param to Output format (optional): 'adjacency', 'edges', 'graph', 'igraph', or 'bnlearn'. Default: 'igraph'
#' @keywords graphs
#' @export
#' @examples
#' g <- import.graph('mygraph.tsv')

import.graph <- function(path, sep='\t', header=TRUE, index=TRUE, from=c('adjacency','edges'),
                         to=c('igraph', 'adjacency', 'edges', 'graph', 'bnlearn')) {
    from <- match.arg(from)
    to <- match.arg(to)
    ext <- tools::file_ext(path)
    if (ext == 'gz') {
        file <- gzfile(path, 'rt')
    } else if (ext %in% c('bz', 'bz2')) {
        file <- bzfile(path, 'rt')
    } else if (ext == 'xz') {
        file <- xzfile(path, 'rt')
    } else {
        file <- file(path, 'rt')
    }
    if (from=='edges') {
        index <- FALSE
    }
    if (index) {
        df <- read.table(file, sep=sep, header=header, row.names=1, check.names=FALSE)
    } else {
        df <- read.table(file, sep=sep, header=header, check.names=FALSE)
    }
    df <- as.data.frame(df)
    if (header & !index & from=='adjacency') {
        rownames(df) <- colnames(df)
    }
    if (!header & index & from=='adjacency') {
        colnames(df) <- rownames(df)
    }
    g <- convert.format(df, from=from, to=to)
    close(file)
    return(g)
}

#' Export A Geneset To A File
#'
#' This function allows you to export a geneset to a file.
#' @param df Geneset to export.
#' @param path Output file path.
#' @param sep Separator / delimiter character (optional). Default: TAB
#' @param header Whether or not to include a header (optional). Default: TRUE
#' @param index Whether or not to include an index column (optional). Default: FALSE
#' @keywords genesets
#' @export
#' @examples
#' export.geneset(df, 'mygeneset.tsv')

export.geneset <- function(df, path, sep='\t', header=TRUE, index=FALSE) {
    ext <- tools::file_ext(path)
    if (ext == 'gz') {
        file <- gzfile(path, 'wt')
    } else if (ext %in% c('bz', 'bz2')) {
        file <- bzfile(path, 'wt')
    } else if (ext == 'xz') {
        file <- xzfile(path, 'wt')
    } else {
        file <- file(path, 'wt')
    }
    write.table(df, file, append=FALSE, quote=FALSE, sep=sep, col.names=header, row.names=index)
    close(file)
}

#' Export A Dataset To A File
#'
#' This function allows you to export a dataset to a file.
#' @param df Dataset to export.
#' @param path Output file path.
#' @param sep Separator / delimiter character (optional). Default: TAB
#' @param header Whether or not to include a header (optional). Default: TRUE
#' @param index Whether or not to include an index column (optional). Default: FALSE
#' @param transpose Whether or not to transpose the data matrix (optional). IMPORTANT NOTE: gnlearn works with cell x gene matrices. Default: FALSE
#' @keywords datasets
#' @export
#' @examples
#' export.dataset(df, 'mydataset.tsv')

export.dataset <- function(df, path, sep='\t', header=TRUE, index=FALSE, transpose=FALSE) {
    if (transpose) {
        df <- as.data.frame(t(as.matrix(df)))
    }
    ext <- tools::file_ext(path)
    if (ext == 'gz') {
        file <- gzfile(path, 'wt')
    } else if (ext %in% c('bz', 'bz2')) {
        file <- bzfile(path, 'wt')
    } else if (ext == 'xz') {
        file <- xzfile(path, 'wt')
    } else {
        file <- file(path, 'wt')
    }
    write.table(df, file, append=FALSE, quote=FALSE, sep=sep, col.names=header, row.names=index)
    close(file)
}

#' Export A Graph To A File
#'
#' This function allows you to graph a geneset to a file.
#' @param g Graph to export.
#' @param path Output file path.
#' @param sep Separator / delimiter character (optional). Default: TAB
#' @param header Whether or not to include a header (optional). Default: TRUE
#' @param index Whether or not to include an index column (optional). Default: TRUE
#' @param from Input format (optional): 'auto', 'adjacency', 'edges', 'graph', 'igraph', or 'bnlearn'. Default: 'auto'
#' @param from Output format (optional): 'adjacency' or 'edges'. Default: 'adjacency'
#' @keywords graphs
#' @export
#' @examples
#' export.graph(g, 'mygraph.tsv')

export.graph <- function(g, path, sep='\t', header=TRUE, index=TRUE,
                         from=c('auto', 'adjacency', 'edges', 'graph', 'igraph', 'bnlearn'), to=c('adjacency','edges')) {
    to <- match.arg(to)
    from <- match.arg(from)
    if (from=='auto') {
        from <- detect.format(g)
    }
    df <- convert.format(g, from=from, to=to)
    ext <- tools::file_ext(path)
    if (ext == 'gz') {
        file <- gzfile(path, 'wt')
    } else if (ext %in% c('bz', 'bz2')) {
        file <- bzfile(path, 'wt')
    } else if (ext == 'xz') {
        file <- xzfile(path, 'wt')
    } else {
        file <- file(path, 'wt')
    }
    write.table(df, file, append=FALSE, quote=FALSE, sep=sep, col.names=header, row.names=index)
    close(file)
}

#' Select (automatically) the most appropriate gene columns and cell rows of your dataset.
#'
#' This function allows you to (automatically) select the most appropriate gene columns and cell rows of your dataset.
#' @param df Dataset.
#' @param filter.first Whether to filter the cells or the genes first. Default: cells
#' @param genes Geneset with features (optional).
#' @param selected.genes User-selected genes (optional).
#' @param features Features you want to select (optional).
#' @param max.genes Maximum number of selected genes (optional).
#' @param max.cells Maximum number of selected cells (optional).
#' @param genes.criteria Criteria to be applied when it is necessary to select a maximum number of genes from the previously filtered ones. Default: 'non.zeros'
#' @param cells.criteria Criteria to be applied to select cells. Default: 'non.zeros'
#' @param min.non.zeros Minimum number of non-zero values per gene (optional).
#' @param cor Using correlation to exclude unconnected genes (optional).
#' @param cor.method Correlation method: 'spearman', 'kendall', or 'pearson'. Default: 'spearman'
#' @param cor.threshold Correlation threshold. Default: 0.1
#' @keywords select genes
#' @export
#' @examples
#' df <- filter.dataset(df, max.genes=100, max.cells=2000)

filter.dataset <- function(df, filter.first=c('cells','genes'), genes=NULL, selected.genes=NULL, features=NULL,
                           max.genes=100, max.cells=2000, genes.criteria=c('non.zeros','mean','variance','random'),
                           cells.criteria=c('non.zeros','mean','variance','random'), min.non.zeros=10,
                           cor=TRUE, cor.method=c('spearman','kendall','pearson'), cor.threshold=0.25) {
    filter.first <- match.arg(filter.first)
    genes.criteria <- match.arg(genes.criteria)
    cells.criteria <- match.arg(cells.criteria)
    cor.method <- match.arg(cor.method)

    if (filter.first=='cells') {
        df <- select.cells(df, max.cells=max.cells, selection.criteria=cells.criteria)
        df <- select.genes(df, genes=genes, selected.genes=selected.genes, features=features,
                           max.genes=max.genes, selection.criteria=genes.criteria, min.non.zeros=min.non.zeros,
                           cor=cor, cor.method=cor.method, cor.threshold=cor.threshold)
    } else {
        df <- select.genes(df, genes=genes, selected.genes=selected.genes, features=features,
                           max.genes=max.genes, selection.criteria=genes.criteria, min.non.zeros=min.non.zeros,
                           cor=cor, cor.method=cor.method, cor.threshold=cor.threshold)
        df <- select.cells(df, max.cells=max.cells, selection.criteria=cells.criteria)
    }
    return(df)
}

#' Select (automatically) the most appropriate gene columns of your dataset.
#'
#' This function allows you to (automatically) select the most appropriate gene columns of your dataset.
#' @param df Dataset.
#' @param genes Geneset with features (optional).
#' @param selected.genes User-selected genes (optional).
#' @param features Features you want to select (optional).
#' @param max.genes Maximum number of selected genes (optional).
#' @param selection.criteria Criteria to be applied when it is necessary to select a maximum number of genes from the previously filtered ones. Default: 'non.zeros'
#' @param min.non.zeros Minimum number of non-zero values per gene (optional).
#' @param cor Using correlation to exclude unconnected genes (optional).
#' @param cor.method Correlation method: 'spearman', 'kendall', or 'pearson'. Default: 'spearman'
#' @param cor.threshold Correlation threshold. Default: 0.1
#' @keywords select genes
#' @export
#' @examples
#' df <- select.genes(df, max.genes=100, min.non.zeros=2)
#' df <- select.genes(df, genes=genes, features=c('tumor.suppressor', 'breast.cancer'), cor=TRUE, cor.threshold=0.7)

select.genes <- function(df, genes=NULL, selected.genes=NULL, features=NULL,
                         max.genes=100, selection.criteria=c('non.zeros','mean','variance','random'), min.non.zeros=10,
                         cor=TRUE, cor.method=c('spearman','kendall','pearson'), cor.threshold=0.25) {
    selection.criteria <- match.arg(selection.criteria)
    cor.method <- match.arg(cor.method)
    if (is.null(selected.genes)) {
        selected.genes <- colnames(df)
    }
    selected.genes <- selected.genes[colSums(df!=0) >= min.non.zeros]
    if (!is.null(genes) & !is.null(features)) {
        f_genes <- c()
        for (f in features) {
            tmp <- intersect(selected.genes, genes[genes[f]==TRUE,]$name)
            f_genes <- union(f_genes, tmp)
        }
        selected.genes <- f_genes
    }
    df <- subset(df, select=selected.genes)
    if (cor) {
        A <- cor(df, method=cor.method)
        diag(A) <- 0
        A[abs(A) < cor.threshold] <- 0
        A <- drop.all.zeros(A, square.matrix='all')
        selected.genes <- colnames(A)
    }
    if (is.null(selected.genes)) {
        if (length(colnames(df)) > max.genes) {
            selected.genes <- switch(selection.criteria,
                non.zeros = {
                    nz <- as.list(colSums(df!=0))
                    selected.genes <- names(nz[order(unlist(nz), decreasing=TRUE)])[1:max.genes]
                },
                mean = {
                    m <- as.list(sapply(df, mean))
                    selected.genes <- names(m[order(unlist(m), decreasing=TRUE)])[1:max.genes]
                },
                variance = {
                    v <- as.list(sapply(df, var))
                    selected.genes <- names(v[order(unlist(v), decreasing=TRUE)])[1:max.genes]
                },
                random = {
                    selected.genes <- sample(colnames(df), max.genes, replace=FALSE)
                }
            )
            df <- subset(df, select=selected.genes)
        }
    } else if (length(selected.genes) > max.genes) {
        df <- subset(df, select=selected.genes)
        selected.genes <- switch(selection.criteria,
            non.zeros = {
                nz <- as.list(colSums(df!=0))
                selected.genes <- names(nz[order(unlist(nz), decreasing=TRUE)])[1:max.genes]
            },
            mean = {
                m <- as.list(sapply(df, mean))
                selected.genes <- names(m[order(unlist(m), decreasing=TRUE)])[1:max.genes]
            },
            variance = {
                v <- as.list(sapply(df, var))
                selected.genes <- names(v[order(unlist(v), decreasing=TRUE)])[1:max.genes]
            },
            random = {
                selected.genes <- sample(colnames(df), max.genes, replace=FALSE)
            }
        )
        df <- subset(df, select=selected.genes)
    } else {
        df <- subset(df, select=selected.genes)
    }
    df <- drop.all.zeros(df)
    return(df)
}

#' Select (automatically) the most appropriate cell rows of your dataset.
#'
#' This function allows you to (automatically) select the most appropriate cell rows of your dataset.
#' @param df Dataset.
#' @param max.cells Maximum number of selected cells (optional).
#' @param selection.criteria Criteria to be applied to select cells. Default: 'non.zeros'
#' @keywords select cells
#' @export
#' @examples
#' df <- select.cells(df, max.cells=2000)

select.cells<- function(df, max.cells=2000, selection.criteria=c('non.zeros','mean','variance','random')) {
    selection.criteria <- match.arg(selection.criteria)
    if (max.cells > nrow(df)) {
        max.cells <- nrow(df)
    }
    selected.cells <- switch(selection.criteria,
        non.zeros = {
            nz <- as.list(rowSums(df!=0))
            selected.cells <- names(nz[order(unlist(nz), decreasing=TRUE)])[1:max.cells]
        },
        mean = {
            m <- as.list(apply(df, 1, mean))
            selected.cells <- names(m[order(unlist(m), decreasing=TRUE)])[1:max.cells]
        },
        variance = {
            v <- as.list(apply(df, 1, var))
            selected.cells <- names(v[order(unlist(v), decreasing=TRUE)])[1:max.cells]
        },
        random = {
            selected.cells <- sample(rownames(df), max.cells, replace=FALSE)
        }
    )
    df <- df[selected.cells, ]
    df <- drop.all.zeros(df)
    return(df)
}

#' Split A Dataset Into Training And Test Datasets
#'
#' This function allows you to split a dataset into training and test datasets.
#' @param df Dataset.
#' @param m Size of training dataset (optional). Default: nrow(df)/2
#' @keywords training test dataframe
#' @export
#' @examples
#' splitted.df <- dataset.split(df)
#' splitted.df$train
#' splitted.df$test

dataset.split <- function(df, m=NULL) {
    if (is.null(m)) {
        m <- nrow(df)/2
    }

    df <- drop.all.zeros(df)
    nz <- as.list(colSums(df!=0))
    last.gene <- length(nz)-1
    selected.genes <- names(nz[nz>=10])
    df <- subset(df, select=selected.genes)

    R <- 1
    repeat {
        n.genes <- ncol(df)
        ixs <- sample(1:nrow(df), size=m, replace=FALSE)
        train <- df[ixs,]
        test <- df[-ixs,]
        train <- drop.all.zeros(train, rows=TRUE, columns=TRUE)
        test <- drop.all.zeros(test, rows=TRUE, columns=TRUE)
        if (dim(train)[2]==n.genes & dim(test)[2]==n.genes) {
            break
        } else {
            R <- R+1
            if (R%%100==0) {
                nz <- as.list(colSums(df!=0))
                last.gene <- length(nz)-1
                selected.genes <- names(nz[order(unlist(nz), decreasing=TRUE)])[1:last.gene]
                df <- subset(df, select=selected.genes)
                df <- drop.all.zeros(df)
            }
        }
    }
    return(list(
        train = train,
        test = test
    ))
}

#' Create a Ground Truth Graph
#'
#' This function allows you to create a ground truth graph from a geneset available via RESTful API or from a local list of edges.
#' @param x Download Code (must be of type 'TF-Target Interactions') or list of edges (data.frame or matrix).
#' @param to Output format ('adjacency', 'edges', 'igraph', or 'bn') (optional).
#' @keywords groundtruth graph
#' @export
#' @examples
#' gt <- ground.truth(2)

ground.truth <- function(x, to=c('igraph', 'adjacency', 'edges', 'graph', 'bnlearn')) {
    to <- match.arg(to)
    if (class(x)=='numeric') {
        genesets <- list.genesets()
        type <- genesets[genesets$download.code==x,]$dataset
        if (grepl('TF-Target Interactions', type, ignore.case=TRUE)) {
            geneset <- download.geneset(x)
        } else {
            message(paste('Wrong type of geneset?', '->', type))
            geneset <- download.geneset(x)
        }
    } else if (class(x) %in% c('data.frame','matrix')) {
        fmt <- detect.format(x)
        if (fmt=='adjacency') {
            g <- convert.format(x, to=to)
            return(g)
        }
    } else {
        return(NULL)
    }
    mtx <- as.adjacency(subset(geneset, select=c('from', 'to')))
    if (!is.null(geneset$type)) {
        repression <- geneset[geneset$type=='Repression',]
        for (i in 1:nrow(repression)) {
            gene.from <- repression[i,'from']
            gene.to <- repression[i,'to']
            mtx[gene.from, gene.to] <- -1
        }
    }
    g <- convert.format(mtx, to=to)
    return(g)
}

#' Plot expression histograms of genes in your dataset.
#'
#' This function allows you to plot the expression histograms of genes in your dataset.
#' @param df Dataset.
#' @param selected.genes User-selected genes (optional).
#' @param n.cols Number of column in the plot (optional). Default: 3
#' @keywords plot genes expression
#' @export
#' @examples
#' gene.histogram(df)

gene.histogram <- function(df, selected.genes=NULL, n.cols=3) {
    if (is.null(selected.genes)) {
        selected.genes <- colnames(df)
    }
    n.rows = ceiling(length(selected.genes)/n.cols)
    par(mfrow = c(n.rows, n.cols), mar = c(2, 2, 2, 2))
    for (gene in selected.genes) {
        x <- df[, gene]
        hist(x, prob=TRUE, xlab=gene, ylab='', main='', col=rgb(0.9,0.9,0.9))
        lines(density(x), lwd=2, col='darkred')
        curve(dnorm(x, mean=mean(x), sd=sd(x)), from=min(x), to=max(x), add=TRUE, lwd=2, col='darkgreen')
    }
}

#' Plot expression correlation of genes in your dataset.
#'
#' This function allows you to plot the expression correlation of genes in your dataset.
#' @param df Dataset.
#' @param selected.genes User-selected genes (optional).
#' @param method Correlation method: 'spearman', 'kendall', or 'pearson'. Default: 'spearman'
#' @keywords correlation genes expression
#' @export
#' @examples
#' gene.correlation(df)

gene.correlation <- function(df, selected.genes=NULL, method=c('spearman','kendall','pearson')) {
    method <- match.arg(method)
    if (is.null(selected.genes)) {
        selected.genes <- colnames(df)
    }
    pairs(df[, selected.genes],
       upper.panel = function(x, y, ...) {
         points(x=x, y=y, col=rgb(0.0,0.7,0.7))
         abline(coef(lm(y ~ x)), col='red', lwd=2)
       },
       lower.panel = function(x, y, ...) {
         par(usr = c(0,1,0,1))
         text(x=0.5, y=0.5, round(cor(x, y, method=method), 2), cex=2)
       }
    )
}

#' Gene Clustering
#'
#' This function allows you to cluster genes in your dataset by expression correlation.
#' @param df Dataset.
#' @param selected.genes User-selected genes (optional).
#' @param cor.method Correlation method: 'spearman', 'kendall', or 'pearson'. Default: 'spearman'
#' @keywords clustering genes expression
#' @export
#' @examples
#' gene.clustering(df)

gene.clustering <- function(df, selected.genes=NULL, cor.method=c('spearman','kendall','pearson')) {
    cor.method <- match.arg(cor.method)
    if (is.null(selected.genes)) {
        selected.genes <- colnames(df)
    }
    rho <- cor(df[, selected.genes], method=cor.method)
    diag(rho) <- NA
    palette.breaks <- seq(0,1,0.05)
    palette <- colorRampPalette(c(rgb(0.0,0.7,0.7), 'black', rgb(0.7,0.0,0.7)))
    par(oma = c(2,2,2,2))
    gplots::heatmap.2(rho, scale='none', trace='none', revC=TRUE, col=palette, breaks=palette.breaks, tracecol='yellow')
}
rlebron-bioinfo/gnlearn documentation built on July 25, 2020, 12:38 p.m.