R/MIGTranscriptome_retrieval.R

Defines functions connect showDatasets getTablename getMatrix convertGene getProbes getExpression getMetadata getMetadataList sortMetadata getContrasts getSignificant getResultSet runPCA getPCA getVE tabulateResults getDiffMatrix getDatasetToContrastNames buildComparisonSet

Documented in connect convertGene getExpression getMatrix getMetadata getProbes getTablename showDatasets sortMetadata

# Data accessing from within R using RSQLite3

library(RSQLite)

##################################################
##################################################
##################################################

#' Connect to the database
#'
#' Connects to the SQLite database
#' @param db database to connect to (defaults to csvdb)
#' @return an sqlite connection
#' @import RSQLite 
#' @examples
#' connect(db="csvdb")
#' @export

connect <- function(db="csvdb"){

    # connect to a database
    sqlite <- dbDriver("SQLite")
    con <- dbConnect(sqlite, db)
    return(con)
}

##################################################
##################################################
##################################################

#' Show datasets and their attributes
#'
#' Display datasets and attributes related to those
#' datasets including metadata and contacts
#' @param connection RSQLite database connection
#' @return data frame of datasets and their attributes
#' @examples
#' showDatasets(connect("csvdb"))
#' @export

showDatasets <- function(connection){

    # return data frame of dataset information
    statement = 'SELECT * FROM reference'
    datasets <- dbGetQuery(connection, statement) 
    return(datasets)
}

##################################################
##################################################
##################################################

#' Return tablename based on attributes
#'
#' Return tablename based on attributes
#' @param dataset dataset to base tablename prefix on
#' @param type one of matrix, probe2gene_map, metadata
#' @return string
#' @export
#' @examples
#' getTablename("MIGTranscriptome_0001", type="matrix")

getTablename <- function(dataset, type="matrix"){

    # return tablename as string depending on type
    tablename = paste(dataset, type, sep="_")
    return(tablename)
}

##################################################
##################################################
##################################################

#' Return matrix of expression values for a given dataset
#'
#' Return a normalised expression matrix for a given
#' dataset in the database
#' @param connection sqlite connection
#' @param dataset string (dataset to retrieve matrix for)
#' @return data frame
#' @examples
#' getMatrix(connect(), "MIGTranscriptome_0001")
#' @export

getMatrix <- function(connection, dataset){

    # return dataframe of normalised counts
    tablename <- getTablename(dataset, type="matrix")
    statement <- paste0('SELECT * from ', tablename)
    mat <- dbGetQuery(connection, statement)
    rownames(mat) <- mat$test_id
    mat <- mat[,c(2:ncol(mat))]
    return(mat)
}

##################################################
##################################################
##################################################

#' Convert gene name to uppercase
#'
#' Convert gen name to uppercase
#' @param gene string (gene name in upper or lower case)
#' @return string
#' @examples
#' convertGene("S100a9") # returns S100A9

convertGene <- function(gene){

    # return upper case version of input
    gene <- toupper(gene)
    gene <- paste0('"', gene, '"')
    return(gene)
}

##################################################
##################################################
##################################################

#' Return probe/ensembl id set 
#'
#' Get all probes/ensembl ids that match a given gene
#' from the database
#' @param connection sqlite connection
#' @param dataset string (dataset to retrieve ids from)
#' @param gene gene to search for matching probes/ensembl ids
#' @return vector of probes/ensembl gene ids
#' @export
#' @examples
#' getProbes(connect(), "MIGTranscriptome_0001", "S100a9")

getProbes <- function(connection, dataset, gene){

    # return vector of probes/ensembl ids for a specified
    # gene
    tablename <- getTablename(dataset, type="probe2gene_map")
    statement <- paste0('SELECT probe FROM ', tablename, ' WHERE gene_name==', convertGene(gene))
    probes <- dbGetQuery(connection, statement)
    return(as.character(probes$probe))
}

##################################################
##################################################
##################################################

#' Return expression matrix for a given gene
#'
#' Return an expression matrix that contains values
#' for each probe/ensembl id for the given gene
#' @param connection sqlite connection
#' @param dataset string (dataset to retrieve ids from)
#' @param gene gene to get expression values for
#' @return data frame of expression values
#' @examples
#' getExpression(connect(), "MIGTranscriptome_0001", "S100a9")
#' @export

getExpression <- function(connection, dataset, gene){

    # return expression matrix for probes/ensembl ids
    # matching gene
    probes <- getProbes(connection, dataset, gene)
    mat <- getMatrix(connection, dataset)
    mat <- mat[probes,]
    return(mat)
}

##################################################
##################################################
##################################################

#' Get metadata
#'
#' Return the metadata for a given dataset
#' @param connection sqlite connection
#' @param dataset string (dataset to retrieve metadata for)
#' @examples
#' getMetadata(connect(), "MIGTranscriptome_0001")
#' @export

getMetadata <- function(connection, dataset){

    # return data frame of metadata
    tablename <- getTablename(dataset, type="metadata")
    statement <- paste0('SELECT * FROM ', tablename)
    metadata <- dbGetQuery(connection, statement)
    rownames(metadata) <- metadata$sample
    return(metadata)
}

##################################################
##################################################
##################################################

#' Get list of variables
#'
#' Return a list of metadata variables for a given dataset
#' @param connection sqlite connection
#' @param dataset string (dataset to retrieve metadata for)
#' @examples
#' getMetadataList(connect(), "MIGTranscriptome_0001")
#' @export

getMetadataList <- function(connection, dataset){

    # return a list of variables from metadata
    statement <- paste0('SELECT metadata FROM reference WHERE dataset==', '"', dataset, '"')
    metadata <- dbGetQuery(connection, statement)$metadata
    metadata <- unlist(strsplit(metadata, ","))
    return(metadata)
}

##################################################
##################################################
##################################################

#' Sort metadata
#'
#' Sort metadata by sample names given in an expression matrix
#' @param mat expression matrix (data frame)
#' @param metadata metadata returned by getMetadata (data frame)
#' @export
#' @examples
#' mat <- getMatrix(connect(), "MIGTranscriptome_0001")
#' metdata <- getMetadata(connect(), "MIGTranscriptome_0001")
#' sortMetadata(mat, metadata)

sortMetadata <- function(mat, metadata){

    # sort metadata according to columns in
    # matrix
    metadata <- metadata[colnames(mat),]
    return(metadata)
}


##################################################
##################################################
##################################################

#' Get contrasts
#'
#' Get contrasts for a specified dataset
#' @param connection database connection
#' @param dataset dataset of interest in database
#' @export
#' @examples
#' getContrasts(conn, "MIGTranscriptome_0001")

getContrasts <- function(conn, dataset){

    statement <- paste0('SELECT contrasts FROM reference WHERE dataset==', '"', dataset, '"')
    contrasts <- dbGetQuery(conn, statement)
    contrasts <- gsub("-", "_", contrasts$contrasts)
    contrasts <- unlist(strsplit(contrasts, ","))
    return(contrasts)
}

##################################################
##################################################
##################################################

#' Get significant sets
#'
#' Get significant expression differences for dataset and contrast
#' of interest
#' @param connection database connection
#' @param dataset dataset of interest in database
#' @param contrast contrast of interest
#' @param lfc log2 fold change threshold for defining significance
#' @param padj adjusted p-value threshold for defining significance
#' @param gene gene to find out if significant
#' @export
#' @examples
#' getSignificant(conn, "MIGTranscriptome_0001", "SNHh_TSB", "IL10")

getSignificant <- function(connection, dataset, contrast, lfc, padj, gene){

    # input comes as characters
    # so need to change here
    lfc <- as.numeric(lfc)
    padj <- as.numeric(padj)

    probes <- getProbes(connection, dataset, gene)
    if (length(probes) == 1){
        probes <- convertGene(probes)
    }
    
    probes <- paste0(probes, collapse=",")
    tablename <- getTablename(getTablename(dataset, contrast), type="result")

    # query
    statement <- paste0('SELECT * FROM ', tablename, ' WHERE test_id IN ', '(', probes, ')', ' AND padj < ', padj, ' AND ABS(l2fold) > ', lfc)

    significant <- dbGetQuery(connection, statement)
    if (nrow(significant) == 0){
        significant <- NA
    }
    else
    {
    significant$dataset <- dataset
    significant <- significant[, c("dataset", "test_id", "l2fold", "padj")]
    significant$test_id <- as.character(significant$test_id)
    }
    return(significant)		
}

##################################################
##################################################
##################################################

#' Get differential expression results
#'
#' Get the differential expression results for
#' a given dataset and contrast
#' @param conn connection
#' @param dataset dataset to choose
#' @param contrast contrast of interest
#' @export
#' @examples
#' getResultSet(connect(), "MIGTranscriptome_0001", "SNHh_TSB")

getResultSet <- function(conn, dataset, contrast){

    tablename <- getTablename(dataset, type="result")
    tablename <- paste0(dataset, "_", contrast, "_result")
    statement <- paste0('SELECT * FROM ', '"', tablename, '"')
    dat <- dbGetQuery(conn, statement)
    probe2gene <- getTablename(dataset, type="probe2gene_map")
    statement <- paste0('SELECT * FROM ', probe2gene)
    probe2gene <- dbGetQuery(conn, statement)
    rownames(probe2gene) <- as.character(probe2gene$probe)
    dat$gene_name <- probe2gene[as.character(dat$test_id),]$gene_name
    return(dat)
}

##################################################
##################################################
##################################################

#' Run PCA
#'
#' Run PCA on matrix
#' @param df data frame of normalised counts
#' @export
#' @examples
#' runPCA(df)

runPCA <- function(df, scale=TRUE){

    pc <- prcomp(t(df), scale=scale)
    return (pc)
}

##################################################
##################################################
##################################################

#' Get PCA
#'
#' Get princple components
#' @param pc prcomp object
#' @export
#' @examples
#' getPCA(runPCA(df))

getPCA <- function(pc){

    pcs <- data.frame(pc$x)
    return(pcs)
}

##################################################
##################################################
##################################################

#' Get variance explained
#'
#' Get variance explained for prcomp object
#' @param pc prcomp object
#' @param component string (component to get VE for)
#' @export
#' @examples
#' getVE(runPCA(df))

getVE <- function(pc, component="PC1"){

    pve <- summary(pc)$importance[,component][[2]]
    return (pve)
}



##################################################
##################################################
##################################################

#' Tabulate results

#' Tabulate probes/genes sorted by padj
#' @param connection connection to database
#' @param dataset dataset to look at
#' @param contrast contrast of interest
#' @export
#' @examples
#' tabulateResults(connect(), "MIGTranscriptome_0001", "SNHh_TSB")

tabulateResults <- function(connection, dataset, contrast){

    result.tablename <- getTablename(dataset, type="result")
    result.tablename <- paste0(dataset, "_", contrast, "_result")
    probe2gene <- getTablename(dataset, type="probe2gene_map")
    statement <- paste0('SELECT a.*, b.gene_name FROM ', result.tablename, ' as a, ', probe2gene, ' as b ', ' WHERE a.test_id==b.probe ')
    result <- dbGetQuery(connection, statement)
    res <- result[order(result$padj, decreasing=FALSE),]
    return(res)
}

##################################################
##################################################
##################################################

#' Get expression values for differentially expressed probes/genes

#' Get expression matrix for differentially expressed probes/genes
#' @param connection connection to database
#' @param dataset dataset to see
#' @param contrast contrast of interest
#' @param lfc log fold change threshold
#' @param padj adjusted p-value threshold
#' @export
#' @examples
#' getDiffMatrix(connect(), "MIGTranscriptome_0001", "SNHh_TSB", 1, 0.05)

getDiffMatrix <- function(connection, dataset, contrast, lfc, padj){

    result.tablename <- getTablename(dataset, type="result")
    result.tablename <- paste0(dataset, "_", contrast, "_result")

    matrix.tablename <- getTablename(dataset, type="matrix")

    statement <- paste0('SELECT a.* FROM ', matrix.tablename, ' as a, ', result.tablename,  ' as b  WHERE a.test_id==b.test_id AND b.padj < ', padj, ' AND abs(b.l2fold) > ', lfc)
    mat <- dbGetQuery(connection, statement)
    rownames(mat) <- mat$test_id
    mat <- mat[,2:ncol(mat)]
    return(mat)
}

##################################################
##################################################
##################################################

#' Get choices for comparing datasets

#' These form the choices for comparisons
#' @param connection connection to database
#' @export
#' @examples
#' getDatasetToContrastNames(connect())

getDatasetToContrastNames <- function(connection){

    statement <- 'SELECT dataset, contrasts FROM reference'
    datasets <- dbGetQuery(connection, statement)
    comparison.choices <- c()
    for (dataset in datasets$dataset){
        contrasts <- getContrasts(connection, dataset)
	for (contrast in contrasts){
	    newname <- paste0(dataset, "__", contrast)
	    comparison.choices <- append(newname, comparison.choices)
        }
    }
    return(comparison.choices)
}

##################################################
##################################################
##################################################

#' Build dataframe for comparing fold changes

#' Build data frame for comparing fold changes
#' @param connection connection to database
#' @param dataset1 string (dataset1)
#' @param dataset2 string (dataset2)
#' @export
#' @examples
#' To follow

buildComparisonSet <- function(connection, dataset1, dataset2){


    d1 <- unlist(strsplit(dataset1, "__"))
    d2 <- unlist(strsplit(dataset2, "__"))

    data1 <- d1[1]
    contrast1 <- d1[2]
    data2 <- d2[1]
    contrast2 <- d2[2]

    tablename1 <- getTablename(data1, type=contrast1)
    tablename1 <- getTablename(tablename1, type="result")
    probe2gene1 <- getTablename(data1, type="probe2gene_map")

    tablename2 <- getTablename(data2, type=contrast2)
    tablename2 <- getTablename(tablename2, type="result")
    probe2gene2 <- getTablename(data2, type="probe2gene_map")

    statement1 <- paste0('SELECT a.gene_name, AVG(b.l2fold) as l2fold, b.padj as padj FROM ', probe2gene1,  ' as a, ',  tablename1, ' as b ', 'WHERE a.probe==b.test_id GROUP BY a.gene_name')
    l2fold1 <- dbGetQuery(connection, statement1)

    statement2 <- paste0('SELECT a.gene_name, AVG(b.l2fold) as l2fold, b.padj as padj FROM ', probe2gene2,  ' as a, ',  tablename2, ' as b ', 'WHERE a.probe==b.test_id GROUP BY a.gene_name')
    l2fold2 <- dbGetQuery(connection, statement2)

    df <- merge(l2fold1, l2fold2, by.x="gene_name", by.y="gene_name")

    colnames(df) <- c("gene_name", paste0(dataset1, "_l2fold"), paste0(dataset1, "_padj"), paste0(dataset2, "_l2fold"), paste0(dataset2, "_padj"))
    return(na.omit(df))

}    

##################################################
##################################################
##################################################
nickilott/MIGTranscriptomeExplorer documentation built on Dec. 9, 2019, 2:11 p.m.