Nothing
#' Get GTEx Expression Data for Specific Organ
#'
#' This function retrieves gene expression data from the GTEx project that is specific to a certain organ.
#' It performs various checks and processing steps to ensure that the data is consistent and relevant to the
#' specified organ. The filtered and cleaned data is saved as an RDS file for further analysis.
#'
#' @param organ_specific A character string specifying the organ to filter the gene expression data by.
#'   It is compared for exact equality against the third column of the phenotype file (the primary site).
#' @param file_path A character string specifying the path to the GTEx gene expression data file
#'   (tab-separated, with a header; the probe identifier column must be named `sample`).
#' @param probe_map_path A character string specifying the path to the gtex_probeMap_gencode data file
#'   (tab-separated, with a header; its first two columns must be named `id` and `gene`).
#' @param pheno_path A character string specifying the path to the GTEx phenotype data file
#'   (tab-separated, with a header; its six columns are renamed positionally, so the sample identifier
#'   must be the first column and the primary site the third).
#' @param output_path A character string specifying the path where the output RDS file will be saved.
#'
#' @details The function begins by checking that the three input files exist and that the directory of
#' `output_path` exists, so that a bad path is reported before the (potentially large) files are read.
#' It then loads the files, attaches gene names to the expression data through the probe map, renames the
#' phenotype columns for clarity, and filters samples based on the specified organ. Only samples present in
#' both the phenotype and the expression data are retained. When several probes map to the same gene, only
#' the first one (in probe identifier order, as produced by `merge()`) is kept. Finally, the processed data
#' is saved as an RDS file.
#'
#' @return A data frame containing gene expression data for the specified organ.
#' Rows represent genes (row names are gene names), and columns represent samples. A data frame is returned
#' even when only a single sample matches. Note that this function also saves the
#' organ-specific GTEx data as an RDS file at the specified output path.
#'
#' @note The function will stop and throw an error if the input files do not exist, if the output directory
#' does not exist, if no samples are found for the specified organ, or if none of the organ's samples are
#' present in the expression data.
#'
#' @note The 'output_path' parameter should end with '.rds'; the function itself does not verify the
#' extension. It is also highly recommended that the path includes specific identifiers related to the
#' target samples. Please structure the 'output_path' following this pattern:
#' './your_directory/your_sample_type.gtex.rds'.
#'
#' @importFrom utils read.table
#' @importFrom dplyr distinct filter
#' @importFrom rlang .data
#' @export
get_gtex_exp <- function(organ_specific,
                         file_path,
                         probe_map_path,
                         pheno_path,
                         output_path) {
  # Check for the existence of the input files before doing any work
  input_paths <- c(file_path, pheno_path, probe_map_path)
  if (!all(file.exists(input_paths))) {
    stop("One or more of the input files do not exist.")
  }

  # Fail fast on an unwritable destination: otherwise the error would only
  # surface at saveRDS(), after all three files were loaded and merged.
  if (is.character(output_path) && !dir.exists(dirname(output_path))) {
    stop("The directory of 'output_path' does not exist: ", dirname(output_path))
  }

  # All three inputs are tab-separated tables with a header row.
  # check.names = FALSE keeps sample identifiers (which contain '-') intact.
  read_tsv <- function(path) {
    utils::read.table(path,
                      header = TRUE,
                      sep = "\t",
                      stringsAsFactors = FALSE,
                      check.names = FALSE)
  }
  expression <- read_tsv(file_path)
  probe_map <- read_tsv(probe_map_path)
  phenotype <- read_tsv(pheno_path)

  # Attach gene names to the expression rows. Only the first two probe map
  # columns (probe id, gene name) are needed; merge() orders rows by probe id.
  probe_map <- probe_map[, c(1, 2), drop = FALSE]
  annotated <- merge(probe_map, expression, by.x = "id", by.y = "sample")

  # Rename the phenotype columns positionally first, so that the sample and
  # primary-site columns are available under known names regardless of the
  # exact header spelling in the file.
  colnames(phenotype) <- c("Sample", "body_site_detail (SMTSD)", "primary_site",
                           "gender", "patient", "cohort")

  # Filter samples based on the specified organ; which() drops rows whose
  # primary site is NA instead of producing NA-filled rows.
  organ_rows <- which(phenotype$primary_site == organ_specific)
  specific_samples <- phenotype[organ_rows, , drop = FALSE]

  # If no corresponding samples are found, halt the function with an error message
  if (nrow(specific_samples) == 0) {
    stop("No samples found for the specified organ.")
  }

  # Report the number of samples found (message() adds no separators and
  # appends its own newline, so spacing is written out explicitly).
  message("Number of samples for ", organ_specific, ": ", nrow(specific_samples))

  # Keep only samples present in both the phenotype and the expression data
  valid_sample_names <- intersect(as.character(specific_samples$Sample),
                                  colnames(annotated))
  if (length(valid_sample_names) == 0) {
    stop("None of the samples for the specified organ are present in the expression data.")
  }

  # drop = FALSE keeps a data frame even when a single sample is selected
  gtex_data <- annotated[, c("gene", valid_sample_names), drop = FALSE]

  # Remove duplicate gene entries (first occurrence wins) and use gene names
  # as row names
  gtex_data <- gtex_data[!duplicated(gtex_data$gene), , drop = FALSE]
  rownames(gtex_data) <- gtex_data$gene
  gtex_data <- gtex_data[, -1, drop = FALSE] # Remove the 'gene' column, keeping only expression data

  # Save the results as an RDS file for future data analysis tasks
  saveRDS(gtex_data, output_path)
  return(gtex_data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.