#' Read in omics data.
#'
#' This function reads omics data: differentially expressed genes and relatively
#' differentially abundant proteins with corresponding fold changes for each
#' time point.
#'
#' @param tp_prots numeric vector of protein timepoints used in experiment;
#' in case of single time point experiments simply assign an experiment number
#' (e.g. 1).
#' @param tp_genes numeric vector of gene/transcript timepoints used in
#' experiment; in case of single time point experiments simply assign an
#' experiment number (e.g. 1).
#' @param omics list containing protein and gene IDs and fold changes:
#' Input are two lists, one for protein data, one for gene data,
#' both including a vector of all measured IDs as first element and a list as
#' second element including for each tp a dataframe with IDs and foldchanges
#' per timepoint.
#' @param PWdatabase character vector of pathway database names which should be
#' used for pathway identification, possible choices are "biocarta", "kegg",
#' "nci", "reactome".
#' @param TFtargetdatabase character vector of TF target database names which
#' should be used for transcription factor/target gene identification,
#' possible choices are "chea", "pazar", "userspec". In case the user is able
#' to provide an own list of transcription factor target gene matches, he can
#' indicate this via "userspec".
#' @return OmicsData object:
#' list of 4 elements (OmicsD, PathwayD, TFtargetsD,
#' Status); OmicsD containing omics data set + results (after analysis);
#' PathwayD containing selected pathway databases + biopax model;
#' TFtargetsD containing selected TF target gene databases + TF target gene data.
#' @keywords manip
#' @export
#' @examples
#' \dontrun{
#' data(OmicsExampleData)
#' data_omics = readOmics(tp_prots = c(0.25, 1, 4, 8, 13, 18, 24),
#' tp_genes = c(1, 4, 8, 13, 18, 24), OmicsExampleData,
#' PWdatabase = c("biocarta", "kegg", "nci", "reactome"),
#' TFtargetdatabase = c("userspec"))
#' }
readOmics <- function(tp_prots, tp_genes, omics, PWdatabase, TFtargetdatabase) {
OmicsData = list(OmicsD = list(OmicsDescription = list(
timepoints = list(Proteins = tp_prots, Genes = tp_genes),
allProteinIDs = unique(omics$P[[1]]),
allGeneIDs = unique(omics$G[[1]])),
OmicsDataSet = list(
ProteinData = list(),
GeneData = list()),
OmicsResults = list(
ProteinData = list(),
GeneData = list())),
PathwayD = list(PWdatabaseNames = PWdatabase,
BiopaxModel = list()),
TFtargetsD = list(TFtargetdatabaseNames = TFtargetdatabase,
TFtargetData = list()),
Status = 1,
PhosphoD = data.frame())
uni_protdata = data.frame()
for(plen in 1: length(tp_prots))
{uni_protdata = omics$P[[2]][[plen]][isUnique(omics$P[[2]][[plen]][,1]),]
OmicsData$OmicsD$OmicsDataSet$ProteinData[[plen]] = uni_protdata}
names(OmicsData$OmicsD$OmicsDataSet$ProteinData) = paste("tp", tp_prots,
sep = "")
uni_genedata = data.frame()
for(glen in 1: length(tp_genes))
{uni_genedata = omics$G[[2]][[glen]][isUnique(omics$G[[2]][[glen]][,1]),]
OmicsData$OmicsD$OmicsDataSet$GeneData[[glen]] = uni_genedata}
names(OmicsData$OmicsD$OmicsDataSet$GeneData) = paste("tp", tp_genes,
sep = "")
class(OmicsData) = "OmicsData"
return(OmicsData)
}
#' Reads in phosphoprotein downstream regulation information.
#'
#' This function reads in phosphoprotein downstream regulation
#' information from a txt file.
#'
#' @param data_omics OmicsData object.
#' @param phosphoreg txt file with 2 columns, first providing gene names, second
#' giving -1/1 annotation whether downstream regulation of phosphoprotein
#' is inhibiting or activating, respectively.
#'
#' @return OmicsData object: list of 4 elements (OmicsD, PathwayD, TFtargetsD,
#' Status); OmicsD containing omics data set + results (after analysis);
#' PathwayD containing selected pathway databases + biopax model;
#' TFtargetsD containing selected TF target gene databases + TF target gene data;
#' PhosphoD containing optionally phosphoprotein downstream regulation annotation.
#' @keywords manip
#' @export
#' @examples
#' \dontrun{
#' data(OmicsExampleData)
#' data_omics = readOmics(tp_prots = c(0.25, 1, 4, 8, 13, 18, 24),
#' tp_genes = c(1, 4, 8, 13, 18, 24), OmicsExampleData,
#' PWdatabase = c("biocarta", "kegg", "nci", "reactome"),
#' TFtargetdatabase = c("userspec"))
#' data_omics = readPhosphodata(data_omics,
#' phosphoreg = system.file("extdata", "phospho_reg_table.txt",
#' package = "pwOmics"))
#' }
readPhosphodata <- function(data_omics, phosphoreg) {
if(class(data_omics) != "OmicsData")
{ stop("Parameter 'data_omics' is not an OmicsData object.\n")}
data_omics$PhosphoD = as.data.frame(read.delim(phosphoreg, header = FALSE))
colnames(data_omics$PhosphoD) = c("Phosphoprotein","regulation")
message("Phosphoprotein information was read. \n")
return(data_omics)
}
#' Reads in chosen transcription factor target database information.
#'
#' This function reads in transcription factor information given the selected
#' transcription factor target gene database. The information is downloaded
#' via the AnnotationHub package and merged, if necessary.
#'
#' @param data_omics OmicsData object.
#' @param TF_target_path character vector indicating path of the txt file of
#' matching transcription factors and target genes; the file should be a txt
#' file with first column transcription factors and second column target gene
#' symbols without a header.
#' @param cell_match character indicating the cell line/cells for which the TF target
#' gene data should be extracted from the database; this is only possible for
#' chea database. Available cell-specific data from chea for matching are
#' "Hs578T", "Raji B cells and iDC", "MCF7", "THP-1", "Hela cells", "STHdh",
#' "H3396 breast cancer cells","HL60","HESC","T-ALL", "HPC-7",
#' "ovarian surface epithelium", "HaCaT", "HCT116","U2OS", "Wilms tumor-derived CCG99-9611",
#' "HepG2","HUMAN INTESTINAL CELL LINE CACO-2", "HEK293T","K562", "AK7",
#' "NEUROBLASTOMA","JURKAT","T-47D","LS174T", "MULTIPLE HUMAN CANCER CELL TYPES",
#' "501MEL", "PC3", "CACO-2", "FETAL_BRAIN", "HELA", "U937_AND_SAOS2",
#' "CD4_POS_T", "ERYTHROLEUKEMIA", "RHABDOMYOSARCOMA", "293T", "SW620",
#' "LYMPHOBLASTOID", "VCAP", "SK-N-MC", "CADO-ES1", "MEDULLOBLASTOMA", "M12",
#' "K562_HELA_HEPG2_GM12878", "NT2", "SHEP-21N", "LN229_GBM", "MCF-7", "MELANOMA",
#' "MYOFIBROBLAST", "NTERA2", "MEGAKARYOCYTES", "HMVEC", "ZR75-1", "TREG",
#' "TLL", "A2780", "MONOCYTES", "BEAS2B", "LNCAP PROSTATE CANCER CELL LINES",
#' "MCF10A", "GC-B", "BL", "IMR90", "EOC", "PCA", "PROSTATE_CANCER", "OVCAR3",
#' "MALME-3M", "HFKS", "HEK293", "HELA-AND-SCP4", "CD34+", "IB4-LCL", "MDA-MB-231",
#' "U87", "T47D", "Z138-A519-JVM2", "DLD1", "ATHEROSCLEROTIC-FOAM", "LCL-AND-THP1",
#' "NB4", "PFSK-1 AND SK-N-MC", "EP156T","GBM1-GSC","CD4+", "FIBROSARCOMA",
#' "LGR5+ INTESTINAL STEM CELL","NEUROBLASTOMA BE2-C".
#' If no tissue is given the data from all cells/cell lines are merged.
#' @param TF_filter_threshold integer defining a threshold number to
#' filter out those transcription factors having higher numbers of target genes
#' than 'TF_filter_threshold' from the further analysis
#'
#' @return OmicsData object - a list containing information about the user data
#' (timepoints, IDs, fold changes) and the selected databases chosen for the
#' analysis.
#' @return OmicsData object: list of 4 elements (OmicsD, PathwayD, TFtargetsD,
#' Status); OmicsD containing omics data set + results (after analysis);
#' PathwayD containing selected pathway databases + biopax model;
#' TFtargetsD containing selected TF target gene databases + TF target gene data.
#' @keywords manip
#' @export
#' @examples
#' \dontrun{
#' data(OmicsExampleData)
#' data_omics = readOmics(tp_prots = c(0.25, 1, 4, 8, 13, 18, 24),
#' tp_genes = c(1, 4, 8, 13, 18, 24), OmicsExampleData,
#' PWdatabase = c("biocarta", "kegg", "nci", "reactome"),
#' TFtargetdatabase = c("userspec"))
#' data_omics = readPhosphodata(data_omics,
#' phosphoreg = system.file("extdata", "phospho_reg_table.txt",
#' package = "pwOmics"))
#' data_omics = readTFdata(data_omics,
#' TF_target_path = system.file("extdata", "TF_targets.txt",
#' package = "pwOmics"))
#' }
readTFdata <- function(data_omics, TF_target_path, cell_match = 0, TF_filter_threshold = 0) {
if(class(data_omics) != "OmicsData")
{ stop("Parameter 'data_omics' is not an OmicsData object.\n")}
if(!"chea" %in% data_omics[[3]][[1]] & cell_match != 0)
{ stop("Matching of cell line/ cells option is only available for the chea
database.\n")}
if("chea" %in% data_omics[[3]][[1]] | "pazar" %in% data_omics[[3]][[1]])
{ah = AnnotationHub()}
TF_data_comb = data.frame()
if("chea" %in% data_omics[[3]][[1]])
{
chea = query(ah, "ChEA")
TF_data_chea = chea[[1]]
TF_data_chea = TF_data_chea[which(TF_data_chea[,"Species"]=="human" |
TF_data_chea[,"Species"]=="HUMAN"),]
if(cell_match != 0 && is.character(cell_match))
{TF_data_chea = TF_data_chea[which(TF_data_chea[,"CellType"] == cell_match),]}
TF_data_chea = data.frame(TF = TF_data_chea[,2],
target = TF_data_chea[,4])
TF_data_comb = rbind(TF_data_comb,TF_data_chea)
message("ChEA database information was read.\n")
}
if("pazar" %in% data_omics[[3]][[1]])
{
pazar = query(ah, "Pazar")
TF_data_pazar = list()
ret_list = c(1:2, 4:35, 37:46, 48:49, 51:62, 64:69, 71, 73:76, 78:84, 88: length(pazar))
for(k in ret_list)
{ if(class(pazar[[k]]) == "GRanges")
{ TF_data_pazar[[k]] = as.data.frame(pazar[[k]])
}else{
TF_data_pazar[[k]] = pazar[[k]]
}
}
TF_data_pazar = rbindlist(TF_data_pazar,use.names = TRUE, fill = TRUE)
TF_data_pazar = data.frame(TF_data_pazar)
TF_data_pazar = TF_data_pazar[which(TF_data_pazar[,"Species"]=="Homo sapiens"),]
target = getAlias_Ensemble(TF_data_pazar[,"EnsemblGeneAccession"])
target = target[match(TF_data_pazar[,"EnsemblGeneAccession"], target[,1] ),2]
TF_data_pazar = data.frame(TF = gsub("_HUMAN", "", TF_data_pazar[,"TFName"]),
target)
TF_data_pazar = unique(TF_data_pazar)
mouse_ind = grep("_MOUSE",
apply(as.data.frame(TF_data_pazar),2,as.character)[,1])
rat_ind = grep("_RAT",
apply(as.data.frame(TF_data_pazar),2,as.character)[,1])
TF_data_pazar = TF_data_pazar[-c(mouse_ind, rat_ind),]
TF_data_comb = rbind(TF_data_comb, TF_data_pazar)
message("Pazar database information was read.\n")
}
if("userspec" %in% data_omics[[3]][[1]])
{ if(!is.character(TF_target_path))
{stop("No path for user specified TF target gene list was given.
Either provide a pathname and the file or chose
another transcription factor target database.")
}else{
TF_data = readTFtargets(data_omics, TF_target_path)
colnames(TF_data)=c("TF", "target")
TF_data_comb = rbind(TF_data_comb, TF_data)}
message("User specified database information was read.\n")
}
if(is.numeric(TF_filter_threshold) & TF_filter_threshold != 0)
{TF_matrix = matrix(nrow = length(as.character(unique(TF_data_comb[,1]))), ncol = 2)
TF_matrix[,1] = as.character(unique(TF_data_comb[,1]))
for(k in 1: length(unique(TF_data_comb[,1])))
{ TF_matrix[k,2] = length(which(TF_data_comb[,1] == TF_matrix[k,1]))}
TF_matrix = TF_matrix[order(as.numeric(TF_matrix[,2])),]
TF_matrix = TF_matrix[-which(as.numeric(TF_matrix[,2])> TF_filter_threshold),]
TF_data_comb = TF_data_comb[which(TF_data_comb[,1] %in% TF_matrix[,1]),]}
data_omics[[3]][[2]] = TF_data_comb
#increment status
if(length(data_omics[[2]][[2]])!=0 & length(data_omics[[3]][[2]]) != 0)
{data_omics[[4]] = data_omics[[4]] +1
}else{
data_omics[[4]] = data_omics[[4]]
}
return(data_omics)
}
#' Read in pathway database data needed for pathway identification.
#'
#' This function reads pathway data of the chosen database(s) via the
#' AnnotationHub [1] package and rBiopaxParser [2] package.
#' Takes a lot of time for a high number of proteins and/or if all databases
#' are chosen.
#' First, chosen databases are retrieved, then new internal pathway IDs are
#' generated. Afterwards the genelists of the different databases are loaded or
#' generated, depending on the loadgenelists option. Pathway ID mapping takes
#' some time, especially for such big databases as reactome, so the genelists
#' are automatically stored in the current working folder and can be used
#' via loadgenelists in case you use this function again for easier and faster
#' usage...
#' Biopax level of retrieved databases is 2 by default.
#'
#' @param data_omics OmicsData object.
#' @param loadgenelists path of genelist RData files stored previously; all
#' genelists stored in this path are read in and used automatically if path is
#' given; if loadgenelists = FALSE, then genelists from pathway databases have
#' to be generated first.
#' @param biopax_level integer indicating biopax level of pathway database
#' information. default level is 2.
#'
#' @references 1. Morgan M, Carlson M, Tenenbaum D and Arora S. AnnotationHub:
#' Client to access AnnotationHub resources. R package version 1.99.75.
#' @references 2. Kramer F, Bayerlova M, Klemm F, Bleckmann A and Beissbarth T.
#' rBiopaxParser - an R package to parse, modify and visualize BioPAX data.
#'
#' @return list of OmicsData object and genelists for selected pathway databases.
#' @keywords manip
#' @export
#' @examples
#' \dontrun{
#' data(OmicsExampleData)
#' data_omics = readOmics(tp_prots = c(0.25, 1, 4, 8, 13, 18, 24),
#' tp_genes = c(1, 4, 8, 13, 18, 24), OmicsExampleData,
#' PWdatabase = c("biocarta", "kegg", "nci", "reactome"),
#' TFtargetdatabase = c("userspec"))
#' data_omics = readPhosphodata(data_omics,
#' phosphoreg = system.file("extdata", "phospho_reg_table.txt",
#' package = "pwOmics"))
#' data_omics = readTFdata(data_omics,
#' TF_target_path = system.file("extdata", "TF_targets.txt",
#' package = "pwOmics"))
#' data_omics_plus = readPWdata(data_omics,
#' loadgenelists = system.file("extdata/Genelists", package = "pwOmics"))
#' }
readPWdata <- function(data_omics, loadgenelists, biopax_level = 2) {
if(class(data_omics) != "OmicsData")
{stop("Parameter 'data_omics' is not an OmicsData object.")}
db_info = loadPWs(data_omics[[2]][[1]], biopax_level)
message("Pathway database information is loaded. \n")
intIDs = createIntIDs(data_omics, db_info)
message("New internal IDs for pathways were generated. \n")
data_omics[[2]][[2]] = createBiopaxnew(intIDs, data_omics[[2]][[1]])
message("New Biopax model was created with new internal pathway IDs. \n")
if(loadgenelists == FALSE)
{ genelists = genGenelists(intIDs, data_omics[[2]][[1]])
}else{
files_genelists = list.files(path = loadgenelists, pattern = "*.RData")
if(sum(file.exists(paste(loadgenelists,"/Genelist_",
data_omics[[2]][[1]], ".RData", sep = ""))) == 0)
{stop("No genelist files corresponding to your pathway database
choice have been found in the working directory. Check your
working folder or run readPWdata with loadgenelists = FALSE
to generate these files.")}
genelists = list()
for(s in 1: length(files_genelists))
{genelists[[s]] = get(load(paste(loadgenelists, files_genelists[s],
sep = "/")))}
}
message("Genelists of databases are loaded/generated. \n")
if(length(data_omics[[2]][[2]])!=0 & length(data_omics[[3]][[2]]) != 0)
{data_omics[[4]] = data_omics[[4]] +1
}else{
data_omics[[4]] = data_omics[[4]]
}
return(list(data_omics, genelists))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.