# recode all (most...) functions
#' @title construct Phenotype data frame
#' @description This function is used to construct a Phenotype data.frame for ExpressionSet object
#' @details Although \code{ExpressionSet} object has a regular pattern for phenotype data, but diverse in different
#' studies. So if we want to merge data from different studies, we have to uniform all variable names of phenotype data
#' . This function is for this purpose and mainly used by \code{\link{create_Exprset}}. User can use this function to
#' create phenodata separately.
#' @param Study_ID a character. Set a ID for a study, like GSEXXXXX for GEO study.
#' @param Sample_ID a character. Set a ID for a sample, like GSMXXXXX for GEO samples.
#' @param Platform a character. Set platform for study. if it is \code{NULL}, \code{annotation} function
#' will be used to get the platform. It is ok to use annotation platform like org.Hs.eg.db
#' @param Type a character. Column name of \code{pData}. It is used to tell the tissue type, such as Normal, LUAD ..
#' @param Gender a character. Column name of \code{pData}. It is used to identify where the Gender column are.
#' @param Age like above.
#' @param Tumor_stage like above.
#' @param Family_history like above.
#' @param Smoking_status like above.
#' @param Tumor_feature like above. You can get what you think related to feature but not set in other argument.
#' @param Mutation_status like above.
#' @param Survival_status like above.
#' @param Survival_time like above.
#' @param Preprocessed_Method a character used to set what method used to preprocess raw data.
#' @param Matched_CNV_ID used to label sample ID which has paired CNV data
#' @param Tumor_Normal_Matched_ID used to label sample ID which is Tumor and Normal tissue matched.
#' @return a data.frame
#' @author Shixiang Wang <w_shixiang@163.com>
#' @seealso \code{\link{ExpressionSet}}
#' @export
construct_PhenoData <- function(Study_ID=NULL, Sample_ID=NULL, Platform=NULL, Type=NULL,
Gender=NA, Age=NA, Tumor_stage=NA, Family_history=NA,
Smoking_status=NA, Tumor_feature=NA, Mutation_status=NA, Survival_status=NA,
Survival_time=NA, Preprocessed_Method="RMA", Matched_CNV_ID=NA,
Tumor_Normal_Matched_ID=NA){
dat <- data.frame(Study_ID=Study_ID, Sample_ID=Sample_ID, Platform=Platform, Type=Type,
Gender=Gender, Age=Age, Tumor_stage=Tumor_stage, Family_history=Family_history,
Smoking_status=Smoking_status, Tumor_feature=Tumor_feature,
Mutation_status=Mutation_status, Survival_status=Survival_status,
Survival_time=Survival_time, Preprocessed_Method=Preprocessed_Method,
Matched_CNV_ID=Matched_CNV_ID, Tumor_Normal_Matched_ID=Tumor_Normal_Matched_ID,
stringsAsFactors = FALSE)
return(dat)
}
#' @title decode labels by separator
#' @description This function is created to separate content of \code{pData} of \code{\link{ExpressionSet}} object.
#' @param x a character to be separated.
#' @param sep a character used to separate \code{x}. Use ":" as default.
#' @return a character. If \code{x} has \code{sep} (such as ":"), it will be chopped off and the second part will be returned.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @export
#' @examples x <- "cell type: A f "
#' decodeLables(x)
#' x <- "cell type: LCC"
#' decodeLables(x)
decodeLables <- function(x, sep=":"){
#x <- "cell type: A f "
#x <- "cell type: LCC"
# gsub(pattern = "^[ ]", replacement = "" ,unlist(strsplit(x, split=":"))[2])
x <- gsub(pattern = "^ ?([^ ].*[^ ]) ?$", replacement = "\\1" ,unlist(strsplit(x, split=sep))[2])
return(x)
}
#' @title apply decodeLables function
#' @description apply \code{decodeLables} function to a character vector.
#' @param x a character vector to be \code{\link{decodeLables}} on.
#' @return a character vector. Characters before separator will be removed and the second part returned.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @export
apply_decode <- function(x){
x <- as.character(x)
s <- c()
n <- length(x)
for (i in 1:n){
y <- decodeLables(x[i])
s <- c(s, y)
}
return(s)
}
#' @title identify whether a separator exists
#' @description identify whether a separator exists in character string. It will be \code{TRUE} that if
#' more than half elements have separator.
#' @param x a character vector.
#' @param separator a character identify separator, ":" is default.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @return a logical value.
#' @export
exists_Separator <- function(x, separator=":"){
# not all characters have separator, so we make a condition
# If there are more than half have separatory, make it true
n = length(x)
# x = any(grepl(pattern = separator, x = as.character(x)))
# return(x)
if(length(which(grepl(pattern = separator, x = as.character(x)))) > n/2 ){
return(TRUE)
}else{
return(FALSE)
}
}
#' @title create an ExpressionSet object from matched ExpressionSets
#' @description Auto-processing function for creating phenotype data and feature data
#' for \code{ExpressionSet} object. An \code{ExpressionSet} object generated from two Expression lists
#' \code{Mat} and \code{Res}.
#' @details This function is used to create an ExpressionSet object which pData and fData are
#' automatically processed. \code{Mat} and \code{Res} are list of matched ExpressionSet object, \code{id}
#' is used to identify which dataset in ExpressionSet list will be processed. Most of other arguments are
#' used for constructing pData.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @param Mat an ExpressionSet object list. The list is used to store a list of \code{ExpressionSet} objects
#' import from \code{GEOquery::getGEO} function which download and import Matrix Series Files.
#' @param Res an ExpressionSet object list. The list is used to store a list of \code{ExpressionSet} objects
#' preprocessed by \code{Oligo} or \code{affy} package and have only arrayData object. Due to any object in
#' the list has no detail \code{pData} or \code{fData}, so we use this function to
#' put \code{pData} and \code{fData} we need in \code{Mat} list to \code{Res} list.
#' @param id a character. We use this parameter to label one object name in
#' both \code{Mat} and \code{Res} list.
#' @param keepSource a character. It marks a column name of pData of ExpressionSet object.
#' Not always we want to keep all samples, so we can use this argument to
#' filter samples we want to keep. For example, in cancer study, we want to keep some
#' types of cancer or some histological type of one cancer etc..
#' @param features a character vector. This paramter is used to keep \code{fData} variables.
#' @param Study_ID a character. Set a ID for a study, like GSEXXXXX for GEO study.
#' @param Sample_ID a character. Set a ID for a sample, like GSMXXXXX for GEO samples. If not specified, it
#'will be equal to \code{id} parameter.
#' @param Platform a character. Set platform for study. if it is \code{NULL}, \code{annotation} function
#' will be used to get the platform. It is ok to use annotation platform like org.Hs.eg.db
#' @param Type a character. Column name of \code{pData}. It is used to tell the tissue type, such as Normal, LUAD ..
#' @param Gender a character. Column name of \code{pData}. It is used to identify where the Gender column are.
#' @param Age like above.
#' @param Tumor_stage like above.
#' @param Family_history like above.
#' @param Smoking_status like above.
#' @param Tumor_feature like above. You can get what you think related to feature but not set in other argument.
#' @param Mutation_status like above.
#' @param Survival_status like above.
#' @param Survival_time like above.
#' @param Preprocessed_Method a character used to set what method used to preprocess raw data.
#' @param Matched_CNV_ID used to label sample ID which has paired CNV data
#' @param Tumor_Normal_Matched_ID used to label sample ID which is Tumor and Normal tissue matched.
#' @return an \code{ExpressionSet} object
#' @seealso \code{\link{ExpressionSet}}, \code{\link{construct_PhenoData}}
#' @import Biobase
#' @export
create_Exprset <- function(Mat=NULL, Res=NULL, id=NULL, keepSource=NULL, rev_nams=NULL,
features=c("ID", "Gene.title", "Gene.symbol", "Gene.ID"),Study_ID=NULL,
Platform=NULL, Gender=NA, Age=NA, Tumor_stage=NA, Family_history=NA,
Smoking_status=NA, Tumor_feature=NA, Mutation_status=NA, Survival_status=NA,
Survival_time=NA, Preprocessed_Method="RMA", Matched_CNV_ID=NA,
Tumor_Normal_Matched_ID=NA,...){
id <- as.character(id)
# Mat and Res must be a list of ExpressionSet
if(!all(c(class(Mat[[id]])[1]=="ExpressionSet", class(Res[[id]])[1]=="ExpressionSet"))){
stop("Error: The Arguments 'Mat' and 'Res' must be a list of ExpressionSet!")
}
# judge the two dataset have same sort for samples
if(!all(sampleNames(Mat[[id]]) == sampleNames(Res[[id]]))){
stop("Error: Mat[[id]] and Res[[id]] must have same assayData slot!")
}
# View(pData(Mat[[id]]))
keepIDs <- c()
# if the rev_names is NULL, we keep all the samples
# keepSource <- "source_name_ch1"
if (is.null(rev_nams)){
if (exists_Separator(pData(Mat[[id]])[, keepSource])){
rev_nams <- apply_decode(names(table(pData(Mat[[id]])[, keepSource])))
}else{
rev_nams <- names(table(pData(Mat[[id]])[, keepSource]))
}
}
# if a separator exists, we decode it
if (exists_Separator(pData(Mat[[id]])[, keepSource])){
keepIDs <- as.character(pData(Mat[[id]])[which(apply_decode((pData(Mat[[id]])[, keepSource]))%in%rev_nams), "geo_accession"])
}else{
keepIDs <- as.character(pData(Mat[[id]])[which(as.character(pData(Mat[[id]])[, keepSource])%in%rev_nams), "geo_accession"])
}
# keep samples we want, like LUAD and Normal samples
res <- Mat[[id]][ , as.character(Mat[[id]]$geo_accession)%in%keepIDs]
res_new <- NULL
res_new <- Res[[id]][, sampleNames(Res[[id]])%in%keepIDs]
# make sure they have sample sort
res_new <- res_new[, sampleNames(res)]
# construct a new pheno data
if(is.null(Study_ID)){
Study_ID <- id
}
Sample_ID <- as.character(res$geo_accession)
if(is.null(Platform)){
Platform <- annotation(res)
}
trans2 <- function(name, ...){
if(exists_Separator(pData(res)[, name])){
result <- apply_decode(pData(res)[, name])
}else{
result <- as.character(pData(res)[, name])
}
# result <-ifelse(exists_Separator(pData(res)[, name]), apply_decode(pData(res)[, name]),
# as.character(pData(res)[, name]))
return(result)
}
n <- length(Study_ID)
Type <- trans2(keepSource)
if(!is.na(Gender)){
Gender <- trans2(Gender)
}
if(!is.na(Age)){
Age <- trans2(Age)
}
if(!is.na(Tumor_stage)){
Tumor_stage <- trans2(Tumor_stage)
}
if(!is.na(Family_history)){
Family_history <- trans2(Family_history)
}
if(!is.na(Smoking_status)){
Smoking_status <- trans2(Smoking_status)
}
if(!is.na(Tumor_feature)){
Tumor_feature <- trans2(Tumor_feature)
}
if(!is.na(Mutation_status)){
Mutation_status <- trans2(Mutation_status)
}
if(!is.na(Survival_status)){
Survival_status <- trans2(Survival_status)
}
if(!is.na(Survival_time)){
Survival_time <- trans2(Survival_time)
}
if(!is.na(Matched_CNV_ID)){
Matched_CNV_ID <- trans2(Matched_CNV_ID)
}
if(!is.na(Tumor_Normal_Matched_ID)){
Tumor_Normal_Matched_ID <- trans2(Tumor_Normal_Matched_ID)
}
p_data <- construct_PhenoData(Study_ID=Study_ID, Sample_ID=Sample_ID, Platform=Platform, Type=Type,
Gender=Gender, Age=Age, Tumor_stage=Tumor_stage, Family_history=Family_history,
Smoking_status=Smoking_status, Tumor_feature=Tumor_feature,
Mutation_status=Mutation_status, Survival_status=Survival_status,
Survival_time=Survival_time, Preprocessed_Method=Preprocessed_Method,
Matched_CNV_ID=Matched_CNV_ID, Tumor_Normal_Matched_ID=Tumor_Normal_Matched_ID)
# p_data <- construct_PhenoData(Study_ID = id, Sample_ID = as.character(res$geo_accession), Platform = annotation(res), Type = as.character(pData(res)[, "source_name_ch1"]), Gender = apply_decode(pData(res)[, "characteristics_ch1"]), Age = apply_decode(pData(res)[, "characteristics_ch1.1"]), Tumor_stage = apply_decode(pData(res)[, "characteristics_ch1.3"]), Smoking_status = apply_decode(pData(res)[, "characteristics_ch1.2"]))
pData(res_new) <- p_data
### assign feature data
f_data <- fData(res)
colnames(f_data) <- make.names(colnames(f_data))
# make data have some sort, and only keep id, gene symbol and gene id..
f_data <- f_data[featureNames(res_new), features]
# make sure the names of probe are same
all(rownames(f_data) == featureNames(res_new))
# assign
fData(res_new) <- f_data
## remove some variabes and do next one
rm(f_data, p_data, res, rev_nams, keepIDs, keepSource, Mat, Res); gc();
return(res_new)
}
#' @title collapse feature data of \code{ExpressionSet} object by \code{identifier}
#' @description This function is used to collapse multiple \code{identifier} to unique one by a function
#' @details In expression dataset, we often meet there are multiple probes mapping to one gene id or
#' symbol. While we want make gene expression unique, this function works for this purpose.
#' Due to multiple probes mapping to one gene,
#' the \code{identifier} should be "SYMBOL" (gene symbol) or "ENTREZ_ID (entrez id), then \code{collapse_fun}
#' function will be called and unique identifiers will substitute probe IDs, unique gene expression value
#' computed by \code{collapse_fun} will substitute multiple probe values mapping to one gene.
#' @param exprz a \code{ExpressionSet} object. The first column of \code{fData} of this object is equal to
#' \code{rownames(exprs(exprz))}, and alWways stores probe IDs.
#' @param name a character tells the names of the \code{ExpressionSet} object. You can treat it as a ID for
#' the \code{exprz} object.
#' @param identifier a character. "SYMBOL" is default to set gene symbols will be the rownames of expression
#' matrix. You can also use "ENTREZ_ID".
#' @param collapse_fun a function object used for collapsing multiple identifiers with same name to
#' unique one. NA and "" will be removed.
#' @return a \code{ExpressionSet} object.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @import Biobase
#' @export
ArrayBuildfData <-function(exprz=NULL, name=NULL, identifier=c("SYMBOL", "ENTREZ_ID"),collapse_fun=median) {
if(is.null(name)){
name <- names(exprz)
}
# subset expressionSet, remove unmatched labels in case
match_fData <- match(rownames(exprs(exprz)), fData(exprz)[,1])
match_fData <- na.omit(match_fData)
exprz <- exprz[match_fData]
exprs_z <- as.data.frame(exprs(exprz))
identifier <- match.arg(identifier)
if(identifier=="SYMBOL"){
# label_name <- "Gene.symbol"
label_name <- grep("symbol", colnames(fData(exprz)), value = TRUE, ignore.case = TRUE)
}else if (identifier=="ENTREZ_ID"){
label_name <- grep("entrez", colnames(fData(exprz)), value = TRUE, ignore.case = TRUE)
}
exprs_z <- cbind(identifier=fData(exprz)[,label_name],exprs_z,row.names=NULL)
cat("\nNow preprocessing raw data of ",as.character(name),": Collapsing expression values to their ",as.character(as.list(collapse_fun)[[4]])[2],"...\n")
# library(reshape2)
exprs_z <- recast(data=exprs_z,formula=identifier~variable,fun.aggregate=collapse_fun)
cat("\nNow preprocessing raw data of ",as.character(name),": Annotating expression values with ",as.character(label_name),"...\n")
fData(exprz) <- fData(exprz)[fData(exprz)[,label_name]%in%exprs_z$identifier, ]
fData(exprz) <- fData(exprz)[!duplicated(fData(exprz)[,label_name]), ]
fData(exprz) <- fData(exprz)[fData(exprz)[, label_name]!="", ]
rownames(fData(exprz)) <- fData(exprz)[,label_name]
sorted_name <- as.character(fData(exprz)[, label_name])
rownames(exprs_z) <- exprs_z$identifier
exprs_z <- as.matrix(exprs_z[, -1])
exprs_z <- exprs_z[sorted_name, ]
exprz <- exprz[1:length(sorted_name),]
dimnames(exprz)[[1]] <- fData(exprz)[,label_name]
dimnames(exprz)[[2]] <- colnames(exprs_z)
# featureNames(z) <- fData(z)[,1]
exprs(exprz) <- exprs_z
# exprs_z <- data.matrix(exprs_z, rownames.force = TRUE)
# creat a new object
# z2 <- ExpressionSet(assayData = exprs_z, phenoData = z@phenoData,
# featureData = z@featureData, experimentData = z@experimentData,
# annotation = z@annotation, protocolData = z@protocolData)
cat("\nSize of expression matrix of",as.character(name),":",dim(exprs(exprz))[1],"rows and",dim(exprs(exprz))[2],"columns\n")
return(exprz)
}
#' @title Merge multiple \code{ExpressionSet} objects
#' @description This function is used to merge multiple \code{ExpressionSet} objects by \code{Features}.
#' @details The mechanism of \code{\link{biobase}} package has no function to merge multiple
#' \code{ExpressioSet} object. Due to different feature names in different platforms, we use custom
#' \code{Features} to construct a uniform feature data.frame.
#'
#' @param Exprs_list a list of \code{ExpressionSet} objects. They must have same \code{identifier}, you can
#' use either gene symbol or gene id. This can be done by \code{\link{ArrayBuildfData}}.
#' @param Features a character vector at least include all \code{\link{featureNames}} of \code{Exprs_list}.
#' You can get HUGO gene symbol or id from \code{data(hugo)}.
#' @param annot set a annotation platform or package for merged \code{ExpressionSet} object.
#'
#' @author Shixiang Wang <w_shixiang@163.com>
#' @export
ArrayMerge <- function(Exprs_list=NULL, Features=NULL, annot="org.Hs.eg.db"){
Res <- eval(as.symbol(Exprs_list))
# Res <- eval(Res)
# construct assayData of ExpressionSet
lens <- sum(sapply(Res, function(x)(length(sampleNames(x)))))
exprs <- matrix(data = NA, nrow=length(Features), ncol = lens)
rownames(exprs) <- Features
colnames(exprs) <- 1:lens
assignExprs <- function(part_exprs=NULL, res=NULL){
matched_id <- match(rownames(part_exprs) ,rownames(exprs(res)))
matched_id <- na.omit(matched_id)
cat(length(matched_id), " feature identifiers matched!---\n")
matched_names <- rownames(exprs(res))[matched_id]
# assign values
part_exprs[matched_names,] <- exprs(res)[matched_names, ]
return(part_exprs)
}
# cumsum lengths of datasets
Cs <- as.integer(cumsum(sapply(Res, function(x)(length(sampleNames(x))))))
for(i in 1:length(Cs)){
if(i == 1){
# accelarate speed
part_exprs <- matrix(NA, nrow = length(Features), ncol = Cs[i])
part_exprs <- exprs[, 1:Cs[i]]
}else{
# accelarate speed
part_exprs <- matrix(NA, nrow = length(Features), ncol = Cs[i] - Cs[i-1])
part_exprs <- exprs[, (Cs[i-1]+1):Cs[i]]
}
colnames(part_exprs) <- sampleNames(Res[[i]])
rownames(part_exprs) <- Features
# assign part-exprs
part_exprs <- assignExprs(part_exprs=part_exprs, res=Res[[i]])
# assign part- back to exprs
if(i == 1){
exprs[,1:Cs[i]] <- part_exprs
colnames(exprs)[1:Cs[i]] <- sampleNames(Res[[i]])
}else{
exprs[, (Cs[i-1]+1):Cs[i]] <- part_exprs
colnames(exprs)[(Cs[i-1]+1):Cs[i]] <- sampleNames(Res[[i]])
}
}
# build phenoData
merged_pData <- Reduce(rbind, lapply(Res, function(x)pData(x)))
meta_data <- data.frame(labelDescription=rep(NA,length(colnames(merged_pData))), row.names = colnames(merged_pData))
pheno_data <- new("AnnotatedDataFrame", data=merged_pData, varMetadata=meta_data)
# build featureData
merged_fData <- as.data.frame(Features)
rownames(merged_fData) <- Features
meta_data2 <- data.frame(labelDescription=rep(NA,length(colnames(merged_fData))), row.names = colnames(merged_fData))
feature_data <- new("AnnotatedDataFrame", data=merged_fData, varMetadata=meta_data2)
# create a expressionSet
Eset <- ExpressionSet(assayData = exprs, phenoData = pheno_data, featureData = feature_data,annotation = annot)
return(Eset)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.