R/allFunctions_constructExprSet.R

# recode all (most...) functions

#' @title  construct Phenotype data frame
#' @description This function is used to construct a Phenotype data.frame for ExpressionSet object
#' @details Although \code{ExpressionSet} object has a regular pattern for phenotype data, but diverse in different
#' studies. So if we want to merge data from different studies, we have to uniform all variable names of phenotype data
#' . This function is for this purpose and mainly used by \code{\link{create_Exprset}}. User can use this function to
#'  create phenodata separately.
#' @param Study_ID a character.  Set a ID for a study, like GSEXXXXX for GEO study.
#' @param Sample_ID a character. Set a ID for a sample, like GSMXXXXX for GEO samples.
#' @param Platform a character. Set platform for study. if it is \code{NULL}, \code{annotation} function
#'  will be used to get the platform. It is ok to use annotation platform like org.Hs.eg.db
#' @param Type a character. Column name of \code{pData}. It is used to tell the tissue type, such as Normal, LUAD ..
#' @param Gender a character. Column name of \code{pData}. It is used to identify where the Gender column are.
#' @param Age like above.
#' @param Tumor_stage like above.
#' @param Family_history like above.
#' @param Smoking_status like above.
#' @param Tumor_feature like above. You can get what you think related to feature but not set in other argument.
#' @param Mutation_status like above.
#' @param Survival_status like above.
#' @param Survival_time like above.
#' @param Preprocessed_Method a character used to set what method used to preprocess raw data.
#' @param Matched_CNV_ID used to label sample ID which has paired CNV data
#' @param Tumor_Normal_Matched_ID used to label sample ID which is Tumor and Normal tissue matched.
#' @return  a data.frame
#' @author Shixiang Wang <w_shixiang@163.com>
#' @seealso \code{\link{ExpressionSet}}
#' @export
construct_PhenoData <- function(Study_ID=NULL, Sample_ID=NULL, Platform=NULL, Type=NULL,
                                Gender=NA, Age=NA, Tumor_stage=NA, Family_history=NA,
                                Smoking_status=NA, Tumor_feature=NA, Mutation_status=NA, Survival_status=NA,
                                Survival_time=NA, Preprocessed_Method="RMA", Matched_CNV_ID=NA,
                                Tumor_Normal_Matched_ID=NA){
    dat <- data.frame(Study_ID=Study_ID, Sample_ID=Sample_ID, Platform=Platform, Type=Type,
                      Gender=Gender, Age=Age, Tumor_stage=Tumor_stage, Family_history=Family_history,
                      Smoking_status=Smoking_status, Tumor_feature=Tumor_feature,
                      Mutation_status=Mutation_status, Survival_status=Survival_status,
                      Survival_time=Survival_time, Preprocessed_Method=Preprocessed_Method,
                      Matched_CNV_ID=Matched_CNV_ID, Tumor_Normal_Matched_ID=Tumor_Normal_Matched_ID,
                      stringsAsFactors = FALSE)
    return(dat)
}

#' @title decode labels by separator
#' @description This function is created to separate content of \code{pData} of \code{\link{ExpressionSet}} object.
#' @param x a character  to be separated.
#' @param sep a character used to separate \code{x}. Use ":" as default.
#' @return  a character. If \code{x} has \code{sep} (such as ":"), it will be chopped off and the second part will be returned.
#' @author  Shixiang Wang  <w_shixiang@163.com>
#' @export
#' @examples x <- "cell type: A f "
#' decodeLables(x)
#' x <- "cell type: LCC"
#' decodeLables(x)
decodeLables <- function(x, sep=":"){
    #x <- "cell type: A f "
    #x <- "cell type: LCC"
    # gsub(pattern = "^[ ]", replacement = "" ,unlist(strsplit(x, split=":"))[2])
    x <- gsub(pattern = "^ ?([^ ].*[^ ]) ?$", replacement = "\\1" ,unlist(strsplit(x, split=sep))[2])
    return(x)
}

#' @title apply decodeLables function
#' @description  apply \code{decodeLables} function to a character vector.
#' @param x a character vector to be \code{\link{decodeLables}} on.
#' @return  a character vector. Characters before separator will be removed and the second part returned.
#' @author  Shixiang Wang <w_shixiang@163.com>
#' @export
apply_decode <- function(x){
    x <- as.character(x)
    s <- c()
    n <- length(x)
    for (i in 1:n){
        y <- decodeLables(x[i])
        s <- c(s, y)
    }
    return(s)
}

#' @title identify whether a separator exists
#' @description  identify whether a separator exists in character string. It will be \code{TRUE} that if
#' more than half elements have separator.
#' @param x a character vector.
#' @param separator a character identify separator, ":" is default.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @return a logical value.
#' @export
exists_Separator <- function(x, separator=":"){
    # not all characters have separator, so we make a condition
    # If there are more than half have separatory, make it true
    n = length(x)
    # x = any(grepl(pattern = separator, x = as.character(x)))
    # return(x)
    if(length(which(grepl(pattern = separator, x = as.character(x)))) > n/2 ){
        return(TRUE)
    }else{
        return(FALSE)
    }
}

#' @title create an ExpressionSet object from matched ExpressionSets
#' @description  Auto-processing function for creating phenotype data and feature data
#' for \code{ExpressionSet} object. An \code{ExpressionSet} object generated from two Expression lists
#' \code{Mat} and \code{Res}.
#' @details This function is used to create an ExpressionSet object which pData and fData are
#' automatically processed. \code{Mat} and \code{Res} are list of matched ExpressionSet object, \code{id}
#' is used to identify which dataset in ExpressionSet list will be processed. Most of other arguments are
#' used for constructing pData.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @param Mat an ExpressionSet object list. The list is used to store a list of \code{ExpressionSet} objects
#' import from \code{GEOquery::getGEO} function which download and import Matrix Series Files.
#' @param Res an ExpressionSet object list. The list is used to store a list of \code{ExpressionSet} objects
#' preprocessed by \code{Oligo} or \code{affy} package and have only arrayData object. Due to any object in
#' the list has no detail \code{pData} or \code{fData}, so we use this function to
#' put \code{pData} and \code{fData} we need in \code{Mat} list to \code{Res} list.
#' @param id a character. We use this parameter to label one object name in
#' both \code{Mat} and \code{Res} list.
#' @param keepSource a character. It marks a column name of pData of ExpressionSet object.
#'  Not always we want to keep all samples, so we can use this argument to
#' filter samples we want to keep. For example, in cancer study, we want to keep some
#' types of cancer or some histological type of one cancer etc..
#' @param features a character vector. This paramter is used to keep \code{fData} variables.
#' @param Study_ID a character.  Set a ID for a study, like GSEXXXXX for GEO study.
#' @param Sample_ID a character. Set a ID for a sample, like GSMXXXXX for GEO samples. If not specified, it
#'will be equal to \code{id} parameter.
#' @param Platform a character. Set platform for study. if it is \code{NULL}, \code{annotation} function
#'  will be used to get the platform. It is ok to use annotation platform like org.Hs.eg.db
#' @param Type a character. Column name of \code{pData}. It is used to tell the tissue type, such as Normal, LUAD ..
#' @param Gender a character. Column name of \code{pData}. It is used to identify where the Gender column are.
#' @param Age like above.
#' @param Tumor_stage like above.
#' @param Family_history like above.
#' @param Smoking_status like above.
#' @param Tumor_feature like above. You can get what you think related to feature but not set in other argument.
#' @param Mutation_status like above.
#' @param Survival_status like above.
#' @param Survival_time like above.
#' @param Preprocessed_Method a character used to set what method used to preprocess raw data.
#' @param Matched_CNV_ID used to label sample ID which has paired CNV data
#' @param Tumor_Normal_Matched_ID used to label sample ID which is Tumor and Normal tissue matched.
#' @return an \code{ExpressionSet} object
#' @seealso \code{\link{ExpressionSet}}, \code{\link{construct_PhenoData}}
#' @import Biobase
#' @export
create_Exprset <- function(Mat=NULL, Res=NULL, id=NULL, keepSource=NULL, rev_nams=NULL,
                           features=c("ID", "Gene.title", "Gene.symbol", "Gene.ID"),Study_ID=NULL,
                           Platform=NULL,  Gender=NA, Age=NA, Tumor_stage=NA, Family_history=NA,
                           Smoking_status=NA, Tumor_feature=NA, Mutation_status=NA, Survival_status=NA,
                           Survival_time=NA, Preprocessed_Method="RMA", Matched_CNV_ID=NA,
                           Tumor_Normal_Matched_ID=NA,...){
    id <- as.character(id)
    # Mat and Res must be a list of ExpressionSet
    if(!all(c(class(Mat[[id]])[1]=="ExpressionSet", class(Res[[id]])[1]=="ExpressionSet"))){
        stop("Error: The Arguments 'Mat' and 'Res' must be a list of ExpressionSet!")
    }
    # judge the two dataset have same sort for samples
    if(!all(sampleNames(Mat[[id]]) == sampleNames(Res[[id]]))){
        stop("Error: Mat[[id]] and Res[[id]] must have same assayData slot!")
    }
    # View(pData(Mat[[id]]))
    keepIDs <- c()
    # if the rev_names is NULL, we keep all the samples
    # keepSource <- "source_name_ch1"
    if (is.null(rev_nams)){
        if (exists_Separator(pData(Mat[[id]])[, keepSource])){
            rev_nams <- apply_decode(names(table(pData(Mat[[id]])[, keepSource])))
        }else{
            rev_nams <- names(table(pData(Mat[[id]])[, keepSource]))
        }
    }

    # if a separator exists, we decode it
    if (exists_Separator(pData(Mat[[id]])[, keepSource])){
        keepIDs <- as.character(pData(Mat[[id]])[which(apply_decode((pData(Mat[[id]])[, keepSource]))%in%rev_nams), "geo_accession"])
    }else{
        keepIDs <- as.character(pData(Mat[[id]])[which(as.character(pData(Mat[[id]])[, keepSource])%in%rev_nams), "geo_accession"])
    }

    # keep samples we want, like LUAD and Normal samples
    res <- Mat[[id]][ , as.character(Mat[[id]]$geo_accession)%in%keepIDs]
    res_new <- NULL
    res_new <- Res[[id]][, sampleNames(Res[[id]])%in%keepIDs]
    # make sure they have sample sort
    res_new <- res_new[, sampleNames(res)]

    # construct a new pheno data
    if(is.null(Study_ID)){
        Study_ID    <-    id
    }
    Sample_ID   <-    as.character(res$geo_accession)
    if(is.null(Platform)){
        Platform    <-    annotation(res)
    }


    trans2 <- function(name, ...){
        if(exists_Separator(pData(res)[, name])){
            result <- apply_decode(pData(res)[, name])
        }else{
            result <- as.character(pData(res)[, name])
        }
        # result <-ifelse(exists_Separator(pData(res)[, name]), apply_decode(pData(res)[, name]),
        #        as.character(pData(res)[, name]))
        return(result)
    }

    n <- length(Study_ID)

    Type    <-  trans2(keepSource)

    if(!is.na(Gender)){
        Gender  <-  trans2(Gender)
    }

    if(!is.na(Age)){
        Age     <-  trans2(Age)
    }

    if(!is.na(Tumor_stage)){
        Tumor_stage     <- trans2(Tumor_stage)
    }

    if(!is.na(Family_history)){
        Family_history  <- trans2(Family_history)
    }

    if(!is.na(Smoking_status)){
        Smoking_status  <- trans2(Smoking_status)
    }

    if(!is.na(Tumor_feature)){
        Tumor_feature  <- trans2(Tumor_feature)
    }

    if(!is.na(Mutation_status)){
        Mutation_status    <- trans2(Mutation_status)
    }

    if(!is.na(Survival_status)){
        Survival_status    <- trans2(Survival_status)
    }

    if(!is.na(Survival_time)){
        Survival_time      <- trans2(Survival_time)
    }

    if(!is.na(Matched_CNV_ID)){
        Matched_CNV_ID  <-  trans2(Matched_CNV_ID)
    }

    if(!is.na(Tumor_Normal_Matched_ID)){
        Tumor_Normal_Matched_ID  <- trans2(Tumor_Normal_Matched_ID)
    }

    p_data <- construct_PhenoData(Study_ID=Study_ID, Sample_ID=Sample_ID, Platform=Platform, Type=Type,
                                  Gender=Gender, Age=Age, Tumor_stage=Tumor_stage, Family_history=Family_history,
                                  Smoking_status=Smoking_status, Tumor_feature=Tumor_feature,
                                  Mutation_status=Mutation_status, Survival_status=Survival_status,
                                  Survival_time=Survival_time, Preprocessed_Method=Preprocessed_Method,
                                  Matched_CNV_ID=Matched_CNV_ID, Tumor_Normal_Matched_ID=Tumor_Normal_Matched_ID)
    # p_data <- construct_PhenoData(Study_ID = id, Sample_ID = as.character(res$geo_accession), Platform = annotation(res), Type = as.character(pData(res)[, "source_name_ch1"]), Gender = apply_decode(pData(res)[, "characteristics_ch1"]), Age = apply_decode(pData(res)[, "characteristics_ch1.1"]), Tumor_stage = apply_decode(pData(res)[, "characteristics_ch1.3"]), Smoking_status = apply_decode(pData(res)[, "characteristics_ch1.2"]))
    pData(res_new) <- p_data

    ### assign feature data
    f_data <- fData(res)
    colnames(f_data) <- make.names(colnames(f_data))
    # make data have some sort, and only keep id, gene symbol and gene id..
    f_data <- f_data[featureNames(res_new), features]
    # make sure the names of probe are same
    all(rownames(f_data) == featureNames(res_new))
    # assign
    fData(res_new) <- f_data

    ## remove some variabes and do next one
    rm(f_data, p_data, res, rev_nams, keepIDs, keepSource, Mat, Res); gc();

    return(res_new)
}

#' @title collapse feature data of \code{ExpressionSet} object by \code{identifier}
#' @description This function is used to collapse multiple \code{identifier} to unique one by a function
#' @details In expression dataset, we often meet there are multiple probes mapping to one gene id or
#' symbol. While we want make gene expression unique, this function works for this purpose.
#' Due to multiple probes mapping to one gene,
#' the \code{identifier} should be "SYMBOL" (gene symbol) or "ENTREZ_ID (entrez id), then \code{collapse_fun}
#' function will be called and unique identifiers will substitute probe IDs, unique gene expression value
#' computed by \code{collapse_fun} will substitute multiple probe values mapping to one gene.
#' @param exprz a \code{ExpressionSet} object. The first column of \code{fData} of this object is equal to
#' \code{rownames(exprs(exprz))}, and alWways stores probe IDs.
#' @param  name a character tells the names of the \code{ExpressionSet} object. You can treat it as a ID for
#' the \code{exprz} object.
#' @param identifier a character. "SYMBOL" is default to set gene symbols will be the rownames of expression
#' matrix. You can also use "ENTREZ_ID".
#' @param collapse_fun a function object used for collapsing multiple identifiers with same name to
#' unique one. NA and "" will be removed.
#' @return a \code{ExpressionSet} object.
#' @author Shixiang Wang <w_shixiang@163.com>
#' @import Biobase
#' @export
ArrayBuildfData <-function(exprz=NULL, name=NULL, identifier=c("SYMBOL", "ENTREZ_ID"),collapse_fun=median) {

    if(is.null(name)){
        name <- names(exprz)
    }
    # subset expressionSet, remove unmatched labels in case
    match_fData <- match(rownames(exprs(exprz)), fData(exprz)[,1])
    match_fData <- na.omit(match_fData)
    exprz <- exprz[match_fData]

    exprs_z <- as.data.frame(exprs(exprz))

    identifier <- match.arg(identifier)
    if(identifier=="SYMBOL"){
        # label_name <- "Gene.symbol"
        label_name <- grep("symbol", colnames(fData(exprz)), value = TRUE, ignore.case = TRUE)
    }else if (identifier=="ENTREZ_ID"){
        label_name <- grep("entrez", colnames(fData(exprz)), value = TRUE, ignore.case = TRUE)
    }

    exprs_z <- cbind(identifier=fData(exprz)[,label_name],exprs_z,row.names=NULL)

    cat("\nNow preprocessing raw data of ",as.character(name),": Collapsing expression values to their ",as.character(as.list(collapse_fun)[[4]])[2],"...\n")
    # library(reshape2)
    exprs_z <- recast(data=exprs_z,formula=identifier~variable,fun.aggregate=collapse_fun)
    cat("\nNow preprocessing raw data of ",as.character(name),": Annotating expression values with ",as.character(label_name),"...\n")


    fData(exprz) <- fData(exprz)[fData(exprz)[,label_name]%in%exprs_z$identifier, ]
    fData(exprz) <- fData(exprz)[!duplicated(fData(exprz)[,label_name]), ]
    fData(exprz) <- fData(exprz)[fData(exprz)[, label_name]!="", ]
    rownames(fData(exprz)) <- fData(exprz)[,label_name]

    sorted_name <- as.character(fData(exprz)[, label_name])

    rownames(exprs_z) <- exprs_z$identifier
    exprs_z <- as.matrix(exprs_z[, -1])
    exprs_z <- exprs_z[sorted_name, ]

    exprz <- exprz[1:length(sorted_name),]
    dimnames(exprz)[[1]] <- fData(exprz)[,label_name]
    dimnames(exprz)[[2]] <- colnames(exprs_z)
    # featureNames(z) <- fData(z)[,1]
    exprs(exprz) <- exprs_z
    # exprs_z <- data.matrix(exprs_z, rownames.force = TRUE)
    # creat a new object
    # z2 <- ExpressionSet(assayData = exprs_z, phenoData = z@phenoData,
    #                     featureData = z@featureData, experimentData = z@experimentData,
    #                     annotation = z@annotation, protocolData = z@protocolData)
    cat("\nSize of expression matrix of",as.character(name),":",dim(exprs(exprz))[1],"rows and",dim(exprs(exprz))[2],"columns\n")
    return(exprz)
}

#' @title Merge multiple \code{ExpressionSet} objects
#' @description This function is used to merge multiple \code{ExpressionSet} objects by \code{Features}.
#' @details The mechanism of \code{\link{biobase}} package has no function to merge multiple
#' \code{ExpressioSet} object. Due to different feature names in different platforms, we use custom
#' \code{Features} to construct a uniform feature data.frame.
#'
#' @param Exprs_list a list of \code{ExpressionSet} objects. They must have same \code{identifier}, you can
#' use either gene symbol or gene id. This can be done by \code{\link{ArrayBuildfData}}.
#' @param Features a character vector at least include all \code{\link{featureNames}} of \code{Exprs_list}.
#' You can get HUGO gene symbol or id from \code{data(hugo)}.
#' @param annot set a annotation platform or package for merged \code{ExpressionSet} object.
#'
#' @author Shixiang Wang <w_shixiang@163.com>
#' @export
ArrayMerge <- function(Exprs_list=NULL, Features=NULL, annot="org.Hs.eg.db"){
    Res <- eval(as.symbol(Exprs_list))
    # Res <- eval(Res)

    # construct assayData of ExpressionSet
    lens <- sum(sapply(Res, function(x)(length(sampleNames(x)))))
    exprs <- matrix(data = NA, nrow=length(Features), ncol = lens)
    rownames(exprs) <- Features
    colnames(exprs) <- 1:lens

    assignExprs <- function(part_exprs=NULL, res=NULL){
        matched_id <- match(rownames(part_exprs) ,rownames(exprs(res)))
        matched_id <- na.omit(matched_id)
        cat(length(matched_id), " feature identifiers matched!---\n")
        matched_names <- rownames(exprs(res))[matched_id]

        # assign values
        part_exprs[matched_names,] <- exprs(res)[matched_names, ]

        return(part_exprs)
    }

    # cumsum lengths of datasets
    Cs <- as.integer(cumsum(sapply(Res, function(x)(length(sampleNames(x))))))
    for(i in 1:length(Cs)){
        if(i == 1){
            # accelarate speed
            part_exprs <- matrix(NA, nrow = length(Features), ncol = Cs[i])
            part_exprs <- exprs[, 1:Cs[i]]
        }else{
            # accelarate speed
            part_exprs <- matrix(NA, nrow = length(Features), ncol = Cs[i] - Cs[i-1])
            part_exprs <- exprs[, (Cs[i-1]+1):Cs[i]]
        }

        colnames(part_exprs) <- sampleNames(Res[[i]])
        rownames(part_exprs) <- Features

        # assign part-exprs
        part_exprs <- assignExprs(part_exprs=part_exprs, res=Res[[i]])

        # assign part- back to exprs
        if(i == 1){
            exprs[,1:Cs[i]] <- part_exprs
            colnames(exprs)[1:Cs[i]] <- sampleNames(Res[[i]])
        }else{
            exprs[, (Cs[i-1]+1):Cs[i]] <- part_exprs
            colnames(exprs)[(Cs[i-1]+1):Cs[i]] <- sampleNames(Res[[i]])
        }
    }



    # build phenoData
    merged_pData <- Reduce(rbind, lapply(Res, function(x)pData(x)))

    meta_data <- data.frame(labelDescription=rep(NA,length(colnames(merged_pData))), row.names = colnames(merged_pData))
    pheno_data <- new("AnnotatedDataFrame", data=merged_pData, varMetadata=meta_data)

    # build featureData
    merged_fData <- as.data.frame(Features)
    rownames(merged_fData) <- Features
    meta_data2 <- data.frame(labelDescription=rep(NA,length(colnames(merged_fData))), row.names = colnames(merged_fData))

    feature_data <- new("AnnotatedDataFrame", data=merged_fData, varMetadata=meta_data2)
    # create a expressionSet
    Eset <- ExpressionSet(assayData = exprs, phenoData = pheno_data, featureData = feature_data,annotation = annot)

    return(Eset)
}
ShixiangWang/iProfile documentation built on May 11, 2019, 6:25 p.m.