R/diff-ml-utilities.R

Defines functions seedind split_data cal_ci remove_constant get_sampledflist rfimportance LDAeffectsize

Documented in get_sampledflist split_data

#' @importFrom dplyr select
#' @importFrom MASS lda
#' @keywords internal
LDAeffectsize <- function(datalist, compareclass, classgroup, bootnums=30, LDA=2, ci=0.95){
    res <- list()
    tmpformula <- as.formula(paste0(classgroup, " ~. "))
    for (i in seq_len(bootnums)){
    compareres <- list()
    df <- datalist[[i]]
    for (p in seq_len(nrow(compareclass))){
        tmppairs <- compareclass[p,]
        z <- suppressWarnings(lda(tmpformula,data=df, tol=1e-6))
        w <- z$scaling[,1]
        w.unit <- w/sqrt(sum(w^2))
        ss <- df %>% select(-c(classgroup)) %>% as.matrix()
        LD <- ss %*% w.unit
        tmpp1 <- df[[match(classgroup, colnames(df))]]==tmppairs[1]
        tmpp2 <- df[[match(classgroup, colnames(df))]]==tmppairs[2]
        effect.size <- abs(mean(LD[tmpp1,])-mean(LD[tmpp2,]))
        wfinal <- w.unit * effect.size
        coeff <- abs(wfinal)
        mm <- z$means
        gm <- abs(mm[match(tmppairs[1], rownames(mm)),,drop=FALSE]- mm[match(tmppairs[2], rownames(mm)),,drop=FALSE])
        tmpres <- (gm+coeff) *0.5
        compareres[[p]] <- tmpres
    }
    compareres <- do.call("rbind", compareres)
    res[[i]] <- compareres
    }
    res <- cal_ci(x=res, classgroup=classgroup, ci=ci, method="lda")
    colnames(res) <- c("f", paste0("LDA", colnames(res)[-1]))
    res <- res[res$LDAmean>=LDA,]
    #res <- Reduce("+", res)
    #res <- apply(res/bootnums, 2, function(x)max(x))
    #res <- log(1+res,10)
    #res <- data.frame(f=names(res), LDA=res)
    #res$f <- gsub("`", "", as.vector(res$f))
    #res <- res[res$LDA>=LDA,]
    if (!nrow(res)>0){stop("No features with significant differences after LDA analysis.")}
    #rownames(res) <- NULL
    return(res)
}

#' @keywords internal
rfimportance <- function(datalist, classgroup, bootnums, effsize=2, ci=0.95){
    rfres <- list()
    #tmpformula <- as.formula(paste0(class, " ~. "))
    for (i in seq_len(bootnums)){
        df <- datalist[[i]]
        classindex <- match(classgroup, colnames(df))
        X <- df[,-classindex]
        X <- apply(X, 2, function(x)(x-min(x))/(max(x)-min(x)))
        Y <- df[, classindex, drop = TRUE]
        dfres <- randomForest::randomForest(x=X, y=Y, importance=TRUE, proximity=TRUE)
        imres <- randomForest::importance(dfres, type=1)
        rfres[[i]] <- imres
    }
    rfres <- cal_ci(x=rfres, classgroup=classgroup, ci=ci, method="rf")
    colnames(rfres) <- c("f", paste0("MDA", colnames(rfres)[-1]))
    #rfres <- Reduce("+", rfres)
    #rfres <- data.frame(rfres/bootnums)
    #rfres <- data.frame(f=rownames(rfres), MeanDecreaseAccuracy=rfres$MeanDecreaseAccuracy)
    #rfres <- rfres[rfres$MeanDecreaseAccuracy>=effsize,]
    return(rfres)
}


#' @title Generate random data list from a original data.
#' @param dalist list, a list contained multi data.frame.
#' @param bootnums integer, the number of bootstrap iteration, default is 30.
#' @param ratio numeric, the ratios of each data.frame to keep.
#' @param makerownames logical, whether build row.names,default is FALSE.
#' @return the list contained the data.frame generated by bootstrap iteration.
#' @author Shuangbin Xu
#' @export
#' @examples
#' \dontrun{
#'     data(iris)
#'     irislist <- split(iris, iris$Species)
#'     set.seed(1024)
#'     irislist <- get_sampledflist(irislist)
#' }
get_sampledflist <- function(dalist, 
    bootnums=30, 
    ratio=0.7, 
    makerownames=FALSE){
    datalist <- list()
    for (i in seq_len(bootnums)){
        res <- lapply(dalist,function(x){x[sample(nrow(x), trunc(nrow(x)*ratio)),,drop=FALSE]})
        res <- do.call("rbind", c(res, make.row.names=makerownames))
        datalist[[i]] <- res
    }
    return(datalist)
}

#' @importFrom stats var
#' @keywords internal
remove_constant <- function(dflist){
    noconstant <- list()
    factornames <- colnames(dflist[[1]][!vapply(dflist[[1]],
					is.numeric,logical(1))])
    for (i in seq_len(length(dflist))){
        tmpdata <- dflist[[i]] %>%
                   dplyr::group_split(!!rlang::sym(factornames)) %>%
                   as.list() %>%
                   lapply(., function(x)x[,vapply(x, is.numeric, logical(1))]) %>%
                   lapply(., function(x)apply(x,2,var,na.rm=T)==0) 
        tmpdata2 <- tmpdata %>%
                    dplyr::bind_rows() %>% 
                    colSums()
        noconstant[[i]] <- names(tmpdata2[tmpdata2 != length(tmpdata)])
    }
    noconstant <- Reduce(intersect,noconstant)
    for (i in seq_len(length(dflist))){
        dflist[[i]] <- dflist[[i]][,match(c(noconstant,factornames),
					colnames(dflist[[i]])),drop=FALSE]
    }
    return(dflist)
}



#' @importFrom tibble rownames_to_column
cal_ci <- function(x, classgroup, ci=0.95, method){
    if (method=="rf"){
        x <- do.call("cbind", x)
        x <- data.frame(t(x), check.names=FALSE)
    }else{
        x <- do.call("rbind", x)
    }
    x <- apply(x, 2, CI, ci=ci)
    x <- data.frame(t(x), check.names=FALSE)
    x <- log(1+x, 10)
    rownames(x) <- gsub("`", "", rownames(x))
    x <- rownames_to_column(x, var="f")
    return(x)
}

#' @title Split Large Vector or DataFrame
#' 
#' @description
#' Split large vector or dataframe to list class, which contian subset vectors
#' or dataframe of origin vector or dataframe.
#'
#' @param x vector class or data.frame class.
#' @param nums integer.
#' @param chunks integer. use chunks if nums is missing.
#' Note nums and chunks shouldn't concurrently be NULL, 
#' default is NULL. 
#' @param random bool, whether split randomly, default is FALSE,
#' if you want to split data randomly, you can set TRUE, and
#' if you want the results are reproducible, you should add 
#' seed before.
#' @return the subset of x, vector or data.frame class.
#' @export
#' @author Shuangbin Xu
#' @examples
#' data(iris)
#' irislist <- split_data(iris, 40)
#' dalist <- c(1:100)
#' dalist <- split_data(dalist, 30)
split_data <- function(x, 
    nums, 
    chunks=NULL, 
    random=FALSE){
    if (missing(nums)){
    	nums <- trunc(x/chunks)
    }
    if (missing(nums) && is.null(chunks)){
    	stop("The nums and chunks shouldn't concurrently be missing!")
    }
    if (!is.null(nums) && !is.null(chunks)){
    	message("We would use nums to split. and the chunk numbers is the length(x) provided by nums.")
    }
    if (is.vector(x) && length(x)>0){
    	ind <- seedind(length(x), nums, random=random)	    
    }
    if (is.vector(x) && length(x)==0){
    	stop("The legth of vector should be larger than zero, if the class of x is vector.")
    }
    if (is.data.frame(x)){
    	ind <- seedind(nrow(x), nums, random=random)
    }
    return(split(x, ind))
}

#' @keywords internal
seedind <- function(l, n, random=FALSE, seed=TRUE){
    tmpind <- rep(seq_len(trunc(l/n)+1),n)
    if (random){
    #if ((l%%n)==0){
    #	tmpind <- tmpind[!tmpind %in% max(tmpind)]
    #}
    tmpind <- sample(sort(tmpind)[seq_len(l)])
    }else{
    	tmpind <- sort(tmpind)[seq_len(l)]
    }
    return(tmpind)
}
xiangpin/MicrobitaProcess documentation built on Nov. 6, 2024, 1:15 a.m.