R/run_ica.R

#' Custom ICA function for analyzing gene expression data.
#'
#' Performing ICA on a dataset and create a list object with results.
#'
#' @param pheno_mx Phenotype matrix with diemnsions g x N
#' @param k_est Number of components to be estimated or method to estimate it.
#' @param scale_pheno Logical value specifying the scaling of row of the pheno_mx.
#' @param h_clust_cutoff is the cutoff value used in hierarchical clustering. Default is set to 0.3.
#' @param n_runs Number of runs for estimating k. Default value is set to 5.
#' @param max_iter Maximum iterations for estimating k for each run. Default value is set to 10.
#' @param n_cores Number of cores to be used for estimating k. Default is set to 1.
#' @param cor_threshold Threshold for significant correlation calling. Default is set to 0.05.
#' @param similarity_measure How to measure the similarity between ICs.
#' If set to "peaks" only gene weights that are greater than 1 sd are used to calculate similarity.
#' @return List with the following entries.
#' @keywords keywords
#'
#' @export

run_ica <- function(pheno_mx = NULL,
                    k_est = NULL,
                    var_cutoff = 99,
                    n_runs = 1,
                    n_cores = NULL,
                    scale_pheno = FALSE,
                    h_clust_cutoff = 0.3,
                    max_iter = 10,
                    cor_threshold = 0.05,
                    similarity_measure = "peaks"){

    if(is.null(pheno_mx)){
        stop("Error: Phenotype matrix is missing \n")
    } else {
        pheno_nrow <- nrow(pheno_mx)
        pheno_ncol <- ncol(pheno_mx)
        message("Original dimensions of <pheno_mx> = ", pheno_nrow , " x ", pheno_ncol, "\n")

        if (pheno_nrow < pheno_ncol){
            message("[Caution] Number of samples exceeding number of measured features,
                    please check rows and columns of <pheno_mx> \n")
            message("* If you are from the future and have more samples than measured features,
                    disregard the above message and please proceed. \n")
        }
    }

    if(is.null(colnames(pheno_mx))){
        message("<pheno_mx> is missing column names, setting to default (sample#). \n")
        colnames(pheno_mx) <- paste("sample",c(1:pheno_ncol), sep = "_")
    }

    if(is.null(rownames(pheno_mx))){
        message("<pheno_mx> is missing row names, set to default. \n")
        rownames(pheno_mx) <- paste("feature",c(1:pheno_nrow), sep = "_")
    }

    if(is.null(n_cores)){
        message("<n_cores> not specified. Using single core for calculations. \n")
        n_cores = 1
    }

    message("------ Pre-processing Data ------- \n")
    # removing 0 variance genes and scaling and centering the phenotype matirx
    pheno_mx <- pre_process_data(pheno_mx, scale_pheno = scale_pheno)

    if(is.null(k_est)){
        message("Missing input for <k_est>, using the number of principal components explaining 99% of total variance \n")
        # running PCA on data
        pca_pheno <- prcomp(t(pheno_mx))
        percent <- (cumsum(pca_pheno$sdev^2) /sum(pca_pheno$sdev^2)) * 100
        k_est <- which(percent > var_cutoff)[1]
        message(k_est," components needed to explain more than ",var_cutoff,"% of the variance \n")
        message("<k_est> set to ", k_est, "\n")

        if(k_est == 1){
            stop("1 component explains more than", var_cutoff, "% of the variance,
            check your data or set <k_est> to a number bigger than 1 \n")
        }
    }

    ica_list <- list()
    ica_result <- list()


    message("------ Running ICA ------- \n")
    message("* This may take some time depending on the size of your dataset \n")
    if(n_runs > 1){
        message("<n_runs> set to ", n_runs, " / initiating multiple ICA runs \n")
        message("- Generating ", n_runs, " replicates using ", n_cores, " core(s) \n")
        message("- Estimated components = ", k_est, "\n")

        ica_list <- parallel::mclapply(1:n_runs,
                                       function(x) fastICA_gene_expr(pheno_mx, k_est,
                                                                     fun = "logcosh",
                                                                     alpha = 1, scale_pheno = FALSE,
                                                                     maxit=500, tol = 0.0001,
                                                                     verbose = FALSE),
                                       mc.cores = n_cores)

        for(i in 1:length(ica_list)){

          ica_list[[i]]$peak_mx <- apply(ica_list[[i]]$S, 2, function(x) 1*(abs(x) > 2*sd(x)))

        }

        # After running ICA sevaral times
        # combined.A <- do.call(rbind, lapply(ica_list, function(x) x$A))
        # combine all components into a single matrix
        combined_S <- do.call(cbind, lapply(ica_list, function(x) x$S))

        # combine all peak position matrices as well
        peak_matrix <- do.call(cbind, lapply(ica_list, function(x) x$peak_mx))

        if(similarity_measure == "peaks"){
            if(0 %in% apply(peak_matrix, 2, sum)){
              cor.mx <- cor(combined_S)
            } else{
              # do element-wise multiplication to only save values for peaks
              peak.component <- peak_matrix * combined_S
              # calculate correlation between components (only with their peak values)
              cor.mx <- cor(peak.component)
            }
        } else {
            # if similarity measure is not based on "peaks"
            # use all elements for calculating the similarity between estimates
            cor.mx <- cor(combined_S)
        }

        # clustering of the components
        # create dissimilarity matrix
        dissimilarity <- 1 - abs(cor.mx)

        # convert into distance matrix format for clustering
        cor.dist <- as.dist(dissimilarity)

        # run hierarchical clustering
        h_clust <- hclust(cor.dist)

        # cut tree at h_clust_cutoff (absolute correlation > 1 - h_clust_cutoff)
        groups <- cutree(h_clust, h=h_clust_cutoff)

        # count member components for each group
        group_table <- table(groups)

        # get groups with more than n_runs * 0.9 members
        multi_component_group <- which(group_table >= (n_runs * 0.9))

        k_update <- length(multi_component_group)
        message("- ",k_update," Replicating Components Estimated \n")

        if(k_update < 1){
            stop('None of the ICs replicated. You could try to the increase sample size or <n_runs>. \n')
        }

        # Averaging the components estimated in multiple runs
        Avg_S <- matrix(0,nrow = dim(combined_S)[1],ncol = k_update)

        # for each group calculate the average component
        for(i in 1:length(multi_component_group)){

          # get component indexes for groups with multiple components
          group_members <- which(groups %in% multi_component_group[i])

          # subset matrix for those components
          sub_group_S <- combined_S[,group_members]

          # calculate correlation between them
          group_cor_mx <- cor(sub_group_S)

          # in order to average the components signs need to be matched (positive vs negative)
          match_sign <- ifelse(group_cor_mx[,1] < 0, -1,1 )

          # calculate the mean component
          avg_component <- (sub_group_S %*% match_sign) / length(match_sign)

          Avg_S[,i] <- avg_component

        }

        hclust_list <- list()
        hclust_list$plot <- h_clust
        hclust_list$cutoff <- h_clust_cutoff
        hclust_list$dist.mx <- dissimilarity
        ica_result$hclust <- hclust_list
        ica_result$S <- Avg_S
        ica_result$A <- solve(t(Avg_S) %*% Avg_S) %*% t(Avg_S) %*% pheno_mx

        rm(Avg_S)

    } else if (n_runs == 1){
        message("<n_runs> set to ", n_runs, " / initiating single ICA run \n")
        message("- Estimated components = ", k_est, "\n")

        ica_result <- fastICA_gene_expr(pheno_mx, k_est,
                                        fun = "logcosh",
                                        alpha = 1, scale_pheno = FALSE,
                                        maxit=500, tol = 0.0001, verbose = FALSE)
        k_update <- k_est
    }

    # Setting appropriate names for signals and mixing matrix
    rownames(ica_result$S) <- rownames(pheno_mx)
    colnames(ica_result$S) <- paste("IC",c(1:dim(ica_result$S)[2]),sep="")
    colnames(ica_result$A) <- colnames(pheno_mx)
    rownames(ica_result$A) <- paste("IC",c(1:dim(ica_result$A)[1]),sep="")

    # Attaching the sample info dataframe to the ica list


    message("------ Post-ICA Processing ------- \n")

    message("- Labeling peak genes in each IC \n")
    # peaks are defined as gene contributions that are larger than 2 standard deviations

    ica_result$peaks <- apply(ica_result$S, 2, peak_detection)
    ica_result$peak_mx <- apply(ica_result$S, 2, function(x) 1*(abs(x) > 2*sd(x)))

    message("- Calculating variance explained by each IC \n")
    # get the total variance by the sums of squares of the scaled pheno_mx
    total_var <- sum(pheno_mx^2)

    # applying IC component-wise variance calculations
    var_IC <- sapply(1:dim(ica_result$A)[1],
                     function (x) IC_variance_calc(ica_result$S[,x], ica_result$A[x,]))

    # % variance explained by each IC
    percent_var <- (var_IC / total_var) * 100

    message("- Sanity Check : Total % of variance explained by ",
            k_update," ICs = ", sum(percent_var), "\n")

    message("- Creating index based on Variance explained \n")
    # ordering the ICs based on the amount of variance they explain
    ica_result$order <- order(percent_var,decreasing = TRUE)
    ica_result$percent_var <- percent_var


    class(ica_result) <- "ICAobject"
    attr(ica_result, 'method') <- "ica"
    attr(ica_result, 'covar_cor') <- "no"

    message("------ Process completed without any interuptions -------  \n")

    return(ica_result)
}
jinhyunju/icreport documentation built on May 19, 2019, 10:35 a.m.