R/MapObject.R

MapObject <- R6::R6Class("MapObject", list(
    
    dual_datasets = FALSE,
    dataset1 = NULL,
    target_col1 = NULL,
    
    dataset2 = NULL,
    target_col2 = NULL,
    joint_indices1 = NULL,
    joint_indices2 = NULL,
    has_missing1 = NULL,
    has_missing2 = NULL,
    
    samples1 = NULL,
    samples2 = NULL,
    
    correlations = NULL,

    initialize = function(dataset1, target_col1, dataset2=NULL, target_col2=NULL, samples1=NULL, 
                          samples2=NULL, matched=FALSE, skip_correlation=FALSE, discard_dups=FALSE) {
        
        self$dataset1 <- dataset1 %>% arrange(UQ(as.name(target_col1)))
        if (discard_dups) {
            self$dataset1 <- self$dataset1 %>% distinct(UQ(as.name(target_col1)), .keep_all=TRUE)
        }
        self$target_col1 <- target_col1
        
        if (!is.null(dataset2) && !is.null(target_col2)) {
            self$dual_datasets <- TRUE
            self$dataset2 <- dataset2 %>% arrange(UQ(as.name(target_col2)))
            if (discard_dups) {
                self$dataset2 <- self$dataset2 %>% distinct(UQ(as.name(target_col2)), .keep_all=TRUE)
            }
            self$target_col2 <- target_col2
            self$joint_indices1 <- which(self$dataset1[[target_col1]] %in% self$dataset2[[target_col2]])
            self$joint_indices2 <- which(self$dataset2[[target_col2]] %in% self$dataset1[[target_col1]])
        }
        
        if (!is.null(samples1) && length(samples1) > 0) {
            if (!all(samples1 %in% colnames(dataset1))) {
                warning("Invalid samples, not all present in colnames")
            }
            self$samples1 <- samples1
        }
        
        if (!is.null(samples2) && length(samples2) > 0) {
            if (!all(samples2 %in% colnames(dataset2))) {
                warning("Invalid samples, not all present in colnames")
            }
            self$samples2 <- samples2
        }
        
        if (matched) {
            
            if (length(self$samples1) != length(self$samples2)) {
                warning(sprintf("Number of samples (%s and %s) does not match, not calculating correlations", length(self$samples1), length(self$samples2)))
            }
            else if (skip_correlation) {
                message("Option 'skip_correlation' set, skipping...")
            }
            else {            
                message("Calculating correlations")
                ref_rdf <- self$dataset1
                comp_rdf <- self$dataset2
                
                ref_id_col <- ref_rdf %>% dplyr::select(self$target_col1) %>% unlist() %>% unname()
                comp_id_col <- comp_rdf %>% dplyr::select(self$target_col2) %>% unlist() %>% unname()
                
                joint_ids <- ref_id_col[ref_id_col %in% comp_id_col]
                
                ref_rdf_joint <- ref_rdf %>% 
                    dplyr::filter(UQ(as.name(self$target_col1)) %in% joint_ids) %>% 
                    arrange(UQ(as.name(self$target_col1)))
                comp_rdf_joint <- comp_rdf %>% 
                    dplyr::filter(UQ(as.name(self$target_col2)) %in% joint_ids) %>% 
                    arrange(UQ(as.name(self$target_col2)))
                
                ref_sdf_joint <- ref_rdf_joint %>% dplyr::select(self$samples1)
                comp_sdf_joint <- comp_rdf_joint %>% dplyr::select(self$samples2)
                
                showNotification(sprintf("Correlation started for %s features, might take a few moments...", nrow(ref_sdf_joint)))
                self$correlations <- self$generate_correlation_table(ref_sdf_joint, comp_sdf_joint, id_column=ref_rdf_joint[[self$target_col1]])
            }
        }
    },
    # Provides correlation values for Pearson, Spearman and Kendall
    generate_correlation_table = function(ref_sdf_joint, comp_sdf_joint, id_column=NULL, corr_types=list("pearson", "spearman", "kendall")) {
        
        corrs <- lapply(
            corr_types,
            function(corr_type) {
                lapply(seq_len(nrow(ref_sdf_joint)), function(row_i, ref_mat, comp_mat, corr_type, ids) {
                    ref_row <- ref_mat[row_i, ] %>% unlist()
                    comp_row <- comp_mat[row_i, ] %>% unlist()
                    if (length(na.omit(ref_row + comp_row)) < 3) {
                        data.frame(pval=NA, cor=NA)
                    }
                    else {
                        cor_val <- cor.test(ref_row, comp_row, na.action="pairwise.complete.obs", method=corr_type, exact=FALSE)
                        data.frame(pval=cor_val$p.value, cor=cor_val$estimate)
                    }
                }, ref_mat=ref_sdf_joint, comp_mat=comp_sdf_joint, corr_type=corr_type) %>% 
                    do.call("rbind", .) %>%
                    rename_all(~paste(corr_type, ., sep="."))
            }
        ) %>% 
            do.call("cbind", .) %>% 
            dplyr::mutate(
                pearson.fdr=p.adjust(pearson.pval, method = "BH"),
                spearman.fdr=p.adjust(spearman.pval, method = "BH"),
                kendall.fdr=p.adjust(kendall.pval, method = "BH")
            )
        
        if (!is.null(id_column)) {
            corrs <- corrs %>%
                dplyr::mutate(id=id_column) %>%
                dplyr::select(id, everything())
        }
        corrs
    },
    has_correlations = function() {
        !is.null(self$correlations)
    },
    get_full_entries = function(dataset, samples) {
        
        sdf <- dataset[, samples]
        complete.cases(sdf) & !apply(sdf, 1, function(elem) { any(is.infinite(elem)) } )
    },
    get_dataset1_nrow = function() {
        self$dataset1 %>% nrow()
    },
    get_dataset2_nrow = function() {
        self$dataset2 %>% nrow()
    },
    get_matching_dataset1 = function() {
        if (!is.null(self$dataset2)) {
            self$dataset1[self$joint_indices1, ]
        }
        else {
            self$dataset1
        }
    },
    get_matching_dataset2 = function() {
        if (!is.null(self$dataset2)) {
            self$dataset2[self$joint_indices2, ]
        }
        else {
            stop("Second dataset not present!")
        }
    },
    has_full_entries = function() {
        if (!is.null(self$dataset1) && !is.null(self$dataset2)) {
            !is.null(self$samples1) && !is.null(self$samples2)
        }
        else if (!is.null(self$dataset1)) {
            !is.null(self$samples1)
        }
        else if (!is.null(self$dataset2)) {
            !is.null(self$samples2)
        } 
        else {
            stop("Unknown situation, for self$dataset1 and self$dataset2: ", self$dataset1, " ", self$dataset2)
        }
    },
    has_combined = function() {
        length(self$joint_indices1) > 0
    },
    has_same_number_entries = function() {
        length(self$joint_indices1) == length(self$joint_indices2)
    },
    prepare_single_dataset = function(out_df, samples, sample_prefix, only_no_na_entries) {
        if (only_no_na_entries) {
            out_df_full_entries <- out_df %>% self$get_full_entries(samples)
            out_df <- out_df[out_df_full_entries, ]
        }
        colnames(out_df) <- paste0(sample_prefix, ".", colnames(out_df))
        out_df
    },
    # only_no_na_entries: Include only entries with no missing values
    # include_one_dataset_entries: Include entries only present in one dataset by including a row of NA-values in the other
    get_combined_dataset = function(only_no_na_entries = FALSE, include_one_dataset_entries = TRUE) {
    
        if (only_no_na_entries && include_one_dataset_entries) {
            include_one_dataset_entries <- FALSE
        }
        
        if (!is.null(self$dataset1) && !is.null(self$dataset2)) {
            if (!self$has_combined() || !self$has_same_number_entries()) {
                return(NULL)
            }
            
            if (!include_one_dataset_entries) {
                
                out_df1 <- self$dataset1[self$joint_indices1, ]
                out_df2 <- self$dataset2[self$joint_indices2, ]
                
                if (only_no_na_entries) {
                    out_df1_full_entries <- out_df1 %>% self$get_full_entries(self$samples1)
                    out_df2_full_entries <- out_df2 %>% self$get_full_entries(self$samples2)
                    all_full_entries <- out_df1_full_entries & out_df2_full_entries
                    out_df1 <- out_df1[all_full_entries, ]
                    out_df2 <- out_df2[all_full_entries, ]
                }
                colnames(out_df1) <- paste0("d1.", colnames(out_df1))
                colnames(out_df2) <- paste0("d2.", colnames(out_df2))
                out_df <- cbind(out_df1, out_df2)
            }
            else {
                out_df1 <- self$dataset1 %>% rename_all(~paste0("d1.", .))
                out_df2 <- self$dataset2 %>% rename_all(~paste0("d2.", .))
                out_df <- full_join(out_df1, out_df2, by=setNames(paste0("d2.", self$target_col2), paste0("d1.", self$target_col1)), keep=TRUE)
            }
            
            if (!is.null(self$correlations)) {
                out_df <- out_df %>% left_join(., self$correlations, by=setNames("id", sprintf("d1.%s", self$target_col1)))
            }
            
            out_df
        }
        else if (!is.null(self$dataset1)) {
            out_df <- self$prepare_single_dataset(self$dataset1, self$samples1, "d1", only_no_na_entries)
            out_df
        }
        else if (!is.null(self$dataset2)) {
            out_df <- self$prepare_single_dataset(self$dataset2, self$samples2, "d2", only_no_na_entries)
            out_df
        }
        else {
            stop("Unknown situation, for self$dataset1 and self$dataset2: ", self$dataset1, " ", self$dataset2)
        }
        
        if (nrow(out_df) > 0) {
            out_with_id <- cbind(comb_id = paste0("C", seq_len(nrow(out_df))), out_df)
            out_with_id
        } 
        else {
            NULL
        }
    }
))
ComputationalProteomics/OmicLoupe documentation built on Feb. 12, 2023, 3:57 p.m.