R/add_edge_data.R

Defines functions add_edge_data

Documented in add_edge_data

#' Annotate KEGG edge mappings with user data
#' @description Add data column[s] to object created from function 
#' expand_KEGG_edges 
#' @export
#' @importFrom gtools smartbind
#' @importFrom plyr rename
#' @param expanded_edges The data frame object generated via the function 
#' expand_KEGG_edges  
#' @param KEGG_mappings KEGG_mappings The data.frame object generated by the 
#' function expand_KEGG_mappings
#' @param user_data A data frame where in which the first two columns contain 
#' gene symbols representing an edge and any/all other column[s] contain 
#' corresponding edge data.
#' @param data_column_no The column index for desired user data to be added
#' @param only_mapped A logical indicator; if set to FALSE will return 'de-novo'
#' edges that 'exist' in data but are not documented in KEGG 
#' @return A data frame object with detailed KEGG edge mappings annotated with 
#' user data
#' @examples 
#' p53_KGML <- get_KGML('hsa04115')
#' p53_KEGG_mappings <- expand_KEGG_mappings(p53_KGML)
#' p53_edges <- expand_KEGG_edges(p53_KGML, p53_KEGG_mappings)
#' p53_HA1E_data <- overlap_info(p53_KGML, p53_KEGG_mappings, 'HA1E', 
#'                                data_type = '100_bing', only_mapped = FALSE)
#'                                
#' p53_edges_HA1E_ALL <- add_edge_data(p53_edges, p53_KEGG_mappings, 
#'                                      p53_HA1E_data, c(3, 10,12))
#' p53_edges_HA1E_MAPPED <- add_edge_data(p53_edges, p53_KEGG_mappings, 
#'                                         p53_HA1E_data, c(3, 10,12), 
#'                                         only_mapped = TRUE)


add_edge_data <- function(expanded_edges, KEGG_mappings, 
                        user_data, data_column_no = 3, only_mapped = FALSE) {
    
    expanded_edges <- expanded_edges[expanded_edges$type != "maplink", ]
    
    if ("pre_mapped" %in% names(user_data)) {
        user_data <- user_data[, -c(which(colnames(user_data) == "pre_mapped"))]
    }
    
    if (nrow(expanded_edges) > 0) {
        
        expanded_edges$unique_ID = paste0(expanded_edges$entry1symbol, ",", 
            expanded_edges$entry2symbol)
        expanded_edges$unique_IDR = paste0(expanded_edges$entry2symbol, ",", 
            expanded_edges$entry1symbol)
        user_data$unique_ID = paste0(user_data$knockout1, ",", 
                                        user_data$knockout2)
        pre_mapped1 <- subset(user_data, user_data$unique_ID %in% 
                                    expanded_edges$unique_ID)
        pre_mapped2 <- subset(user_data, user_data$unique_ID %in% 
                                    expanded_edges$unique_IDR)
        pre_mapped2 <- pre_mapped2[, c(2, 1, 3:ncol(pre_mapped2))]
        names(pre_mapped2) = names(pre_mapped1)
        if (nrow(pre_mapped2) >= 1 & nrow(pre_mapped1) >= 1) {
            pre_mapped2$unique_ID <- paste0(pre_mapped2$knockout1, ",", 
                                            pre_mapped2$knockout2)
            pre_mapped <- rbind(pre_mapped1, pre_mapped2)
        }
        if (nrow(pre_mapped2) == 0 & (nrow(pre_mapped1) == 0)) {
            pre_mapped <- data.frame(unique_ID = NA)
        }
        if (nrow(pre_mapped1) == 0 & nrow(pre_mapped2) >= 1) {
            pre_mapped2$unique_ID <- paste0(pre_mapped2$knockout1, ",", 
                                            pre_mapped2$knockout2)
            pre_mapped <- pre_mapped2
        }
        if (nrow(pre_mapped2) == 0 & nrow(pre_mapped1) >= 1) {
            pre_mapped <- pre_mapped1
        }
        if (!is.na(pre_mapped[1, 1])) {
            expanded_edges_1 <- subset(expanded_edges, expanded_edges$unique_ID
                                        %in% pre_mapped$unique_ID)
            expanded_edges_1 <- expanded_edges[expanded_edges$unique_ID %in% 
                pre_mapped$unique_ID, ]
            expanded_edges_2 <- expanded_edges[!expanded_edges$unique_ID %in% 
                pre_mapped$unique_ID, ]
            expanded_edges_1$has_data = 1
            testval <- nrow(expanded_edges_2)
            if (testval > 0) {
                expanded_edges_2$has_data = 0
                edge_set <- rbind(expanded_edges_1, expanded_edges_2)
            } else {
                edge_set <- expanded_edges_1
            }
            
            edge_set <- edge_set[order(edge_set$unique_ID), ]
            
            data_to_add <- cbind(as.character(pre_mapped[, "unique_ID"]), 
                                pre_mapped[, data_column_no])
            data_to_add <- data.frame(data_to_add, stringsAsFactors = FALSE)
            names(data_to_add)[1] <- "unique_ID"
            if (ncol(data_to_add) == 2) {
                names(data_to_add)[2] <- names(pre_mapped)[data_column_no]
            }
            
            data_to_add <- data_to_add[order(data_to_add$unique_ID), ]
            
            annotated_edges <- merge(edge_set, data_to_add, "unique_ID", 
                                     all.x = TRUE)
            drops <- c("unique_ID", "unique_IDR")
            annotated_edges <- annotated_edges[, !(names(annotated_edges) %in% 
                drops)]
            cat(paste0("Number of edges documented in selected pathway = ", 
                nrow(annotated_edges)), "\n")
            cat(paste0("Number of edges with corresponding user data = ", 
                        sum(annotated_edges$has_data), "\n"))
            cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/
                        nrow(annotated_edges) * 100, 2), "%", "\n"))
            annotated_edges$premapped <- 1
            un_mapped <- subset(user_data, !user_data$unique_ID %in% 
                                expanded_edges$unique_ID & 
                            !user_data$unique_ID %in% expanded_edges$unique_IDR)
            un_mapped_edges <- un_mapped[, c(1:2, data_column_no,
                                            ncol(un_mapped))]
            
            if (only_mapped) {
                return(annotated_edges)
            }
        } else {
            annotated_edges <- expanded_edges
            annotated_edges$premapped <- 1
            annotated_edges$has_data <- 0
            un_mapped <- user_data
            un_mapped$unique_ID <- paste0(un_mapped$knockout1, 
                                        un_mapped$knockout2)
            un_mapped_edges <- un_mapped[, c(1:2, data_column_no, 
                                            ncol(un_mapped))]
            un_mapped_edges$premapped <- 0
            un_mapped_edges$has_data <- 0
            cat(paste0("Number of edges documented in selected pathway = ", 
                nrow(annotated_edges)), "\n")
            cat(paste0("Number of edges with corresponding user data = ", 
                        sum(annotated_edges$has_data), "\n"))
            cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/
                        nrow(annotated_edges) * 100, 2), "%", "\n"))
            if (only_mapped) {
                cat(paste0("No documented edges are found in data; 
                    only data for de-novo edges can be mapped \n"))
                return(annotated_edges)
            }
        }
    } else if (nrow(expanded_edges) == 0) {
        annotated_edges <- expanded_edges
        annotated_edges$premapped <- 1
        annotated_edges$has_data <- 0
        un_mapped <- user_data
        un_mapped$unique_ID <- paste0(un_mapped$knockout1, un_mapped$knockout2)
        un_mapped_edges <- un_mapped[, c(1:2, data_column_no, ncol(un_mapped))]
        un_mapped_edges$premapped <- 0
        un_mapped_edges$has_data <- 0
        cat(paste0("Number of edges documented in selected pathway = ", 
                    nrow(annotated_edges)), "\n")
        cat(paste0("Number of edges with corresponding user data = ", 
                    sum(annotated_edges$has_data), "\n"))
        cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/
                    nrow(annotated_edges) * 100, 2), "%", "\n"))
        if (only_mapped) {
            return(annotated_edges)
        }
    }
    names(un_mapped_edges)[1:2] <- c("entryNAME_1", "entryNAME_2")
    for (i in 1:nrow(un_mapped_edges)) {
        un_mapped_edges$Source_eid[i] <- 
            list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL == 
            un_mapped_edges$entryNAME_1[i])])
        un_mapped_edges$Target_eid[i] <- 
            list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL == 
            un_mapped_edges$entryNAME_2[i])])
        x <- length(unlist(un_mapped_edges$Source_eid[i]))
        y <- length(unlist(un_mapped_edges$Target_eid[i]))
        if (x > 1 | y > 1) {
            un_mapped_edges$simple[i] <- FALSE
        } else {
            un_mapped_edges$simple[i] <- TRUE
        }
    }
    simple_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == TRUE, 
        -ncol(un_mapped_edges)]
    complex_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == FALSE, 
        -ncol(un_mapped_edges)]
    test_val <- nrow(complex_un_mapped_edges)
    if (test_val == 0) {
        un_mapped_edges <- simple_un_mapped_edges
    } else {
        keeps <- c("Source_eid", "Target_eid", "unique_ID")
        c_temp <- complex_un_mapped_edges[, (names(complex_un_mapped_edges) %in% 
            keeps)]
        
        for (i in 1:nrow(c_temp)) {
            x <- length(unlist(c_temp$Source_eid[i]))
            y <- length(unlist(c_temp$Target_eid[i]))
            l <- x * y
            
            c_temp$Source_eid[i] <- list(sort(unlist(rep(c_temp$Source_eid[i], 
                y))))
            c_temp$Target_eid[i] <- list(unlist(rep(c_temp$Target_eid[i], x)))
            c_temp$unique_ID[i] <- list(rep(c_temp$unique_ID[i], l))
        }
        
        c_temp <- data.frame(unique_ID = unlist(c_temp$unique_ID), Source_eid = 
                            unlist(c_temp$Source_eid),
                            Target_eid = unlist(c_temp$Target_eid))
        drops <- c("Source_eid", "Target_eid")
        complex_un_mapped_edges <- 
            complex_un_mapped_edges[, !(names(complex_un_mapped_edges) %in% 
            drops)]
        complex_un_mapped_edges <- merge(complex_un_mapped_edges, c_temp)
        un_mapped_edges <- rbind(simple_un_mapped_edges, 
                                 complex_un_mapped_edges)
    }
    un_mapped_edges$Source_eid <- unlist(un_mapped_edges$Source_eid)
    un_mapped_edges$Target_eid <- unlist(un_mapped_edges$Target_eid)
    un_mapped_edges <- plyr::rename(un_mapped_edges, c(Source_eid = "entry1", 
        Target_eid = "entry2", entryNAME_1 = "entry1symbol", 
        entryNAME_2 = "entry2symbol"))
    un_mapped_edges$subtype1 <- "de_novo"
    
    drops <- c("unique_ID")
    un_mapped_edges <- un_mapped_edges[, !(names(un_mapped_edges) %in% drops)]
    un_mapped_edges$has_data <- 1
    un_mapped_edges$premapped <- 0
    for (i in 1:nrow(un_mapped_edges)) {
        un_mapped_edges$entry1accession[i] <- 
            KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL == 
            un_mapped_edges$entry1symbol[i])][1]
        un_mapped_edges$entry2accession[i] <- 
            KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL == 
            un_mapped_edges$entry2symbol[i])][1]
        un_mapped_edges$entry1type[i] <- 
            KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL == 
            un_mapped_edges$entry1symbol[i])][1]
        un_mapped_edges$entry2type[i] <- 
            KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL == 
            un_mapped_edges$entry2symbol[i])][1]
    }
    
    if (nrow(annotated_edges) > 0 & nrow(un_mapped_edges) > 0) {
        all_edges <- gtools::smartbind(annotated_edges, un_mapped_edges)
    } else if (nrow(annotated_edges) > 0) 
        all_edges <- annotated_edges else {
        all_edges <- un_mapped_edges
        all_edges$value <- NA
        all_edges$subtype2 <- NA
        all_edges$value2 <- NA
        all_edges$specific_subtype <- NA
        all_edges$type <- NA
        all_edges$is_direct <- 1
        all_edges$edgeID <- seq(1:nrow(un_mapped_edges))
        refcols <- c("edgeID", "entry1accession", "entry2accession", "entry1", 
            "entry2")
        all_edges <- all_edges[, c(refcols, setdiff(names(all_edges), refcols))]
        cat(paste0("All documented edges are of type maplink; only data for 
                    de-novo edges can be mapped \n"))
    }
    
    for (i in 1:nrow(all_edges)) {
        if (all_edges$entry1[i] == all_edges$entry2[i]) {
            all_edges$paralogs[i] <- 1
        } else {
            all_edges$paralogs[i] <- 0
        }
    }
    return(all_edges)
}

Try the KEGGlincs package in your browser

Any scripts or data that you put into this service are public.

KEGGlincs documentation built on May 31, 2017, 2:36 p.m.