R/capri.hypotheses.R

Defines functions pairwise.fisher.test hypothesis.adj.matrix get.lifted.pattern emap XOR OR AND aux.log hypotheses.expansion hypothesis.add.homologous hypothesis.add.group hypothesis.lifted.effects hypothesis.add

Documented in AND hypothesis.add hypothesis.add.group hypothesis.add.homologous OR XOR

#### TRONCO: a tool for TRanslational ONCOlogy
####
#### Copyright (c) 2015-2017, Marco Antoniotti, Giulio Caravagna, Luca De Sano,
#### Alex Graudenzi, Giancarlo Mauri, Bud Mishra and Daniele Ramazzotti.
####
#### All rights reserved. This program and the accompanying materials
#### are made available under the terms of the GNU GPL v3.0
#### which accompanies this distribution.

#' Add a new hypothesis by creating a new event and adding it to the compliant genotypes
#' @title hypothesis add
#' @param data A TRONCO compliant dataset.
#' @param pattern.label Label of the new hypothesis.
#' @param lifted.pattern Vector to be added to the lifted genotype resolving the pattern related to the new hypothesis 
#' @param pattern.effect Possibile effects for the pattern. 
#' @param pattern.cause Possibile causes for the pattern. 
#' @return A TRONCO compliant object with the added hypothesis
#' @export hypothesis.add
#' @importFrom stats fisher.test
#' @importFrom methods getPackageName is
#'
hypothesis.add <- function(data,
                           pattern.label,
                           lifted.pattern,
                           pattern.effect = "*",
                           pattern.cause = "*") {


    pattern.label = gsub('[[:space:]]+', '_', pattern.label)

    

    is.compliant(data)

    # check if there is a reconstructed model
    if(has.model(data)) {
        stop('This dataset has a reconstructed model and no more hypothesis can be added.')
    }

    genotypes = data$genotypes;
    annotations = data$annotations;

    if (!is.null(data$hypotheses)) {
        hypotheses = data$hypotheses;
    } else {
        hypotheses = NA;
    }
    
    ## The Boolean functions look for a set of variables.
    ## If there are already variables named as the ones
    ## used here, make the backup of them.
    
    do.roll.back.lifting.genotypes = FALSE;
    do.roll.back.lifting.annotations = FALSE;
    do.roll.back.lifting.edges = FALSE;
    do.roll.back.fisher.pvalues = FALSE;

    ## I need a variable to save the genotypes of the
    ## lifted pattern.
    
    ## If there is already a variable named
    ## lifting.genotypes, make the backup of it.

    hypotheses.env = get('hypotheses.env', asNamespace(getPackageName()))
    
    if (exists("lifting.genotypes", hypotheses.env)) {
        roll.back.lifting.genotypes = get('lifting.genotypes', envir = hypotheses.env)
        do.roll.back.lifting.genotypes = TRUE;
    }
    assign("lifting.genotypes", genotypes, envir = hypotheses.env);

    ## I need a variable to save the annotations of the
    ## lifted pattern.
    
    ## if there is already a variable named
    ## lifting.annotations, make the backup of it.
    
    if (exists("lifting.annotations", hypotheses.env)) {
        roll.back.lifting.annotations = get('lifting.annotations', envir = hypotheses.env)
        do.roll.back.lifting.annotations = TRUE;
    }
    assign("lifting.annotations", annotations, envir = hypotheses.env);

    ## I need a variable to save the edges of the lifted
    ## pattern.
    
    ## If there is already a variable named lifting.edges,
    ## make the backup of it.
    
    if (exists("lifting.edges", hypotheses.env)) {
        roll.back.lifting.edges = get('lifting.edges', envir = hypotheses.env)
        do.roll.back.lifting.edges = TRUE
    }
    assign("lifting.edges", NULL, envir = hypotheses.env);

    ## I need a variable to save the pvalues of the lifted
    ## pattern.
    
    ## If there is already a variable named fisher.pvalues,
    ## make the backup of it.
    
    if (exists("fisher.pvalues", hypotheses.env)) {
        roll.back.fisher.pvalues = get('fisher.pvalues', envir = hypotheses.env)
        do.roll.back.fisher.pvalues = TRUE;
    }
    assign("fisher.pvalues", NULL, envir = hypotheses.env);

    ## Save the lifted genotypes and its hypotheses for the
    ## current pattern.
    
    curr_pattern = lifted.pattern$pattern;
    curr_hypotheses = lifted.pattern$hypotheses;
    curr_pvalues = get('fisher.pvalues', envir = hypotheses.env)

    ## Save the edges of the lifted pattern.
    
    hstructure = get('lifting.edges', envir = hypotheses.env)

    ## Roll back to the previous value of the variable
    ## lifting.genotypes if any or remove it.
    
    if (do.roll.back.lifting.genotypes) {
        assign("lifting.genotypes",
               roll.back.lifting.genotypes,
               envir = hypotheses.env);
    } else {
        rm('lifting.genotypes', pos = hypotheses.env);
    }

    ## Roll back to the previous value of the variable
    ## lifting.annotations if any or remove it.
    
    if (do.roll.back.lifting.annotations) {
        assign("lifting.annotations",
               roll.back.lifting.annotations,
               envir = hypotheses.env);
    } else {
        rm('lifting.annotations', pos = hypotheses.env)
    }

    ## Roll back to the previous value of the variable
    ## lifting.edges if any or remove it.
    
    if (do.roll.back.lifting.edges) {
        assign("lifting.edges",
               roll.back.lifting.edges,
               envir = hypotheses.env)
    } else {
        rm('lifting.edges', pos = hypotheses.env)
    }

    ## Roll back to the previous value of the variable
    ## fisher.pvalues if any or remove it.
    
    if (do.roll.back.fisher.pvalues) {
        assign("fisher.pvalues",
               roll.back.fisher.pvalues,
               envir = hypotheses.env)
    } else {
        rm('fisher.pvalues', pos = hypotheses.env)
    }

    ## Set the hypotheses number.
    
    if (!is.na(hypotheses[1])) {
        num.hypotheses = hypotheses$num.hypotheses
    } else {
        num.hypotheses = 0
    }

    ## * is a special pattern.effect which indicates to use all
    ## the events as effects for this pattern.
    
    is.to.all.effects = FALSE;
    if (pattern.effect[[1]][1] == "*") {

        pattern.effect =
            colnames(genotypes)[1:(length(colnames(genotypes)) - num.hypotheses)];

        ## Any event can not be both causes and effects for the
        ## pattern to be well-formed.
        
        pattern.effect =
            list(pattern.effect[-which((pattern.effect %in% unlist(curr_hypotheses$llist)))]);
        is.to.all.effects = TRUE;

        if (length(pattern.effect) == 0) {
            stop(paste("[ERR] Missing list of effects to test or wildcard \'*\'.",
                       sep = ''));
        }
    }

    ## Check the pattern to be well-formed.
    
    all.col.nums = vector();
    if (length(pattern.effect) == 0) {
        stop(paste("[ERR] Missing list of effects or wildcard \'*\'.",
                   sep = ''));
    } else {

        ## Check the effects of the pattern to be well-formed.
        
        for (i in 1:length(pattern.effect)) {

            curr.pattern.effect = pattern.effect[[i]];
            if (is.to.all.effects == FALSE) {
                col.num = -1;
                if (length(curr.pattern.effect) == 1) {
                    event.map =
                        emap(c(curr.pattern.effect, "*"),
                             genotypes,
                             annotations);
                    col.num = event.map$col.num;
                    events.name = event.map$events.name;
                } else if (length(curr.pattern.effect) == 2) {
                    event.map =
                        emap(curr.pattern.effect,
                             genotypes,
                             annotations);
                    col.num = event.map$col.num;
                    events.name = event.map$events.name;
                }
            } else {
                col.num = which(colnames(genotypes) %in% curr.pattern.effect);
                if (length(col.num) == 0) {
                    col.num = -1;
                }
                events.name = curr.pattern.effect;
            }

            ## Check the effect to be a valid event.
            
            if (col.num[1] == -1) {
                stop(paste("[ERR] Unknown gene among effects: \"",
                           curr.pattern.effect,
                           "\".",
                           sep = ''));
            }
            all.col.nums = append(all.col.nums,col.num);

            ## Check the pattern to be well-formed.
            ## If the effect is in the pattern, the pattern is not
            ## well-formed.
            
            if (length(which(unlist(curr_hypotheses$llist) %in%
                             events.name)) > 0) {
                stop(paste("[ERR] Bad formed pattern, event \"",
                           curr.pattern.effect,
                           "\" yields a loop.",
                           sep = ''));
            }
        }
    }

    ## Look for duplicated effects in the pattern.
    
    if (anyDuplicated(all.col.nums) > 0) {
        stop(paste("[ERR] Bad formed pattern, duplicated events ", 
                   paste(pattern.effect[duplicated(pattern.effect)],
                         collapse = ', ',
                         sep = ''),
                   "within effects.",
                   sep = ''));
    }

    ## Check that the we are not duplicating any name by adding
    ## the new pattern.
    
    if (length(which(colnames(genotypes) == pattern.label)) > 0) {
        stop(paste("[ERR] This pattern already exists.", sep = ''));
    }

    ## Add the pattern to the genotypes.
    
    genotypes = cbind(genotypes,curr_pattern);

    ## Check that the pattern is valid according to Suppes'
    ## theory.
    ## Compute the observed and observed joint probabilities.
    
    pair.count <-
        array(0, dim = c(ncol(genotypes), ncol(genotypes)));
    
    ## Compute the probabilities on the genotypes.
    for (i in 1:ncol(genotypes)) {
        for (j in 1:ncol(genotypes)) {
            val1 = genotypes[ , i];
            val2 = genotypes[ , j];
            pair.count[i, j] = (t(val1) %*% val2);
        }
    }
    
    ## Marginal.probs is an array of the observed marginal
    ## probabilities.
    
    marginal.probs <-
        array(as.matrix(diag(pair.count) / nrow(genotypes)),
              dim = c(ncol(genotypes), 1));
    
    ## joint.probs is an array of the observed joint probabilities
    
    joint.probs <- as.matrix(pair.count / nrow(genotypes));
    
    ## Check that the probability of the pattern is in (0,1)
    
    if (marginal.probs[ncol(genotypes)] == 0
        || marginal.probs[ncol(genotypes)] == 1) {
        stop(paste("[ERR] The pattern has marginal probability ",
                   marginal.probs[ncol(genotypes)], 
                   ", which should be in (0, 1).",
                   sep = ''));
    }

    ## Check that the pattern does not duplicate any existing
    ## column.
    
    i = ncol(genotypes);
    for (j in 1:ncol(genotypes)) {

        ## If the edge is valid, i.e., not self cause.
        
        if (i != j) {

            ## If the two considered events are not
            ## distinguishable.
            
            if ((joint.probs[i, j] / marginal.probs[i]) == 1
                && (joint.probs[i, j] / marginal.probs[j]) == 1) {
                stop(paste("[ERR] Pattern duplicates ",
                           paste(as.events(data)[j , ],
                                 collapse = ' ',
                                 sep = ''), 
                           ".",
                           sep = ''));
            }
        }
    }

    ## * is a special pattern.cause which indicates to use all the
    ## events as causes for this pattern.
    
    is.to.all.causes = FALSE;
    if (pattern.cause[[1]][1] == "*") {

        pattern.cause =
            colnames(genotypes)[1:(length(colnames(genotypes)) - num.hypotheses-1)];

        ## Any event can not be both causes and effects for the
        ## pattern to be well-formed.
        
        pattern.cause =
            list(pattern.cause[- which((pattern.cause %in% unlist(curr_hypotheses$llist)))]);
        is.to.all.causes = TRUE;
    }

    ## Check the pattern to be well-formed.
    
    all.col.nums = vector();
    if (length(pattern.cause) > 0) {

        ## Check the causes of the pattern to be well-formed.
        
        for (i in 1:length(pattern.cause)) {

            curr.pattern.cause = pattern.cause[[i]];
            if (is.to.all.causes == FALSE) {
                col.num = -1;
                if (length(curr.pattern.cause) == 1) {
                    event.map =
                        emap(c(curr.pattern.cause, "*"),
                             genotypes,
                             annotations);
                    col.num = event.map$col.num;
                    events.name = event.map$events.name;
                } else if (length(curr.pattern.cause) == 2) {
                    event.map =
                        emap(curr.pattern.cause,
                             genotypes,
                             annotations);
                    col.num = event.map$col.num;
                    events.name = event.map$events.name;
                }
            } else {
                col.num = which(colnames(genotypes) %in% curr.pattern.cause);
                if (length(col.num) == 0) {
                    col.num = -1;
                }
                events.name = curr.pattern.cause;
            }

            ## Check the cause to be a valid event.
            
            if (col.num[1] == -1) {
                stop(paste("[ERR] Unknown gene among causes: \"",
                           curr.pattern.cause,
                           "\".",
                           sep = ''));
            }
            all.col.nums = append(all.col.nums, col.num);

            ## Check the pattern to be well-formed.
            ## If the cause is in the pattern, the pattern is not
            ## well-formed.
            
            if (length(which(unlist(curr_hypotheses$llist) %in% events.name)) > 0) {
                stop(paste("[ERR] Bad formed pattern, event \"",
                           curr.pattern.cause,
                           "\" yields a loop.",
                           sep = ''));
            }
        }
    }

    ## Look for duplicated causes in the pattern.
    
    if (anyDuplicated(all.col.nums)>0) {
        stop(paste("[ERR] Bad formed pattern, duplicated events ", 
                   paste(pattern.cause[duplicated(pattern.cause)],
                         collapse = ', ',
                         sep = ''),
                   "within causes.",
                   sep = ''));
    }

    ## Now I can finally add the hypothesis.
    
    colnames(genotypes)[ncol(genotypes)] = pattern.label;
    if (is.na(hypotheses[1])) {
        hypotheses = list();
    }
    hypotheses$num.hypotheses = num.hypotheses + 1;

    ## Add the new hypothesis in the annotations.
    
    annotations = rbind(data$annotations,c("Pattern", pattern.label));
    rownames(annotations)[nrow(annotations)] = pattern.label;

    ## Add the color of the type "Hypothesis" is not already
    ## defined.
    
    if (any(rownames(data$types) == "Pattern") == FALSE) {
        types = rbind(data$types, 'slateblue');
        rownames(types)[nrow(types)] = "Pattern";
        data$types = types;
    }

    ## Create the list of added hypotheses.
    
    if (length(hypotheses$hlist)==0) {
        hypotheses$hlist = vector();
    }
    
    ## Add the new hypothesis to the list.
    for (i in 1:length(pattern.effect)) {
        curr.pattern.effect = pattern.effect[[i]];
        if (is.to.all.effects == FALSE) {
            if (length(curr.pattern.effect) == 1) {
                event.map =
                    emap(c(curr.pattern.effect, "*"),
                         genotypes,
                         annotations);
                col.num = event.map$col.num;
            } else if (length(curr.pattern.effect) == 2) {
                event.map =
                    emap(curr.pattern.effect,
                         genotypes,
                         annotations);
                col.num = event.map$col.num;
            }
        } else {
            col.num = which(colnames(genotypes) %in% curr.pattern.effect);
            if (length(col.num) == 0) {
                col.num = -1;
            }
        }
        
        for (j in 1:length(col.num)) {
            hypotheses$hlist =
                rbind(hypotheses$hlist,
                      t(c(colnames(genotypes)[ncol(genotypes)],
                          colnames(genotypes)[col.num[j]])));
        }
        if (is.null(colnames(hypotheses$hlist))) {
            colnames(hypotheses$hlist) = c("cause", "effect");
        }
    }

    ## Add the causes of the new hypothesis to the list.
    
    if (length(pattern.cause) > 0) {
        for (i in 1:length(pattern.cause)) {
            curr.pattern.cause = pattern.cause[[i]];
            if (is.to.all.causes==FALSE) {
                if (length(curr.pattern.cause)==1) {
                    event.map =
                        emap(c(curr.pattern.cause, "*"),
                             genotypes,
                             annotations);
                    col.num = event.map$col.num;
                }
                else if (length(curr.pattern.cause)==2) {
                    event.map =
                        emap(curr.pattern.cause,
                             genotypes,
                             annotations);
                    col.num = event.map$col.num;
                }
            } else {
                col.num = which(colnames(genotypes) %in% curr.pattern.cause);
                if (length(col.num) == 0) {
                    col.num = -1;
                }
            }
            
            for (j in 1:length(col.num)) {
                hypotheses$hlist =
                    rbind(hypotheses$hlist,
                          t(c(colnames(genotypes)[col.num[j]],
                              colnames(genotypes)[ncol(genotypes)])));
            }
            
            if (is.null(colnames(hypotheses$hlist))) {
                colnames(hypotheses$hlist) = c("cause", "effect");
            }
        }
    }

    ## Create the list of hypotheses' structures.
    
    if (length(hypotheses$hstructure) == 0) {
        #hypotheses$hstructure = new.env(hash = TRUE, parent = emptyenv());
        hypotheses$hstructure = list()
    }
    hypotheses$hstructure[pattern.label] = list(get.lifted.pattern(hstructure))

    ## Add the atoms of the hypothesis.
    
    if (length(hypotheses$patterns) == 0) {
        hypotheses$patterns = list();
    }
    hypotheses$patterns[pattern.label] = lifted.pattern$hypotheses$llist;

    ## Add the hypotheses of the atoms.
    
    if (length(hypotheses$atoms) == 0) {
        hypotheses$atoms =
            vector(mode = "list",
                   length = (ncol(genotypes) - hypotheses$num.hypotheses));
        names(hypotheses$atoms) =
            colnames(genotypes)[1:(ncol(genotypes) - hypotheses$num.hypotheses)];
    }
    atoms.in.pattern =
        which(names(hypotheses$atoms) %in% unlist(hypotheses$patterns[pattern.label]));
    if (length(atoms.in.pattern) > 0) {
        for (i in 1:length(atoms.in.pattern)) {
            hypotheses$atoms[[atoms.in.pattern[i]]] =
                append(hypotheses$atoms[[atoms.in.pattern[i]]], pattern.label);
        }
    }

    ## Add the fisher pvalues.
    
    if (length(hypotheses$pvalues) == 0) {
        hypotheses$pvalues = vector();
    }
    hypotheses$pvalues = append(hypotheses$pvalues, list(curr_pvalues))
    names(hypotheses$pvalues)[length(hypotheses$pvalues)] = pattern.label;

    ## Return the new (compliant) data structure as result.
    
    data$genotypes = genotypes;
    data$annotations = annotations;
    data$hypotheses = hypotheses;

    return(data);
}


# resolve the ellipsis for the effects

hypothesis.lifted.effects <- function( ... ) {
    return(list(...));
}


#' Add all the hypotheses related to a group of events
#' @title hypothesis add group
#' @param x A TRONCO compliant dataset.
#' @param FUN Type of pattern to be added, e.g., co-occurance, soft or hard exclusivity.
#' @param group Group of events to be considered. 
#' @param pattern.cause Possibile causes for the pattern. 
#' @param pattern.effect Possibile effects for the pattern. 
#' @param dim.min Minimum cardinality of the subgroups to be considered.
#' @param dim.max Maximum cardinality of the subgroups to be considered.
#' @param min.prob Minimum probability associated to each valid group.
#' @param silent A parameter to disable/enable verbose messages.
#' @return A TRONCO compliant object with the added hypotheses
#' @export hypothesis.add.group
#' @importFrom utils flush.console combn
#' 
hypothesis.add.group <- function(x, 
                                 FUN, 
                                 group, 
                                 pattern.cause = '*',
                                 pattern.effect = '*',
                                 dim.min = 2,
                                 dim.max = length(group),
                                 min.prob = 0,
                                 silent = FALSE) {

    is.compliant(x)
    # check if there is a reconstructed model
    if (has.model(x)) {
        stop('This dataset has a reconstructed model and no more hypothesis can be added.')
    }

    op = deparse(substitute(FUN))

    effect = paste0("c('", paste(pattern.effect, collapse = "', '"), "')")
    cause = paste0("c('", paste(pattern.cause, collapse = "', '"), "')")

    ngroup = length(group)
    if (ngroup < 2) {
        warning("No hypothesis will be created for groups with less than 2 elements.")
        return(x)
    }

    if (!silent) {
        cat("*** Adding Group Hypotheses\n")
        cat(' Group:', paste(group, collapse = ", ", sep = ""), '\n')
        cat(' Function:', op, '\n')
        cat(' Cause:', paste(pattern.cause, collapse=", "), '; ')
        cat(' Effect:', paste(pattern.effect, collapse=", "), '.\n')
    }

    if (min.prob > 0) {
        if (!silent)  {
            cat('\nFiltering genes within the group with alteration frequency below',
                min.prob,
                '\n')
        }

        temp = events.selection(x, filter.in.names = group, silent = silent)
        temp = as.alterations(temp, silent = silent)
        temp = events.selection(temp, filter.freq = min.prob, silent = silent)

        group = as.genes(temp)
        if (!silent) {
            cat('New group:', paste(group, collapse = ", ", sep = ""), '\n')
        }
    }

    ngroup = length(group)
    if (ngroup < 2) {
        warning("No hypothesis will be created for groups with less than 2 elements.")
        return(x)
    }

    hom.group =
        lapply(group,
               function(g, x) {
                   ## Isn't (nevents(x, genes = g) > 1) enough?
                   if (nevents(x, genes = g) > 1) {
                       TRUE
                   } else { 
                       FALSE
                   }
               },
               x)
    hom.group = group[unlist(hom.group)]

    gene.hom <- function(g, h) {
        if (g %in% h) {
            if (any(rowSums(as.gene(x, genes = g)) > 1) ) return(paste0("OR('", g, "')"))
            else return(paste0("XOR('", g, "')"))
        }
        return(paste0("'", g, "'"))
    }

    max.groupsize = min(dim.max, ngroup)
    min.groupsize = max(2, dim.min)

    if (dim.min > dim.max) {
        stop('ERROR - dim.min > dim.max')
    }
    
    if (min.groupsize > max.groupsize) {
        stop('ERROR - min.groupsize > max.groupsize')
    }

    if (length(hom.group) > 0 && !silent) { 
        cat("Genes with multiple events: ",
            paste(unlist(hom.group),
                  collapse=', ',
                  sep = ''),
            "\n")
    }

    error.summary = data.frame()

    ## Get an analytical pattern... !
    
    tot.patterns = 0
    for (i in min.groupsize:max.groupsize) {
        tot.patterns = tot.patterns + ncol(combn(unlist(group), i))
    }

    ## Create a progress bar.
    if (!silent) {
        cat('Generating ',
            tot.patterns,
            'patterns [size: min =',
            max.groupsize,
            ' -  max =',
            max.groupsize,
            '].\n')
    }

    ## pb <- txtProgressBar(0, tot.patterns, style = 3)
    
    pbPos = 0
    for (i in min.groupsize:max.groupsize) {
        gr = combn(unlist(group), i)

        for (j in 1:ncol(gr)) {
            genes = as.list(gr[, j])

            ## Start the progress bar.
            pbPos = pbPos + 1

            hypo.name = paste(unlist(genes), sep = "_", collapse = "_")
            hypo.genes =
                paste(lapply(genes,
                             function(g, hom.group) {
                                 gene.hom(g, hom.group)
                             },
                             hom.group),
                      collapse = ", ")

            hypo.add = paste0("hypothesis.add(x, ", 
                "pattern.label = '", op, "_", hypo.name, "', ",
                "lifted.pattern = ", op, "(", hypo.genes, "), ",
                "pattern.effect = ", effect, ", ",
                "pattern.cause = ", cause, ")")

            err =
                tryCatch({
                    x = eval(parse(text = hypo.add))
                }, error = function(cond) {

                    m = paste("Error on", hypo.add, ".\n", cond)
                    code = strsplit(as.character(cond), " ")[[1]]
                    idx.errcode = which(code == "[ERR]", arr.ind = TRUE) + 1

                    return(
                        data.frame(
                            pattern = paste(unlist(genes), collapse = ", ", sep = ""), 
                            error = paste(code[idx.errcode:length(code)], collapse = " ")
                        ))

                }, warning = function(cond) {
                    m = paste("Warning on", hypo.add, ".\n", cond)
                    return(genes)
                })
            ## Dummy errors detection.
            
            if (!("genotypes" %in% names(err))) 
                error.summary = rbind(error.summary, err)
        }
    }


    if (nrow(error.summary) > 0) {
        cat(paste(nrow(error.summary),
                  " genes pattern could not be added -- showing errors\n",
                  sep = ""))
        print(error.summary)
    } else if (!silent) {
        cat("Hypothesis created for all possible patterns.\n")
    }
    return(x)
}


#' Add all the hypotheses related to homologou events
#' @title hypothesis.add.homologous
#' @param x A TRONCO compliant dataset.
#' @param pattern.cause Possibile causes for the pattern. 
#' @param pattern.effect Possibile effects for the pattern. 
#' @param genes List of genes to be considered as possible homologous. For these genes, all the types of mutations will be considered functionally equivalent.
#' @param silent A parameter to disable/enable verbose messages.
#' @return A TRONCO compliant object with the added hypotheses
#' @export hypothesis.add.homologous
#' 
hypothesis.add.homologous <- function(x, 
                                      pattern.cause = '*',
                                      pattern.effect = '*',
                                      genes = as.genes(x),
                                      silent = FALSE) {

    is.compliant(x)
    # check if there is a reconstructed model
    if(has.model(x)) {
        stop('This dataset has a reconstructed model and no more hypothesis can be added.')
    }

    hom.group =
        lapply(genes,
               function(g, x) {
                   if (nevents(x, genes = g) > 1) {
                       TRUE
                   } else {
                    FALSE
                   }
               },
               x)

    hom.group = genes[unlist(hom.group)]

    if (length(hom.group) == 0) {
        warning("No genes with multiple events.")
        return(x)
    }

    if (!silent) {
        cat("*** Adding hypotheses for Homologous Patterns\n")
        cat(' Genes:', paste(hom.group, collapse = ", ", sep = ""), '\n')
        cat(' Cause:', paste(pattern.cause, collapse=", "), '\n')
        cat(' Effect:', paste(pattern.effect, collapse=", "), '\n')
    }

    effect = paste0("c('", paste(pattern.effect, collapse = "', '"), "')")
    cause = paste0("c('", paste(pattern.cause, collapse = "', '"), "')")

    if (length(hom.group) == 0) {
        warning("No genes with multiple events.")
        return(x)
    }

    ## Create a progress bar.
    
    #pb <- txtProgressBar(0, length(hom.group), style = 3)

    error.summary = data.frame()

    for (i in 1:length(hom.group)) {

        ## Start the progress bar.
        
        #setTxtProgressBar(pb, i)

        if (!silent) {
            cat('.')
        }
                
        ## Check if the joint probability of homologous events is > 0,
        ## if yes the event will be added as 'OR', otherwise 'XOR'.
        
        if ( any(rowSums(as.gene(x, genes = hom.group[[i]])) > 1)) {
            FUN = 'OR'
        } else {
            FUN = 'XOR'       
        }

        hypo.add =
            paste0("hypothesis.add(x, ",
                   "pattern.label = '", FUN, "_", hom.group[[i]], "', ",
                   "lifted.pattern = ", FUN, "('", hom.group[[i]], "'), ",
                   "pattern.cause = ",  cause, ", ",
                   "pattern.effect =", effect, ")")

        err =
            tryCatch({
                x = eval(parse(text = hypo.add))
            }, error = function(cond) {
                m = paste("Error on", hypo.add, ".\n", cond)
                code = strsplit(as.character(cond), " ")[[1]]
                idx.errcode = which(code == "[ERR]", arr.ind = TRUE) + 1

                return(data.frame(pattern =
                                      paste(unlist(hom.group[[i]]),
                                            collapse = ", ",
                                            sep = ""),
                                  error =
                                      paste(code[idx.errcode:length(code)],
                                            collapse = " ")))

            }, warning = function(cond) {
                m = paste("Warning on", hypo.add, ".\n", cond)
                return(genes)
            })
        
        ## Dummy errors detection.

        if (!("genotypes" %in% names(err))) 
            error.summary = rbind(error.summary, err)
    }

    ## Close progress bar.
    #close(pb)
    
    if (nrow(error.summary) > 0) {
        cat(paste(nrow(error.summary),
                  " patterns could not be added -- showing errors\n",
                  sep = ""))
        print(error.summary)
    } else if (!silent) {
        cat("Hypothesis created for all possible gene patterns.\n")
    }

    return(x)
}

# Internal function for hypotheses expansion
# @title hypotheses.expansion
# @param input_matrix A TRONCO adjacency matrix
# @param map hypothesis name - hypothesis adjacency matrix map
# @param expand Should I expand the hypotheses?
# @param skip.disconnected Hide disconnected node
#
hypotheses.expansion <- function(input_matrix, 
                                 map = list(),
                                 expand = TRUE,
                                 skip.disconnected = TRUE
                                 ) {
    
    ## Get node list.
    
    node_list = colnames(input_matrix)

    ## Cut input matrix.
    
    num_hypos = 0
    if (length(map) > 0) {
        num_hypos =
            Reduce(sum,
                   lapply(ls(map),
                          function(x, y) {
                              if (x %in% y)
                                  return(1)
                          },
                          y = node_list))
    }

    margin = length(node_list) - num_hypos
    hypos_new_name = list()

    ## Check if there are hypotheses.
    
    if (num_hypos == 0 || !expand) {
        
        ## If no hypos do nothing...
        
        min_matrix = input_matrix
    } else {
        ## ..else expand them.
        
        min_matrix =
            input_matrix[-(margin + 1):-length(node_list),
                         -(margin + 1):-length(node_list)]

        ## Create graph from matrix.
        
        min_graph = graph.adjacency(min_matrix)

        for (h in ls(map)) {

            if (! h %in% node_list) {
                next
            }

            hypo = map[[h]]

            ## Create graph from hypo.
            
            hypo_graph = graph.adjacency(hypo)

            ## Name of this node.
            
            h_mat <- rowSums(get.adjacency(hypo_graph, sparse=FALSE))

            initial_node <- names(h_mat)[which(h_mat == 0)]
            hypos_new_name[initial_node] = h

            ## Change names in confidence matrix according to
            ## hypotesis.
            
            display.up = FALSE
            if (length(which(input_matrix[, h] == 1)) != 0) {
                display.up = TRUE
            }

            display.down = FALSE
            if (length(which(input_matrix[h, ] == 1)) != 0) {
                display.down = TRUE
            }

            ## Display up hypo and reconnect.
            
            if (display.up) {

                hypo_pre = t(hypo)

                node_names = rownames(hypo_pre)
                node_names =
                    lapply(node_names,
                           function(x) {
                               if (is.logic.node(x)) {
                                   paste0('UP', x)
                               } else {
                                   return(x)
                               }
                           })

                rownames(hypo_pre) = node_names
                colnames(hypo_pre) = node_names

                ## Create graph from hypo.
                
                hypo_graph_pre = graph.adjacency(hypo_pre)

                ## Name of this node.
                
                h_mat_pre = colSums(get.adjacency(hypo_graph_pre, sparse = FALSE))

                final_node = names(h_mat_pre)[which(h_mat == 0)]
                hypos_new_name[final_node] = h

                ## Edge to reconstruct.
                
                h_edge = input_matrix[, h]
                initial_node_up = names(h_edge)[which(h_edge == 1)]

                ### Expand any hipothesis within final nodes.
                # 
                # if (length(map) > 0) {
                #   initial_node_up_hyp = which(initial_node_up%in%names(map))
                #   if (length(initial_node_up_hyp) > 0) {
                #     final_node_hyp_expanded = colnames(map[[initial_node_up[initial_node_up_hyp]]])[1:(length(colnames(map[[initial_node_up[initial_node_up_hyp]]]))-1)]
                #     initial_node_up = initial_node_up[-initial_node_up_hyp]
                #     initial_node_up = c(initial_node_up, final_node_hyp_expanded)
                #   }
                # }
                
                ## Add this graph to main graph.
                
                min_graph = graph.union(min_graph, hypo_graph_pre)

                ## Recreate lost edge.
                
                for (node in initial_node_up) {
                    min_graph = min_graph + edge(node, final_node)
                }
            }

            ## Display down hypo.
            
            if (display.down) {
                
                ## Edge to reconstruct.
                
                h_edge <- input_matrix[h,]
                final_node <- names(h_edge)[which(h_edge == 1)]
                
                ### Expand any hipothesis within final nodes.
                # 
                # if (length(map) > 0) {
                #   final_node_hyp = which(final_node%in%names(map))
                #   if (length(final_node_hyp) > 0) {
                #     final_node_hyp_expanded = colnames(map[[final_node[final_node_hyp]]])[1:(length(colnames(map[[final_node[final_node_hyp]]]))-1)]
                #     final_node = final_node[-final_node_hyp]
                #     final_node = c(final_node, final_node_hyp_expanded)
                #   }
                # }
                
                ## Add this graph to main graph.
                
                min_graph = graph.union(min_graph, hypo_graph)

                ## Recreate lost edge.
                
                for (node in final_node) {
                    min_graph = min_graph + edge(initial_node, node)
                }
                
            }
        }
        min_matrix = get.adjacency(min_graph, sparse = FALSE)
    }
    ## Sort col and row (igraph wants the same order).
    
    min_matrix = min_matrix[,order(colnames(min_matrix))]
    min_matrix = min_matrix[order(rownames(min_matrix)),]
    return(list(min_matrix, hypos_new_name))
}

# Utility function to add the hypotheses
aux.log <- function( genotypes, annotations, function.name, ... ) {

    hypotheses.env = get('hypotheses.env', asNamespace(getPackageName()))

    if (!is.null(genotypes)
        && !is.null(annotations)
        && length(list(...)) > 0) {

        clauses = list(...)
        curr_genotypes = array(0, c(nrow(genotypes), length(clauses)))
        hypotheses = list()
        function.inputs = list()
        fisher.tests = vector()

        for (i in 1:length(clauses)) {

            ## If the clause is given by name, get the column from the
            ## genotypes.
            
            if (typeof(clauses[[i]]) == "character") {

                col.num = -1
                ## If I have the label, get the column in the
                ## genotypes for this event.
                
                if (length(clauses[[i]]) == 1) {
                    event.map = emap(c(clauses[[i]],"*"), genotypes, annotations)
                    col.num = event.map$col.num
                }
                else if (length(clauses[[i]]) == 2) {
                    event.map = emap(clauses[[i]], genotypes, annotations)
                    col.num = event.map$col.num
                }

                if (col.num[1] == -1) {
                    stop(paste("[ERR] No events for gene ",
                               paste(clauses[[i]], collapse = ', ', sep = '')))
                }
                else {
                    curr_genotypes[, i] = genotypes[, col.num[1]]
                    if (length(col.num) > 1) {
                        curr_genotypes =
                            cbind(curr_genotypes,
                                  genotypes[ , col.num[2:length(col.num)]])
                    }

                    if (length(hypotheses$llist) == 0) {
                        hypotheses$llist = list(event.map$events.name)
                    }
                    else {
                        hypotheses$llist = list(c(unlist(hypotheses$llist), event.map$events.name))
                    }
                    
                    for (j in 1:length(event.map$events.name)) {
                        function.name =
                            paste(function.name, "_",
                                  event.map$events.name[j],
                                  sep = "")
                        function.inputs = c(function.inputs, event.map$events.name[j])
                    }
                }
            } else {
                ## Otherwise I already have the column as a vector.
                
                curr_genotypes[,i] = clauses[[i]]$pattern
                
                ## if it is a list
                
                if (length(hypotheses$llist) == 0) {
                    hypotheses$llist = clauses[[i]]$hypotheses$llist
                } else {
                    hypotheses$llist = list(c(unlist(clauses[[i]]$hypotheses$llist),unlist(hypotheses$llist)))
                }
                function.name = paste(function.name, "_", clauses[[i]]$function.name, sep="")
                function.inputs = c(function.inputs, clauses[[i]]$function.name)
            }
        }

        result =
            list(curr_genotypes = curr_genotypes,
                 hypotheses = hypotheses,
                 function.name = function.name,
                 function.inputs = function.inputs,
                 tests = pairwise.fisher.test(curr_genotypes))

        ## Save the new edges
        
        for (k in 1:length(result$function.inputs)) {
            lifting.edges.temp =
                rbind(get('lifting.edges', envir = hypotheses.env),
                      c(result$function.inputs[[k]], result$function.name))
            assign("lifting.edges", lifting.edges.temp, envir = hypotheses.env)
        }
        return(result)
    } else {
        stop("[ERR] Either the genotypes or the pattern not provided! No hypothesis will be created.")
    }
    return(NA)
}


#' AND hypothesis
#' @title AND
#' @param ... Atoms of the co-occurance pattern given either as labels or as partielly lifted vectors.
#' @return Vector to be added to the lifted genotype resolving the co-occurance pattern
#' @export AND
#
AND <- function( ... ) {
    
    ## Look for the variables named lifting.genotypes and
    ## lifting.annotations.

    hypotheses.env = get('hypotheses.env', asNamespace(getPackageName()))
    
    genotypes = get('lifting.genotypes', envir = hypotheses.env)
    annotations = get('lifting.annotations', envir = hypotheses.env)
    fisher.pvalues.temp = get('fisher.pvalues', envir = hypotheses.env)
    if (!is.null(genotypes)
        && !is.null(annotations)
        && length(list(...)) > 0) {
        ## Get the vector of the clauses of the pattern from the
        ## genotypes.
        
        result = aux.log(genotypes,annotations, "AND" ,...)
        curr_genotypes = result$curr_genotypes
        hypotheses = result$hypotheses
        function.name = result$function.name
        function.inputs = result$function.inputs

        ## Save the fisher exact tests.
        if (length(result$tests) > 0) {
            for (i in 1:length(result$tests)) {
                curr.test = result$tests[[i]]
                odds.ratio = curr.test[2]

                if (curr.test[2] <= 0) {
                    curr.pvalue = 1
                } else {
                    curr.pvalue = curr.test[1]
                }

                fisher.pvalues.temp = append(fisher.pvalues.temp, curr.pvalue)
            }
            assign("fisher.pvalues", fisher.pvalues.temp, envir = hypotheses.env)
        }

        ## Evaluate the AND operator.
        
        pattern = rep(0, nrow(genotypes))
        for (i in 1:nrow(genotypes)) {
            pattern[i] = sum(curr_genotypes[i, ])
            if (pattern[i]<ncol(curr_genotypes)) {
                pattern[i] = 0
            } else {
                pattern[i] = 1
            }
        }
        pattern = as.integer(pattern)
        result =
            list(pattern = pattern,
                 hypotheses = hypotheses,
                 function.name = function.name,
                 function.inputs = function.inputs,
                 fisher.pvalues = fisher.pvalues.temp)
        return(result)
    } else {
        stop("[ERR] Either the genotypes or the pattern not provided! No hypothesis will be created.");
    }
    return(NA)
}


#' OR hypothesis
#' @title OR
#' @param ... Atoms of the soft exclusive pattern given either as labels or as partielly lifted vectors.
#' @return Vector to be added to the lifted genotype resolving the soft exclusive pattern
#' @export OR
#
OR <- function( ... ) {
    ## Look for the variables named lifting.genotypes and
    ## lifting.annotations.

    hypotheses.env = get('hypotheses.env', asNamespace(getPackageName()))
    
    genotypes = get('lifting.genotypes', envir = hypotheses.env)
    annotations = get('lifting.annotations', envir = hypotheses.env)
    fisher.pvalues.temp = get('fisher.pvalues', envir = hypotheses.env)
    if (!is.null(genotypes)
        && !is.null(annotations)
        && length(list(...)) > 0) {
        
        ## Get the vector of the clauses of the pattern from the
        ## genotypes.
        
        result = aux.log(genotypes,annotations, "OR", ...)
        curr_genotypes = result$curr_genotypes
        hypotheses = result$hypotheses
        function.name = result$function.name
        function.inputs = result$function.inputs

        ## Save the fisher exact tests.
        
        if (length(result$tests) > 0) {
            for (i in 1:length(result$tests)) {
                curr.test = result$tests[[i]]
                curr.p.value = 1 - curr.test[1]

                fisher.pvalues.temp = append(fisher.pvalues.temp, curr.p.value)
            }
            assign("fisher.pvalues", fisher.pvalues.temp, envir = hypotheses.env)
        }

        ## Evaluate the OR operator.
        
        pattern = rep(0, nrow(genotypes))
        for (i in 1:nrow(genotypes)) {
            pattern[i] = sum(curr_genotypes[i, ])
            if (pattern[i] > 1) {
                pattern[i] = 1
            }
        }
        pattern = as.integer(pattern)
        result =
            list(pattern = pattern,
                 hypotheses = hypotheses,
                 function.name = function.name,
                 function.inputs = function.inputs,
                 fisher.pvalues = fisher.pvalues.temp)
        return(result)
    } else {
        stop("[ERR] Either the genotypes or the pattern not provided! No hypothesis will be created.");
    }
    return(NA)
}

#' XOR hypothesis
#' @title XOR
#' @param ... Atoms of the hard exclusive pattern given either as labels or as partielly lifted vectors.
#' @return Vector to be added to the lifted genotype resolving the hard exclusive pattern
#' @export XOR
#' 
XOR <- function( ... ) {
    ## Look for the variables named lifting.genotypes and
    ## lifting.annotations.
    
    hypotheses.env = get('hypotheses.env', asNamespace(getPackageName()))

    genotypes = get('lifting.genotypes', envir = hypotheses.env)
    annotations = get('lifting.annotations', envir = hypotheses.env)
    fisher.pvalues.temp = get('fisher.pvalues', envir = hypotheses.env)
    if (!is.null(genotypes)
        && !is.null(annotations)
        && length(list(...)) > 0) {
        
        ## Get the vector of the clauses of the pattern from the
        ## genotypes.
        
        result = aux.log(genotypes, annotations, "XOR", ...)
        curr_genotypes = result$curr_genotypes
        hypotheses = result$hypotheses
        function.name = result$function.name
        function.inputs = result$function.inputs

        ## Save the fisher exact tests.
        
        if (length(result$tests)>0) {
            for (i in 1:length(result$tests)) {
                curr.test = result$tests[[i]]
                odds.ratio = curr.test[2]

                if (curr.test[2] >= 0) {
                    curr.pvalue = 1
                }
                else {
                    curr.pvalue = curr.test[1]
                }
                fisher.pvalues.temp = append(fisher.pvalues.temp, curr.pvalue)
            }
            assign("fisher.pvalues", fisher.pvalues.temp, envir = hypotheses.env)
        }

        ## Evaluate the XOR operator.
        
        pattern = rep(0,nrow(genotypes))
        for (i in 1:nrow(genotypes)) {
            pattern[i] = sum(curr_genotypes[i,])
            if (pattern[i]>1) {
                pattern[i] = 0
            }
        }
        pattern = as.integer(pattern)
        result =
            list(pattern = pattern,
                 hypotheses = hypotheses,
                 function.name = function.name,
                 function.inputs = function.inputs,
                 fisher.pvalues = fisher.pvalues.temp)
        return(result)
    } else {
        stop("[ERR] Either the genotypes or the pattern not provided! No hypothesis will be created.");
    }
    return(NA);
}


### Return the position in the genotypes of the event referring to the
### given label.event.

emap <- function(label.event, genotypes, annotations) {
    col.num = -1;
    events.name = "";
    if (!is.null(genotypes) && !is.null(annotations)) {
        if (label.event[2] != "*") {
            curr.events = which(annotations[, "event"] == label.event[1] &
                                    annotations[, "type"] == label.event[2]);
        } else {
            curr.events = which(annotations[, "event"] == label.event[1]);
        }
        if (length(curr.events) > 0) {
            events.name = names(curr.events);
            col.num = which(colnames(genotypes) %in% events.name);
        }
    } else {
        stop("[ERR] A genotypes must be available in order to define any hypothesis!",call.=FALSE);
    }
    results = list(col.num = col.num, events.name = events.name);
    return(results);
}


### Return the adjacency matrix of the pattern given the list of edges
### involving it.

get.lifted.pattern <- function(lifted.edges) {
    ## Structure to save the adjacency matrix.
    
    lifted.adj.matrix =
        array(0,
              c(length(unique(c(lifted.edges[,1],lifted.edges[,2]))),
                length(unique(c(lifted.edges[,1],lifted.edges[,2])))));
    rownames(lifted.adj.matrix) = unique(c(lifted.edges[,1],lifted.edges[,2]));
    colnames(lifted.adj.matrix) = rownames(lifted.adj.matrix);
    
    ## Build the matrix given the lifted.edges.
    
    for (i in 1:nrow(lifted.edges)) {
        lifted.adj.matrix[lifted.edges[i, 1], lifted.edges[i, 2]] = 1;
    }
    return(lifted.adj.matrix);
}


### given the hypotheses and the adj.matrix, return the updated
### adj.matrix

hypothesis.adj.matrix <- function(hypotheses, adj.matrix) {

    if (!is.na(hypotheses[1])) {

        ## Set the invalid entries in the adj.matrix hypotheses can
        ## not be causing other hypotheses.
        
        adj.matrix[(ncol(adj.matrix) - hypotheses$num.hypotheses + 1):ncol(adj.matrix),
                   (ncol(adj.matrix) - hypotheses$num.hypotheses + 1):ncol(adj.matrix)] = 0;

        ## Consider the given hypotheses only against the specified possible effects.

        adj.matrix[(ncol(adj.matrix) - hypotheses$num.hypotheses + 1):ncol(adj.matrix),
                   1:(ncol(adj.matrix) - hypotheses$num.hypotheses)] = 0
        adj.matrix[1:(ncol(adj.matrix) - hypotheses$num.hypotheses),
                   (ncol(adj.matrix) - hypotheses$num.hypotheses + 1):ncol(adj.matrix)] = 0

        ## Set the elements from the hlist.
        
        for (i in 1:nrow(hypotheses$hlist)) {
            cause = which(colnames(adj.matrix) %in% hypotheses$hlist[i, "cause"]);
            effect = which(colnames(adj.matrix) %in% hypotheses$hlist[i, "effect"]);
            if (length(cause)>0 && length(effect)>0) {
                adj.matrix[cause,effect] = 1;
            }
        }
    }
    return(adj.matrix);
}


### performs pairwise exact fisher test.
pairwise.fisher.test <- function(data) {

    ## Structure to save the results.
    
    results = vector();

    if (ncol(data) > 1) {
        for (i in 1:ncol(data)) {
            for (j in i:ncol(data)) {
                if (i != j) {

                    df = data[ , c(i, j)]
                    df_x = data[ , 1]
                    df_y = data[ , 2]

                    ## Lifting xor.
                    df_xor = df_x + df_y
                    df_xor[df_xor > 1] = 0

                    ## Lifting or.
                    df_or = df_x + df_y
                    df_or[df_or > 1] = 1

                    ## Lifting and.
                    df_and = df_x + df_y
                    df_and[ df_and < 2] = 0
                    df_and[ df_and == 2] = 1

                    ## 2x2 contingency table.
                    table.xor =
                        rbind(c(nrow(df) - sum(df_or), sum(df_or - df_y)),
                              c(sum(df_or - df_x), sum(df_and))
                              )

                    ## Fisher 2-sided.
                    test = fisher.test(table.xor)

                    ## Save the results.
                    curr_result = c(test$p.value, log(test$estimate['odds ratio']))
                    results = append(results, list(curr_result))
                }
            }
        }
    }

    return(results)
}


#### end of file -- capri.hypotheses.R
BIMIB-DISCo/TRONCO documentation built on March 31, 2024, 9:20 p.m.