R/Utility.R

Defines functions Shared_traits Plot_Trait_Expression Plot_Pathway_modules getMetricDistModule getMetricDist Get_module_categories Draw_Expression Association_Rules Plot_Background_Modules Plot_Metric_Comparison Plot_Background_Individual_Genes Export_EdgeList Create_Filtered_SBS_Matrix Plot_Trait_Attribute_Expression Add_Custom_Trait calcmfrow Traitattributes_To_Sbsmatrix

Documented in Add_Custom_Trait Association_Rules Create_Filtered_SBS_Matrix Draw_Expression getMetricDist getMetricDistModule Get_module_categories Plot_Background_Individual_Genes Plot_Background_Modules Plot_Metric_Comparison Plot_Trait_Attribute_Expression Traitattributes_To_Sbsmatrix

#' Plot Trait Attribute
#' @name Plot Trait Attribute
#' @description Convert a list with trait attributes to a matrix where each
#' collumn is a trait attribute and each row a genome. Presence or absence is
#' indicated with either a zero (0) or one (1).
#' @param trait.attributes The full list of all trait attributes
#' @param genomes A vector containing the names of all genomes present in the
#' sample.
#' @export
#' @author JJM van Steenbrugge
Traitattributes_To_Sbsmatrix <- function(trait.attributes, genomes) {
  sbs.matrix <- matrix(
    nrow = length(genomes),
    ncol = 0
  )

  traits.names <- names(trait.attributes)
  cols <- c()

  # Traits
  for (trait.name in traits.names) {
    trait <- trait.attributes[[trait.name]]
    if (length(trait) == 0) {
      print('skip')
      next()
    }
    # Attributes
    for (i in 1:length(trait)) {
      attribute.genomes <- trait[[i]]$genomes
      occurences <- genomes %in% attribute.genomes
      sbs.matrix <- cbind(sbs.matrix, occurences)
      cols <- c(cols, paste(trait.name, ".", i, sep = ""))
      # include names
    }
  }
  colnames(sbs.matrix) <- cols
  sbs.matrix <- as.data.frame(sbs.matrix)
  cols <- sapply(sbs.matrix, is.logical)
  sbs.matrix[, cols] <- lapply(sbs.matrix[, cols], as.numeric)

  rownames(sbs.matrix) <- genomes
  return(as.matrix(sbs.matrix))
}

calcmfrow <- function(x) {
  temp <- sqrt(x)

  if ((temp %% floor(temp)) == 0) {
    return(c(temp, temp))
  } else if ((temp %% floor(temp)) < 0.5) {
    return(c(floor(temp), ceiling(temp)))
  } else {
    return(c(ceiling(temp), ceiling(temp)))
  }
}


#' Add Custom Trait
#' @name Add Custom Trait
#' @description Add a custom trait to an existing trait library
#' @param RNAseq.data
#' @export
#' @author JJM van Steenbrugge
Add_Custom_Trait <- function(RNAseq.data, Custom.traits, pairwise.distances, distance.metrics, bkgd.traits){
  nbins <- length(RNAseq.data$features$bins)
  for(custom.trait in names(Custom.traits)){
    KO_terms <- Custom.traits[[custom.trait]]

    for(KO in KO_terms){
      distances <- Pairwise_Distance(KO, nbins, RNAseq.data, distance.metrics)
      rownames(distances) <- RNAseq.data$features$bins
      colnames(distances) <- RNAseq.data$features$bins
      pairwise.distances[[KO]] <- distances
    }
  }

  Custom_annotation_db <- list('module.dict' = Custom.traits)
  Custom_TAs <- Identify_Trait_Attributes(RNAseq.data = RNAseq.data, pairwise.distances =  pairwise.distances, annotation.db = Custom_annotation_db,3)

  trait_PA <- Get_trait_presence_absence(RNAseq.data, module.dict = Custom.traits)

  trait.var <- Prune_Trait_Attributes(
    trait.attributes = Custom_TAs,
    annotation.db = Custom_annotation_db,
    bkgd.traits = bkgd.traits,
    RNAseq.data = RNAseq.data,
    pairwise.distances = pairwise.distances,
    bkgd.individual.Zscores,
    trait_presence_absence = trait_PA,
    filter_complete = T
  )


  return(trait.var)
}


#' Plot Trait Attribute
#' @name Plot Trait Attribute
#' @description Plot the expression profile for all genomes expressing that
#' attribute. For each annotation a separate plot is created where the thick
#' black line represents the mean expression of all genomes with that annotation.
#' The dashed blue and red lines represent the mean + 0.5*sd and the mean -
#' 0.5 * sd, respectively.
#' @param trait.attribute The name of the trait attribute (as character string)
#' @param trait.attributes The full list of all trait attributes
#' @param RNAseq.data Collection of multple components, include RNA seq data,
#' annotations, etc. See \code{\link{Pre_process_input}} for the full list.
#' @example Plot_Trait_Attribute('M00027.2', trait.attributes.pruned, RNAseq.data)
#' @export
#' @author JJM van Steenbrugge
Plot_Trait_Attribute_Expression <- function(trait.attribute = "M00793_1",
                                            trait.attributes.pruned,
                                            RNAseq.data) {
  t.data <- trait.attributes.pruned[[trait.attribute]]
  annotations <- RNAseq.data$features$annotation.db$module.dict[[trait.attribute]]
  n.att <- length(t.data)
  n.annotations <- length(annotations)





  plots <- list()

  for(i in 1:length(t.data)){

  plot.df <- matrix(ncol = 4, nrow = 0)
  trait.genomes <- t.data[[i]]$genomes

  for(genome in trait.genomes){

    for(annotation in annotations){
      expr_values <- RNAseq.data$table[which(RNAseq.data$table$Annotation == annotation),]
      expr_values <- expr_values[ which(expr_values$Bin == genome), RNAseq.data$features$sample.columns]


      if(nrow(expr_values) >= 2) expr_values <- expr_values[1,]
      else if(nrow(expr_values) == 0){
        print(paste(genome, "lacks", annotation))
        next
      }


      if(! NA %in% expr_values){
        expr_values <- log2(expr_values)
        expr_values <- expr_values-min(expr_values)
      }


      for (timepoint in 1:length(expr_values)){

          tp_expr <- expr_values[timepoint] %>% as.numeric
          row <- c(genome,  annotation, paste0("TP",timepoint) , tp_expr)
          print(row)
          plot.df <- rbind(plot.df, row )

      }

    }

  }


  plot.df <- data.frame(plot.df, stringsAsFactors = FALSE)
  colnames(plot.df) <- c("Genome", "KID", "TimePoint", "Expr")

  plot.df$Expr %<>% as.numeric
  plots[[i]] <- ggplot(plot.df) + geom_line(aes(x = TimePoint, y = Expr, group = Genome, color = Genome)) + facet_wrap(~KID, nrow = 1)



  }
  gridExtra::grid.arrange(grobs = plots, nrow = 2)


}

#Plot_Trait_Attribute_Expression("Phosphate transporter", trait.attributes.pruned = ret$trait.attributes.pruned, ret$RNAseq.data)

#' Create_Filtered_SBS_Matrix
#' @name Create_Filtered_SBS_Matrix
#' @description Filters pruned trait attributes for attributes with a specific name. Then returns a sbs matrix of those
#' traits.
Create_Filtered_SBS_Matrix <- function(trait.attributes.pruned,
                                       RNAseq.data,
                                       traits=c("M00171", "M00172")){
  filtered_attributes <- list()

  for (trait in traits)
  {
    if(startsWith(trait, 'M0')) # not implemented
    {
      next
    } else {
      filtered_attributes[[trait]] <- trait.attributes.pruned[[trait]]
    }
  }

  sbs_matrix <- Traitattributes_To_Sbsmatrix(filtered_attributes,
                                             RNAseq.data$features$bins)
  return(sbs_matrix)
}

#big_modules <- c("M00001", "M00048", "M00009", "M00089", "M00173", "M00003" )
#small_modules <- which( table(substr(colnames(sbs.trait.attributes), 1, 6)) < 5) %>% names

#df <- sbs.trait.attributes[, which(substr(colnames(sbs.trait.attributes), 1,6 ) %in% small_modules)]
#df <- sbs.trait.attributes[,which(substr(colnames(sbs.trait.attributes), 1, 6)=="M00009")]


#large_genomes <- which(rowSums(sbs.trait.attributes) >= 2000)
#df <- df[large_genomes,]

#edge_list.filtered <- edge_list[which(as.numeric(edge_list[, 3]) >= 200),]
#write.csv(edge_list, file = "~/Desktop/edge_list_small.csv", row.names = F, quote = F)





#' Export_EdgeList
#' @name Export_EdgeList
#' @description Creates a network file you can export to cytoscape
#' @param df A (subset) of the sbs.trait.attributes variable
#' @param network_type Use 'GA'
#' @export
Export_EdgeList <- function(df, network_type = 'GA'){


  if (network_type == 'GA'){
    edges <- which(df !=0, arr.ind = T)
    edge_list <- cbind(
      rownames(df)[as.numeric(edges[,1])],colnames(df)[as.numeric(edges[,2])])
    return(edge_list)

  } else if (network_type == 'GG'){

    genome_combs <- combn(rownames(df), m = 2)
    edge_list <- matrix(ncol = 3, nrow = 0)

    for(i in 1:ncol(genome_combs) ){
      combination <- genome_combs[,i]

      genomeA <- df[combination[1],]
      traits.A <- names(genomeA[which(genomeA == 1)])

      genomeB <- df[combination[2],]
      traits.B <- names(genomeB[which(genomeB == 1)])
      traits.shared <- intersect(traits.A, traits.B)
      num.shared <- length(traits.shared)

      edge_list <- rbind(edge_list, c(combination[1], combination[2], num.shared))
    }

    colnames(edge_list) <- c("GenomeA","GenomeB", "Number_Shared")
    return(edge_list)
  }
}

#' Plot_Background_Individual_Genes
#' @name Plot Background Individual Genes
#' @description Plotting of each background distribution.
#' @param bkgd.individual.Zscores A list containing (composite) Zscores based on
#' a random background distribution.
#' See \code{\link{Calc_Z_scores}} for more information.
#' @export
#' @author BO Oyserman
Plot_Background_Individual_Genes <- function(bkgd.individual.Zscores) {
  require(hexbin)
  require(RColorBrewer)
  rf <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

  all_scores_x <- c(
    bkgd.individual.Zscores$zscores$`Random Genes`$PC,
    bkgd.individual.Zscores$zscores$`Random Annotated Genes`$PC,
    bkgd.individual.Zscores$zscores$`Genes with the same annotation`$PC
  )

   all_scores_y <- c(
    bkgd.individual.Zscores$zscores$`Random Genes`$NRED,
    bkgd.individual.Zscores$zscores$`Random Annotated Genes`$NRED,
    bkgd.individual.Zscores$zscores$`Genes with the same annotation`$NRED
  )

  random.genes.hexb <- hexbin(bkgd.individual.Zscores$zscores$`Random Genes`$PC,
    bkgd.individual.Zscores$zscores$`Random Genes`$NRED,
    ybnds = c(min(all_scores_y), max(all_scores_y)),
    xbnds = c(min(all_scores_x), max(all_scores_x))
  )

  random.annotated.genes.hexb <- hexbin(bkgd.individual.Zscores$zscores$`Random Annotated Genes`$PC,
    bkgd.individual.Zscores$zscores$`Random Annotated Genes`$NRED,
    ybnds = c(min(all_scores_y), max(all_scores_y)),
    xbnds = c(min(all_scores_x), max(all_scores_x))
  )

  random.identical.annotated.genes.hexb <- hexbin(bkgd.individual.Zscores$zscores$`Genes with the same annotation`$PC,
    bkgd.individual.Zscores$zscores$`Genes with the same annotation`$NRED,
    ybnds = c(min(all_scores_y), max(all_scores_y)),
    xbnds = c(min(all_scores_x), max(all_scores_x))
  )

  cnt.max <- max(c(
    random.identical.annotated.genes.hexb@count,
    random.annotated.genes.hexb@count
  ))

  plot(random.genes.hexb,
  colramp = rf, mincnt = 1, maxcnt = cnt.max,
  xlab = "PC", ylab = "NRED", main = "Random Genes"
  )

  plot(random.annotated.genes.hexb,
    colramp = rf, mincnt = 1, maxcnt = cnt.max,
    xlab = "PC", ylab = "NRED", main = "Random Annotated Genes"
  )

  plot(random.identical.annotated.genes.hexb,
    colramp = rf, mincnt = 1, maxcnt = cnt.max,
    xlab = "PC", ylab = "NRED", main = "Genes with the same annotation"
  )
}


#' Plot_Metric_Comparison
#' @name Plot Metric Comparison
#' @description Plotting densities and significance of each distance metric
#' @param bkgd.individual
#' @export
#' @author BO Oyserman
Plot_Metric_Comparison <- function(bkgd.individual) {
  t_test_KO_random_pearson <- t.test(bkgd.individual$`Random Annotated Genes`$PC,
    bkgd.individual$`Random Genes`$PC,
    alternative = "less"
  ) # x > y (NULL)

  t_test_KO_random_euclidean <- t.test(bkgd.individual$`Random Annotated Genes`$NRED,
    bkgd.individual$`Random Genes`$NRED,
    alternative = "greater"
  ) # x > y (NULL)

  par(
    mfrow = c(2, 2),
    mar = c(3, 3, 3, 1)
  )
  # plot 1
  plot(density(bkgd.individual$`Random Genes`$PC,
    adjust = 2,
    na.rm = TRUE
  ),
  ylim = c(0, 1),
  xlab = "",
  ylab = "",
  main = ""
  )
  points(density(bkgd.individual$`Random Annotated Genes`$PC, adjust = 2),
    typ = "l",
    col = "blue"
  )
  mtext(paste("p-value = ", signif(t_test_KO_random_pearson$p.value, 2)),
    side = 3,
    col = "blue",
    padj = 2,
    cex = 0.75
  )
  title(
    ylab = "Density",
    line = 2,
    cex.lab = 1
  )
  title(
    xlab = "PC",
    line = 2,
    cex.lab = 1
  )

  # plot 2
  plot(density(bkgd.individual$`Random Annotated Genes`$PC,
    adjust = 2
  ),
  ylim = c(0, 1),
  xlab = "",
  ylab = "",
  main = " "
  )
  points(density(bkgd.individual$`Genes with the same annotation`$PC,
    adjust = 2
  ),
  typ = "l",
  col = "red"
  )
  mtext(paste("p-value = ", signif(t_test_KO_random_pearson$p.value, 2)),
    side = 3,
    col = "red",
    padj = 2,
    cex = 0.75
  )
  title(
    ylab = "Density",
    line = 2,
    cex.lab = 1
  )
  title(
    xlab = "PC",
    line = 2,
    cex.lab = 1
  )

  # plot 3
  plot(density(bkgd.individual$`Random Genes`$NRED,
    adjust = 2
  ),
  typ = "l",
  ylim = c(0, 1.25),
  xlab = "",
  ylab = "",
  main = ""
  )
  points(density(bkgd.individual$`Random Annotated Genes`$NRED, adjust = 2),
    typ = "l",
    col = "blue"
  )
  title(
    ylab = "Density",
    line = 2,
    cex.lab = 1
  )
  title(xlab = "NRED", line = 2, cex.lab = 1)
  mtext(paste("p-value = ", signif(t_test_KO_random_euclidean$p.value, 2)),
    side = 3, col = "blue", padj = 2, cex = .75
  )

  # plot 4
  plot(density(bkgd.individual$`Random Annotated Genes`$NRED,
    adjust = 2
  ),
  typ = "l",
  ylim = c(0, 1.25),
  xlab = "",
  ylab = "",
  main = ""
  )
  points(density(bkgd.individual$`Random Annotated Genes`$NRED,
    adjust = 2
  ),
  typ = "l",
  col = "red"
  )
  title(
    ylab = "Density",
    line = 2,
    cex.lab = 1
  )
  title(
    xlab = "NRED",
    line = 2,
    cex.lab = 1
  )
  title(" \n\nComparison of random & functional \n pairwise comparisons",
    outer = TRUE
  )
  mtext(paste(
    "p-value = ",
    signif(t_test_KO_random_euclidean$p.value, 2)
  ),
  side = 3,
  col = "red",
  padj = 2,
  cex = .75
  )
}

#' Plot_Background_Modules
#' @name Plot Background Modules
#' @description Plotting of all background distributions of different sizes
#' together. Each distribution is represented by a different colour. This plot
#' hints wether or not the background distribution is indeed random.
#' @param bkgd.traits A collection of random background distributions of traits.
#' See \code{\link{Random_Trait_Background}}.
#' @export
#' @author JJM van Steenbrugge
Plot_Background_Modules <- function(bkgd.traits) {
  if (length(bkgd.traits) < 1) {
    return("There are no random background distributions calculated")
  }
  colours <- rainbow(length(bkgd.traits))
  bkgd.names <- names(bkgd.traits)
  legend <- sapply(
    bkgd.names,
    function(x) paste("N = ", x, sep = "")
  )

  plot(density(bkgd.traits[[1]],
    na.rm = TRUE
  ),
  ylim = c(0, 1),
  xlim = c(-4, 4),
  ylab = "Density",
  xlab = "Composite Zscore",
  cex.main = 0.75,
  col = colours[1],
  main = "Random background distributions of modules"
  )

  for (i in 2:length(bkgd.traits)) {
    points(density(bkgd.traits[[i]],
      na.rm = TRUE
    ),
    type = "l",
    col = colours[i]
    )
  }

  legend("topleft",
    legend = legend,
    col = colours,
    pch = 1,
    cex = 0.7
  )
}

#' Calculate Association Rules
#' @name Calculate Association Rules
#' @description Calculate Association Rules using the apriori algorithm (as
#' implemented in the arules package). This function lets the user decide on the
#' number of rules to produce and estimates parameters to produce those results.
#' @param sbs.trait.attributes Matrix where each collumn is a trait attribute
#' and each row a genome. Presence or absence is indicated with either a one
#' (1) or zero (0).
#' @param lhs A vector containing one or multiple terms for the left-hand-side
#' (antecedent) of the association rules. For more information see
#' \url{https://en.wikipedia.org/wiki/Association_rule_learning}.
#' @param rhs A vector containing one or multiple terms for the right-hand-side
#' (consequent) of the association rules. For more information see
#' \url{https://en.wikipedia.org/wiki/Association_rule_learning}.
#' @param N The number of association rules to create (e.g. N = 100 will result
#' in 100 association rules).
#' @export
#' @author JJM van Steenbrugge
Association_Rules <- function(sbs.trait.attributes,
                              lhs, rhs, N) {
  require(arules)

  .Reach_N <- function(N, lhs = c(), rhs = c()) {
    n <- 0
    support <- 1.0
    confidence <- 1.0
    while (n < N) {
      apri <- apriori(sbs.trait.attributes,
        parameter = list(
          support = support,
          confidence = confidence,
          minlen = 2
        ),
        appearance = list(
          lhs = lhs,
          rhs = rhs
        )
      )

      rules <- as(apri, "data.frame")
      n <- nrow(rules)
      print(n)
      support <- support - 0.1
      if (support <= 0) {
        support <- 1.0
        confidence <- confidence - 0.1
      }
    }
    return(rules)
  }



  if (missing(lhs) && missing(rhs)) {
    rules <- .Reach_N(N)
  } else if (!missing(lhs) && !missing(rhs)) {
    rules1 <- .Reach_N(N / 2, lhs = lhs)
    rules2 <- .Reach_N(N / 2, rhs = rhs)

    rules1 <- rules1[order(rules1[, 1]), ]
    rules2 <- rules2[order(rules2[, 1]), ]

    rules <- rbind(rules1[1:(N / 2), ], rules2[1:(N / 2), ])
  } else if (!missing(lhs)) {
    rules <- .Reach_N(N, lhs = lhs)
  } else if (!missing(rhs)) {
    rules <- .Reach_N(N, rhs = rhs)
  }

  return(rules)
}

#' Draw_Expression
#' @name Draw Expression
#' @description Creates a drawing of a trait (possibly with multiple different
#' trait-attributes) where all its composing genes are plotting individually
#' with their corresponding expression values.
#' @param trait character string with the name of a trait.
#' @param RNAseq.data Collection of multple components, include RNA seq data,
#' annotations, etc.
#' @seealso \code{\link{Pre_process_input}} for the full list of parameters,
#' \url{https://github.com/Jorisvansteenbrugge/TcT/wiki/The-RNAseq.data-object}
#' for more information.
#' @param trait.attributes.pruned The full list of all pruned trait attributes.
#' @export
#' @author JJM van Steenbrugge
Draw_Expression <- function(trait, RNAseq.data, trait.attributes.pruned) {
  y.max <- 140
  x.max <- 60


  attributes <- trait.attributes.pruned[[trait]]
  n.attributes <- length(attributes)
  genes <- RNAseq.data$features$annotation.db$module.dict[[trait]]
  n.genes <- length(genes)

  plot(c(0, as.numeric(x.max)), c(0, y.max), type = "n", xlab = "", ylab = "", axes = F)

  y.coords <- seq(y.max, 1, by = -15) # With this setting max of 7 genes
  x.coords <- seq(0, x.max, by = 11)

  # For each gene
  for (i in 1:n.genes) {
    gene <- genes[i]

    # Gene name
    graphics::text(
      x = (x.coords[1] + 5),
      y = (y.coords[i] - 5),
      labels = gene
    )

    # for each attribute
    for (y in 1:n.attributes) {
      #      Draw Labels
      graphics::text(x = (x.coords[y + 1] + 5), y = y.max, labels = paste(trait,
        y,
        sep = "."
      ))

      rect(
        xleft = x.coords[y + 1], ybottom = y.coords[i] - 10,
        xright = x.coords[y + 1] + 10, ytop = y.coords[i]
      )

      # Draw expression line
      genomes <- attributes[[y]]$genomes
      genomes.rna <- RNAseq.data$table[which(RNAseq.data$table$Bin %in% genomes), ]
      genomes.rna.gene <- genomes.rna[
        which(genomes.rna$Annotation == gene),
        RNAseq.data$features$rank.columns
      ]
      genomes.rna.gene.mean <- apply(genomes.rna.gene, 2, mean)

      # Generate box colours
      values <- seq(0.1, 1, by = 0.1)
      cols <- colorRamp(c("white", "red"))
      colours <- rgb(cols(values) / 255)

      # Draw all the boxes!
      box.coords <- seq(0, 10, length.out = (length(genomes.rna.gene.mean) + 1))
      for (z in 2:length(box.coords)) {
        rank <- genomes.rna.gene.mean[z - 1]
        col <- "red"
        if (is.nan(rank) || is.na(rank)) {
          rank <- 1
        }
        # I am not proud of this
        if (rank <= 1 && rank > 0.9) {
          col <- colours[1]
        }
        else if (rank <= 0.9 && rank > 0.8) {
          col <- colours[2]
        } else if (rank <= 0.8 && rank > 0.7) {
          col <- colours[3]
        } else if (rank <= 0.7 && rank > 0.6) {
          col <- colours[4]
        } else if (rank <= 0.6 && rank > 0.5) {
          col <- colours[5]
        } else if (rank <= 0.5 && rank > 0.4) {
          col <- colours[6]
        } else if (rank <= 0.4 && rank > 0.3) {
          col <- colours[7]
        } else if (rank <= 0.3 && rank > 0.2) {
          col <- colours[8]
        } else if (rank <= 0.2 && rank > 0.1) {
          col <- colours[9]
        } else if (rank <= 0.1 && rank >= 0.0) {
          col <- colours[10]
        }
        rect(
          xleft = x.coords[y + 1] + box.coords[(z - 1)], ybottom = y.coords[i] - 10,
          xright = x.coords[y + 1] + box.coords[z], ytop = y.coords[i],
          col = col
        )
        for (line.idx in 1:nrow(genomes.rna.gene)) {
          y.bot <- y.coords[i] - 10
          rank.pos <- (genomes.rna.gene[line.idx, ] * 10) + y.bot
          x.pos <- seq(1, 10, length.out = 6) + x.coords[y + 1]
          graphics::lines(
            x.pos,
            rank.pos
          )
        }
      }
    }
  }
}

#' Get module categories
#' @description Returns a list of the kegg categories with their corresponding modules
#' @export
Get_module_categories <- function(pythonfile){
  reticulate::source_python(pythonfile)
  parsed <- get_module_categories(kegg_categories)

  cat.genes <- parsed[[1]]
  names(cat.genes) <- parsed[[2]]

  return(cat.genes)
}

#' Get Metric Dist
#' @param metric either {'PC','NRED'}
#' @param cat.genes list("Carbon")
getMetricDist <- function(metric, cat.genes, distance.metrics, RNAseq.data,
                          bkgd.individual, bkgd.individual.Zscores) {
  mean.distances <- matrix(nrow = 0, ncol = 4)

  for (category in names(cat.genes)) {
    distances.list <- list()
    cat(paste(category, "\n"))

    for (KO in cat.genes[[category]]) {
      data.rows <- RNAseq.data$table[which(RNAseq.data$table$Annotation == KO), ]

      genomes.present <- unique(data.rows$Bin)
      if (length(genomes.present) <= 1) {
        next()
      }
      dist.matrix <- matrix(data = NA, ncol = length(genomes.present), nrow = length(genomes.present))
      for (x in 1:(length(genomes.present) - 1)) {
        gen.x <- data.rows[which(data.rows$Bin == genomes.present[x]), ][1, ]

        for (y in (x + 1):length(genomes.present)) {
          gen.y <- data.rows[which(data.rows$Bin == genomes.present[y]), ][1, ]
          dist.matrix[x, y] <- distance.metrics[[metric]](gen.x, gen.y, RNAseq.data$features)
        }
      }

      distances.list[[KO]] <- dist.matrix
    } # ko

    distances <- unlist(distances.list)

    p.val <- NA
    p.c <- "not significant"

    try(p.val <- t.test(distances, bkgd.individual$`Random Annotated Genes`[[metric]],
      na.action = na.omit
    )$p.value,
    silent = T
    )

    try(if (p.val <= 0.05) {
      p.c <- "significant"
    } else {
      p.c. <- "not significant"
    }
    ,
    silent = T
    )


    cex <- median(distances, na.rm = T)
    if (is.null(cex)) {
      cex <- NA
    }


    # actual distances
    mean.dist <- mean(distances, na.rm = T)
    mean.dist.Z <- (mean.dist - bkgd.individual.Zscores$mu$`Random Annotated Genes`[[metric]]) /
      bkgd.individual.Zscores$sd$`Random Annotated Genes`[[metric]]



    mean.distances <- rbind(
      mean.distances,
      c(mean.dist.Z, p.c, category, cex)
    )
  }

  return(mean.distances)
}

#' Get Metric Dist module

getMetricDistModule <- function(metric, cat.modules, annotation_db) {
  mean.distances <- matrix(nrow = 0, ncol = 5)

  for (category in names(cat.modules)) {
    print(category)
    for (module in cat.modules[[category]]) {
      distances.list <- list()

      for (KO in annotation_db[[module]]) {
        data.rows <- RNAseq.data$table[which(RNAseq.data$table$Annotation == KO), ]

        genomes.present <- unique(data.rows$Bin)
        if (length(genomes.present) <= 1) {
          next()
        }
        dist.matrix <- matrix(data = NA, ncol = length(genomes.present), nrow = length(genomes.present))
        for (x in 1:(length(genomes.present) - 1)) {
          gen.x <- data.rows[which(data.rows$Bin == genomes.present[x]), ][1, ]

          for (y in (x + 1):length(genomes.present)) {
            gen.y <- data.rows[which(data.rows$Bin == genomes.present[y]), ][1, ]
            dist.matrix[x, y] <- distance.metrics[[metric]](gen.x, gen.y, RNAseq.data$features)
          }
        }

        distances.list[[KO]] <- dist.matrix
      } # KOs

      distances <- unlist(distances.list)
      distances <- as.numeric(distances[-which(is.na(distances))])

      p.val <- NA
      p.c <- "not significant"
      n.ko <- as.character(length(
        RNAseq.data$features$annotation.db$module.dict[[module]]
      ))

      try(p.val <- t.test(distances, bkgd.traits[[n.ko]])$p.value,
        silent = T
      )

      try(if (p.val <= 0.05) {
        p.c <- "significant"
      } else {
        p.c. <- "not significant"
      }
      ,
      silent = T
      )
      # actual distances
      mean.dist <- median(distances, na.rm = T)
      mean.dist.Z <- (mean.dist - bkgd.individual.Zscores$mu$`Random Annotated Genes`[[metric]]) /
        bkgd.individual.Zscores$sd$`Random Annotated Genes`[[metric]]
      mean.distances <- rbind(
        mean.distances,
        c(mean.dist.Z, p.val, p.c, category, module)
      )
    } # module
  }

  return(mean.distances)
}


#' @export
Plot_Pathway_modules <- function() {
  library(ggplot2)
  library(gridExtra)



  nred.modules.Z <- getMetricDistModule("NRED", categories, modules_ko)
  nred.modules.Z <- cbind(nred.modules.Z, (1:nrow(nred.modules.Z)))
  colnames(nred.modules.Z) <- c("value", "pval", "sig", "pathway", "module", "ids")
  nred.modules.Z.df <- as.data.frame(nred.modules.Z, stringsAsFactors = F)
  nred.modules.Z.df$value <- as.numeric(nred.modules.Z.df$value)
  nred.modules.Z.df$ids <- as.numeric(nred.modules.Z.df$ids)
  nred.modules.Z.df$ids <- factor(nred.modules.Z.df$ids, levels = nred.modules.Z.df$ids)



  nred <- ggplot(nred.modules.Z.df, aes(x = ids, y = value, colour = pathway, shape = factor(sig))) +
    theme(strip.background = element_blank(), strip.text = element_blank()) +
    geom_point(size = 2) +
    xlab("") +
    ylab("NRED Z score") +
    facet_grid(. ~ pathway, scales = "free_x", space = "free_x") +
    geom_hline(yintercept = 0) +
    ggtitle("NRED") +
    guides(colour = FALSE)




  pc.modules.Z <- getMetricDistModule("PC", categories, modules_ko)
  pc.modules.Z <- cbind(pc.modules.Z, (1:nrow(pc.modules.Z)))
  colnames(pc.modules.Z) <- c("value", "pval", "sig", "pathway", "module", "ids")
  pc.modules.Z.df <- as.data.frame(pc.modules.Z, stringsAsFactors = F)
  pc.modules.Z.df$value <- as.numeric(pc.modules.Z.df$value)
  pc.modules.Z.df$ids <- as.numeric(pc.modules.Z.df$ids)
  pc.modules.Z.df$ids <- factor(pc.modules.Z.df$ids, levels = pc.modules.Z.df$ids)


  pc <- ggplot(pc.modules.Z.df, aes(x = ids, y = value, colour = pathway, shape = factor(sig))) +
    theme(strip.background = element_blank(), strip.text = element_blank()) +
    geom_point(size = 2) +
    xlab("") +
    ylab("PC Z score") +
    facet_grid(. ~ pathway, scales = "free_x", space = "free_x") +
    geom_hline(yintercept = 0) +
    ggtitle("PC") +
    guides(colour = FALSE)


 grid.arrange(nred, pc, ncol = 1, nrow =2)
}



Plot_Trait_Expression <- function(trait, subset_genomes) {
  .getRank <- function(genome, rows) {
    genome.rows <- rows[
      which(rows$Bin == genome),
      RNAseq.data$features$rank.columns
    ]

    x <- genome.rows[sample.int(nrow(genome.rows), 1), ]

    return(x)
  }


  annotations <- RNAseq.data$features$annotation.db$module.dict[[trait]]
  par(mfrow = calcmfrow(length(annotations)))



  for (KO in annotations) {
    print(KO)
    rows <- RNAseq.data$table[which(RNAseq.data$table$Annotation == KO), ]

    if (missing(subset_genomes)) {
      genomes <- unique(rows$Bin)
    } else {
      genomes <- unique(rows$Bin)
      genomes <- subset_genomes[which(subset_genomes %in% genomes)]
    }

    print(genomes)
    if (length(genomes) == 0) {
      next()
    }

    plot(1:length(RNAseq.data$features$rank.columns), .getRank(genomes[1], rows),
      ylim = c(0, 1), main = KO, type = "l"
    )
    for (genome in genomes) {
      lines(1:length(RNAseq.data$features$rank.columns), .getRank(genome, rows))
    }
  }
}



Shared_traits <- function( RNAseq.data ) {

  interesting_bins <- c('16','39','22','29', '32')
  other_bins <- RNAseq.data$features$bins[which(!(RNAseq.data$features$bins %in% interesting_bins))]
  rows_shared <- RNAseq.data$features$trait_presence_absence %>% apply(1, function(row){
    if (sum(row[interesting_bins]) == length(interesting_bins) && sum(row[other_bins]) == 0) {
      return(T)
    } else{F}
  })

  shared_traits <- rows_shared[which(rows_shared)] %>% names


  distances <- pairwise.distances[RNAseq.data$features$annotation.db$module.dict$M00150] %>% sapply(function(x) {
    x[c("16","39"),c("16","39")]
  }) %>% .[which(!is.na(.))]

  t.test(distances, bkgd.traits$`4`, alternative = 'less')


}

Plot_Shared_Attributes <- function(trait.attributes.pruned, RNAseq.data) {
  library(magrittr)
  trait.names <- list()

  accum_bins <- c('16',"39")

  for (trait.name in names(trait.attributes.pruned)) {
    trait <- trait.attributes.pruned[[trait.name]]

    for (attribute.name in names(trait)) {
      attribute <- trait[[attribute.name]]
      genomes <- attribute$genomes
      if (sum(accum_bins %in% genomes) == 2) {
        trait.names[[trait.name]] <- attribute.name
      }
    }
  }

  other_bins <- RNAseq.data$features$bins[-which(RNAseq.data$features$bins %in% accum_bins)]

  bin_occurences <- sapply(other_bins, function(bin) {
    occurences <- sapply(1:length(trait.names), function(i) {
      ta <- trait.attributes.pruned[[names(trait.names)[i]]][[trait.names[[i]]]]
      if (bin %in% ta$genomes) {
        return(1)
      }
      else {
        return(0)
      }
    })

    return(sum(occurences))
  }) %>%
    sort(decreasing = T) %T>%
    plot(
      type = "l", xaxt = "n",
      xlab = "Genomic bins",
      ylab = "Number of attributes shared with CAA",
      ylim = c(0,30)
    )
  axis(1, at = 1:length(bin_occurences), labels = names(bin_occurences))
}


#' Plot_Redundancy_Traits
#' @name Plot_Redundancy_Traits
#' @description Plots a histogram of traits redundancy
#' @param RNAseq.data Collection of multple components, include RNA seq data, annotations, etc. See \code{\link{Pre_process_input}} for the full list.
#' @export
#' @author JJM van Steenbrugge
Plot_Redundancy_Traits <- function(RNAseq.data) {
  library(ggplot2)
  library(magrittr)

  ta.pa <- apply(RNAseq.data$features$trait_presence_absence, 1, function(row) {
    return(sum(row))
  })

  ta.pa <- ta.pa[which(ta.pa != 0)]
  sort(ta.pa) %>% barplot(xaxt='n', ylim = c(0,20))

}

#' @export
Calc_TnA_redundancy <- function(RNAseq.data ) {
  point.matrix <- matrix(ncol=4, nrow=0)

  t.pa <- RNAseq.data$features$trait_presence_absence

  combinations <- combn(RNAseq.data$features$bins, 2, simplify = F)

  for (pair in combinations){
    try({
      A <- pair[1] %>% as.character
      B <- pair[2] %>% as.character

      traits.A <- t.pa[, A] %>% which(. == T) %>% names
      traits.B <- t.pa[, B] %>% which(. == T) %>% names
      overlap.traits <- intersect(traits.A, traits.B) %>% length

      attributes.A <- which(sbs.trait.attributes[A,]  == 1) %>% names
      print(B)
      attributes.B <- which(sbs.trait.attributes[B,]  == 1) %>% names
      overlap.attributes <- intersect(attributes.A, attributes.B) %>% length

      point.matrix <- rbind(point.matrix,
                            c(A, B,
                              overlap.traits,
                              overlap.attributes))
    })

  }
  colnames(point.matrix) <- c("A", "B", 'traits', 'attributes')

  point.df <- data.frame(point.matrix, stringsAsFactors = F)
  point.df$traits %<>% as.numeric
  point.df$attributes %<>% as.numeric
  all_bins <- unique(c(point.df[, 1], point.df[, 2]))
  sum_traits <- NULL
  sum_attributes <- NULL

  for (i in all_bins) {
    bin_rows <- which(point.df[, 1] == i | point.df[, 2] == i)
    sum_traits <- c(sum_traits, sum(point.df[bin_rows, 3]))
    sum_attributes <- c(sum_attributes, sum(point.df[bin_rows, 4]))
  }

  return(cbind(all_bins, sum_traits, sum_attributes))
}

Plot_traits_vs_attributes <- function(model_bin) {
  point.matrix <- matrix(ncol=3, nrow=0)
  point.matrix_39 <- matrix(ncol=3, nrow=0)

  t.pa <- RNAseq.data$features$trait_presence_absence

  combinations <- combn(RNAseq.data$features$bins, 2, simplify = F)

  if(!missing(model_bin)){
    combinations_39 <- lapply(RNAseq.data$features$bins, function(bin){
      if(!bin==model_bin){
      return(c(bin, model_bin))
      }
    })
  }


  for (pair in combinations){
    try({
      A <- pair[1] %>% as.character
      B <- pair[2] %>% as.character

      traits.A <- t.pa[, A] %>% which(. == T) %>% names
      traits.B <- t.pa[, B] %>% which(. == T) %>% names
      overlap.traits <- intersect(traits.A, traits.B) %>% length

      attributes.A <- which(sbs.trait.attributes[A,]  == 1) %>% names
      print(B)
      attributes.B <- which(sbs.trait.attributes[B,]  == 1) %>% names
      overlap.attributes <- intersect(attributes.A, attributes.B) %>% length

      vs <- paste(pair, collapse = 'vs')

      point.matrix <- rbind(point.matrix,
                            c(vs,
                              overlap.traits,
                              overlap.attributes))
    })

  }
  colnames(point.matrix) <- c("pairs", 'traits', 'attributes')

  point.df <- data.frame(point.matrix, stringsAsFactors = F)
  point.df$traits %<>% as.numeric
  point.df$attributes %<>% as.numeric
  ylim_plot1 <- c(0,max(point.df$attributes)+5)
  xlim_plot1 <- c(0, max(point.df$traits)+5)

  plot(x = as.numeric(point.matrix[,2]),
       y = as.numeric(point.matrix[,3]),
       xlab = '# Overlap Traits',
       ylab = '# Overlap Attributes',
       main = "Pairwise Genome Comparisons",
       ylim = ylim_plot1,
       xlim = xlim_plot1)

  model <- lm(attributes~traits, data = point.df)
  abline(model$coefficients)

  cinterval <- confint(model, level=.99)
  abline(cinterval[,1])
  abline(cinterval[,2])


  if(!missing(model_bin)){
  for (pair in combinations_39){
    try({
      A <- pair[1] %>% as.character
      B <- pair[2] %>% as.character

      traits.A <- t.pa[, A] %>% which(. == T) %>% names
      traits.B <- t.pa[, B] %>% which(. == T) %>% names
      overlap.traits <- intersect(traits.A, traits.B) %>% length

      attributes.A <- which(sbs.trait.attributes[A,]  == 1) %>% names
      print(B)
      attributes.B <- which(sbs.trait.attributes[B,]  == 1) %>% names
      overlap.attributes <- intersect(attributes.A, attributes.B) %>% length

      vs <- paste(pair, collapse = 'vs')

      point.matrix_39 <- rbind(point.matrix_39,
                            c(vs,
                              overlap.traits,
                              overlap.attributes))
    })

  }
  colnames(point.matrix_39) <- c("pairs", 'traits', 'attributes')

  point.df <- data.frame(point.matrix_39, stringsAsFactors = F)
  point.df$traits %<>% as.numeric
  point.df$attributes %<>% as.numeric


  points(x = as.numeric(point.matrix_39[,2]),
       y = as.numeric(point.matrix_39[,3]),
       xlab = '# Overlap Traits',
       ylab = '# Overlap Attributes',
       main = "Pairwise Genome Comparisons",
       ylim = ylim_plot1,
       xlim = xlim_plot1,
       pch=19,
       cex=1.5,
       col="red")

  text(x=10,y=ylim_plot1[2]-5, paste("bin_",model_bin))
#  model_39 <- lm(attributes~traits, data = point.df)
#  abline(model_39$coefficients, col="red")

#  cinterval_39 <- confint(model_39, level=.99)
#  abline(cinterval_39[,1], col="red")
#  abline(cinterval_39[,2], col="red")
  #polygon

  }
  # identify(x= as.numeric(point.matrix[,2]), y = as.numeric(point.matrix[,3]), labels = point.matrix[,1])
}

#' @export
Plot_traits_vs_attributes_highlight <- function(model_bin, string_to_colors) {
  point.matrix <- matrix(ncol=3, nrow=0)
  point.matrix_39 <- matrix(ncol=3, nrow=0)

  t.pa <- RNAseq.data$features$trait_presence_absence

  combinations <- combn(RNAseq.data$features$bins, 2, simplify = F)

  if(!missing(model_bin)){
    combinations_39 <- lapply(RNAseq.data$features$bins, function(bin){
      if(!bin==model_bin){
        return(c(bin, model_bin))
      }
    })
  }


  for (pair in combinations){
    try({
      A <- pair[1] %>% as.character
      B <- pair[2] %>% as.character

      traits.A <- t.pa[, A] %>% which(. == T) %>% names
      traits.B <- t.pa[, B] %>% which(. == T) %>% names
      overlap.traits <- intersect(traits.A, traits.B) %>% length

      attributes.A <- which(sbs.trait.attributes[A,]  == 1) %>% names
      print(B)
      attributes.B <- which(sbs.trait.attributes[B,]  == 1) %>% names
      overlap.attributes <- intersect(attributes.A, attributes.B) %>% length

      vs <- paste(pair, collapse = 'vs')

      point.matrix <- rbind(point.matrix,
                            c(vs,
                              overlap.traits,
                              overlap.attributes))
    })

  }
  colnames(point.matrix) <- c("pairs", 'traits', 'attributes')

  point.df <- data.frame(point.matrix, stringsAsFactors = F)
  point.df$traits %<>% as.numeric
  point.df$attributes %<>% as.numeric
  ylim_plot1 <- c(0,max(point.df$attributes)+5)
  xlim_plot1 <- c(0, max(point.df$traits)+5)

  plot(x = as.numeric(point.matrix[,2]),
       y = as.numeric(point.matrix[,3]),
       xlab = '# Overlap Traits',
       ylab = '# Overlap Attributes',
       main = "Pairwise Genome Comparisons",
       ylim = ylim_plot1,
       xlim = xlim_plot1)

  model <- lm(attributes~traits, data = point.df)
  abline(model$coefficients)

  cinterval <- confint(model, level=.99)
  abline(cinterval[,1])
  abline(cinterval[,2])


  if(!missing(model_bin)){
    for (pair in combinations_39){
      try({
        A <- pair[1] %>% as.character
        B <- pair[2] %>% as.character

        traits.A <- t.pa[, A] %>% which(. == T) %>% names
        traits.B <- t.pa[, B] %>% which(. == T) %>% names
        overlap.traits <- intersect(traits.A, traits.B) %>% length

        attributes.A <- which(sbs.trait.attributes[A,]  == 1) %>% names
        print(B)
        attributes.B <- which(sbs.trait.attributes[B,]  == 1) %>% names
        overlap.attributes <- intersect(attributes.A, attributes.B) %>% length

        vs <- paste(pair, collapse = 'vs')

        point.matrix_39 <- rbind(point.matrix_39,
                                 c(vs,
                                   overlap.traits,
                                   overlap.attributes))
      })

    }
    colnames(point.matrix_39) <- c("pairs", 'traits', 'attributes')

    point.df <- data.frame(point.matrix_39, stringsAsFactors = F)
    point.df$traits %<>% as.numeric
    point.df$attributes %<>% as.numeric


    points(x = as.numeric(point.matrix_39[,2]),
           y = as.numeric(point.matrix_39[,3]),
           xlab = '# Overlap Traits',
           ylab = '# Overlap Attributes',
           main = "Pairwise Genome Comparisons",
           ylim = ylim_plot1,
           xlim = xlim_plot1,
           pch=19,
           cex=1.5,
           col=string_to_colors)

    text(x=10,y=ylim_plot1[2]-5, paste("bin_",model_bin))
    #  model_39 <- lm(attributes~traits, data = point.df)
    #  abline(model_39$coefficients, col="red")

    #  cinterval_39 <- confint(model_39, level=.99)
    #  abline(cinterval_39[,1], col="red")
    #  abline(cinterval_39[,2], col="red")
    #polygon

  }
  # identify(x= as.numeric(point.matrix[,2]), y = as.numeric(point.matrix[,3]), labels = point.matrix[,1])
}

Get_KEGG_Sub_Modules <- function(){
  library(reticulate)
  library(magrittr)

  source_python('/home/joris/TcT/python/parse_module_definition.py')

  # module_names <- RNAseq.data$features$annotation.db$module.dict %>% names

  start <- Sys.time()
  sub_modules <- lapply(module_names, function(module) {
    print(paste0("Parsing: ", module))

  if (length(grep('M[0-9]{5}', module)) != 0 ) {
    return(parse_module(module) )
  }else{
   return(list(RNAseq.data$features$annotation.db$module.dict[[module]]))
  }

  })
  end <- Sys.time()

  names(sub_modules) <- module_names

}

#' Identify whether an annotation term has a certain expression pattern in one
#' or more genomes
#' @name Go Fish
#' @description Identify whether a gene has a certain expression pattern in one or more genomes.
#' @param RNAseq.data Collection of multple components, include RNA seq data, annotations, etc. See \code{\link{Pre_process_input}} for the full list.
#' @export
#' @example Go_Fish('K00927', RNAseq.data, type = 'peak')
#' @author JJM van Steenbrugge
Go_Fish <- function(RNAseq.data){

  .Test_pattern <- function(pattern, gene_pattern, margin = 0.15){
    verdict <- T
    for( i in 1: length(pattern) ) {
      lower <- pattern[i] - margin
      upper <- pattern[i] + margin

      if(gene_pattern[i] < lower || gene_pattern[i] > upper) {
        verdict <- F
      }

    }
    return(verdict)
  }

  .Draw_pattern <- function(nsamples = 6){
    plot(c(1,nsamples), c(0,1), type='n',
         xlab = 'Time points',
         ylab = 'Normalized Rank')
    cat("Pick a rank value (Y axis) for each time point by clicking in the graph (start at 1)")

    positions <- list('x' = c(),
                      'y' = c()
    )

    for(i in 1:nsamples){
      pos <- locator(1)
      positions$x <- c(positions$x, pos$x)
      positions$y <- c(positions$y, pos$y)

      plot(positions,
           xlab = 'Time points',
           ylab = 'Normalized Rank',
           xlim = c(1, nsamples),
           ylim = c(0,1),
           type = 'b')

    }

    return(positions)

  }

  points <- .Draw_pattern()
  lines(points$x, points$y)
  text(3,0.5,'Using this pattern to match')

  for (i in 1:length(names(RNAseq.data$features$annotation.db$module.dict))) {
    trait <- RNAseq.data$features$annotation.db$module.dict[[i]]
    genes <- RNAseq.data$table[which(RNAseq.data$table$Annotation %in% trait),]


    verdicts <- apply(genes, 1, function(gene){
      gene <- gene[RNAseq.data$features$rank.columns]
      return(.Test_pattern(points$y, gene))
    })

    if (TRUE %in% verdicts) {
      print(names(RNAseq.data$features$annotation.db$module.dict)[i])
    }
  }

}

#' Model Module
#' @name Model_Module
#' @description Creates a bar plot of pairwise comparisons between the traits of a model genome.
#' If a distance is less than the 99% quantile of the background distances for a module of similar size,
#' The bar is presented in dark grey.
#' @param trait character string with the name of a trait.
#' @param RNAseq.data Collection of multple components, include RNA seq data,
#' annotations, etc.
#' @seealso \code{\link{Pre_process_input}} for the full list of parameters,
#' \url{https://github.com/Jorisvansteenbrugge/TcT/wiki/The-RNAseq.data-object}
#' for more information.
#' @param RNAseq.data
#' @param trait.attributes
#' @param Model_Bin
#' @param Module_Pool
#' @param Bin_Order
#' @param Yrange It is optional to provide a Yrange
#' @examples Module_Model_List <- Model_Module(RNAseq.data, trait.attributes, '39', Module_Names, c(-2.5,2.5), bkgd.traits)
#'\dontrun{
#' Model_Bin <- 39
#' Bin_Order <- c(48,32,31,29,22,11,39,16,53,45,42,28,20,25,19,8,36,26,17)
#' Yrange <- c(-2.5,2.5)
#' Module_Names_ATP_synthesis <- c("M00144","M00143","M00146","M00147","M00149","M00150","M00148","M00162","M00151","M00152","M00154","M00155","M00153","M00417","M00416","M00156","M00157","M00159")
#'}
#' @export
#' @author BO Oyserman
#'

Model_Module <- function(RNAseq.data, trait.attributes, Model_Bin, Module_Names,bkgd.traits) {


  annotation.db <- RNAseq.data$features$annotation.db

  # Step one, take the module list and match it to the module dictionary psoitions.
# This is done by parsing out the module.dict
# There are various parts of this function
# 1) making a matrix of pairwise distances to the model
# 2) calculate the order on the X and Y axis
#     X - axis: the order of the genomes based on similarity to the model
#     Y - axis: the order of the modules based on their similarity to the model


  Module_Positions         <- match(Module_Names,
                                   names(annotation.db$module.dict))
  Module_Pool              <- NULL
  Model_Comparison_Matrix  <- NULL
  Module_Name_vector       <- NULL
  Model_Sig_Matrix         <- NULL
  Fish_Backgrounds_trimmed <- NULL

  # Using non-disjunctive form
   Module_lengths           <- lapply(annotation.db$module.dict[Module_Names],
                                      length) %>% as.numeric

 # Module_lengths           <- d_module_size_range_all[which(names(annotation.db$module.dict)%in%Module_Names)]

  Module_background_distribution_index <- match(as.numeric(Module_lengths),
                                                as.numeric(names(bkgd.traits)))

  Fish_Backgrounds <- c()
  for (i in Module_background_distribution_index){

    Fish_Background  <- quantile(unlist( bkgd.traits[i]),
                                 probs = seq(0, 0.1, .01))[6]
    Fish_Backgrounds <- c(Fish_Backgrounds,
                          Fish_Background)

  }

  # For each module name, get the number of genes in the module. This needs to be updated to use the disjunctive normalized forms.

  for (i in 1: length(Module_Names)) {
    Module_Pool[i] <- list(trait.attributes[[ Module_Positions[i] ]] [2])
  }

  # Reorder the bins based on the input variable, hashed out for updated version in which order is defined based on similarity
  # If a predetermined bin order is  supplied ...

  # Bin_Order_Index <- match(Bin_Order, rownames(Module_Pool[[1]] [[1]]))

  # Make Model Comparison Matrix, filtering modules that were not present in the model organism

  for (i in 1: length(Module_Pool) ) {

    # Create full matrix from each module by filling in the bottom of the traingle
    fullmatrix                                      <- Module_Pool[[i]] [[1]]
    fullmatrix[lower.tri(fullmatrix, diag = FALSE)] <- fullmatrix[upper.tri(fullmatrix,
                                                                            diag = FALSE)]
    # Parse out the vector of the model organisms and all pairwise comparisons
    Model_Comparison_vector <- fullmatrix[,
                                           which(dimnames(fullmatrix) [[1]] == Model_Bin
                                                  )]
    # print (Model_Comparison_vector)

    # If the vector is empty, go to the next module. Else, cbind the vector to a growing matrix containing all Module comparisons for that Model genome
    if(sum(is.na(Model_Comparison_vector)) == length(Model_Comparison_vector)){
      next
    } else {
      Model_Comparison_Matrix  <- cbind(Model_Comparison_Matrix,
                                        Model_Comparison_vector)
      Module_Name_vector       <- c(Module_Name_vector,
                                    Module_Names[i])
      # Only keep relavent backgrounds
      Fish_Backgrounds_trimmed <- c(Fish_Backgrounds_trimmed,
                                    Fish_Backgrounds[i])
    }

  }


  # Clean up the values, converting NaN to NA
  Model_Comparison_Matrix[ which(Model_Comparison_Matrix == "NaN") ] < -NA

  # Identify the bin orders their similarity with the model, and create an index
  Sum_Similarity  <- apply(Model_Comparison_Matrix, 1, sum, na.rm = TRUE)
  Bin_Order       <- Model_Comparison_Matrix %>% apply(1, sum, na.rm = T) %>%
                          sort %>% names

  Bin_Order_Index <- match(Bin_Order,
                           rownames(Model_Comparison_Matrix)) %>% rev

  # Next make a matrix of whether a module is significant
  for (i in 1: ncol(Model_Comparison_Matrix) ) {

    sig_bins         <- (Model_Comparison_Matrix[,i] <= Fish_Backgrounds_trimmed[i])
    Model_Sig_Matrix <- cbind(Model_Sig_Matrix,
                              sig_bins)
  }

  # Name the columns by module name
  colnames(Model_Comparison_Matrix) <- Module_Name_vector
  names(Fish_Backgrounds_trimmed)   <- Module_Name_vector
  colnames(Model_Sig_Matrix)        <- Module_Name_vector

  # Identify the module orders based on their similarity with the model, and create an index.
  # Similarity is based on the number of significantly similar

  Sum_Similarity_M   <- apply(Model_Sig_Matrix, 2,
                              sum, na.rm = TRUE)
  Module_Order       <- names(sort(apply(Model_Sig_Matrix, 2,
                                         sum, na.rm = TRUE)))
  Module_Order_Index <- match(Module_Order,
                              colnames(Model_Comparison_Matrix))


  # Return the various things calculated

  Model_Module_List <- list("Model_Comparison_Matrix"  = Model_Comparison_Matrix,
                            "Fish_Backgrounds_trimmed" = Fish_Backgrounds_trimmed,
                            "Model_Sig_Matrix"         = Model_Sig_Matrix,
                            "Bin_Order_Index"          = Bin_Order_Index,
                            "Module_Order_Index"       = Module_Order_Index,
                            "Module_Order"             = Module_Order
                            )
  return(Model_Module_List)




}


#
#' Plot_Model_Module
#' @name Plot_Model_Module
#' @description Creates a bar plot of pairwise comparisons between the traits of a model genome.
#' If a distance is less than the 99% quantile of the background distances for a module of similar size,
#' The bar is presented in dark grey.
#' @seealso \code{\link{Pre_process_input}} for the full list of parameters,
#' \url{https://github.com/Jorisvansteenbrugge/TcT/wiki/The-RNAseq.data-object}
#' for more information.
#' @param Model_Module_List
#' @param Model_Bin
#' @param Module_Names
#' @export
#' @author BO Oyserman

Plot_Model_Module <- function(Model_Module_List, Model_Bin, Module_Names, margins, sortbygenome) {

num_genes <-  length( Model_Module_List[[4]])
par(mfrow = margins,
    mar   = c(2.1, 1, 2.1, 1))

# How it sorted? If by a particular genome then do the following
if (length(sortbygenome)==1) {
  j=0
  # if it is the first plot in a row, plot the labels

  for (i in order(Model_Module_List$Model_Comparison_Matrix[sortbygenome,])) {
    j = j +1
    if (j %in% (which(1:(margins[1]%*%margins[2]) %in% as.numeric(margins[2]%*%seq(1,margins[1]))==TRUE)-(margins[2]-1))) {
      barplot( Model_Module_List[[1]][ Model_Module_List[[4]], 1 ],
               col    = NA,
               border = NA,
               axes   = FALSE,
               xlim   = c(-2,1),
               ylim   = c(0,19),
               yaxt   = 'n',
               xaxt   = 'n')
      text(x = rep(-1,19),
           y = seq(from = 1, to = (num_genes-1), by = (num_genes-2) / (num_genes-1)),
           labels = rownames(Model_Module_List[[3]])[Model_Module_List[[4]]], cex = 1)

    } else {

      sig_colors                       <- rep("white", length(Model_Module_List[[4]]))
      sig_colors[Model_Module_List[[3]][,i]] <- "gray0"

      barplot(Model_Module_List[[1]][Model_Module_List[[4]], i],
              xlim = c(-2,1), horiz = TRUE,
              main = colnames(Model_Module_List[[1]])[i],
              col  = sig_colors[Model_Module_List[[4]]],
              yaxt = 'n'
      )
      abline(v = 0, lwd = 1)
    }
  }
  # How it sorted? If NOT by a particular genome then sort by most significant modules
} else {
  j=0

  for (i in rev(Model_Module_List[[5]])) {

    j = j +1
    if (j %in% (which(1:(margins[1]%*%margins[2]) %in% as.numeric(margins[2]%*%seq(1,margins[1]))==TRUE)-(margins[2]-1))) {
      barplot( Model_Module_List[[1]][ Model_Module_List[[4]], 1 ],
               col    = NA,
               border = NA,
               axes   = FALSE,
               xlim   = c(-2,1),
               ylim   = c(0,19),
               yaxt   = 'n',
               xaxt   = 'n')
      text(x = rep(-1,19),
           y = seq(from = 1, to = (num_genes-1), by = (num_genes-2) / (num_genes-1)),
           labels = rownames(Model_Module_List[[3]])[Model_Module_List[[4]]], cex = 1)

    } else {

      sig_colors                        <- rep("white", length(Model_Module_List[[4]]))
      sig_colors[Model_Module_List[[3]][,i]] <- "gray0"

      barplot(Model_Module_List[[1]][Model_Module_List[[4]], i],
              xlim = c(-2,1), horiz = TRUE,
              main = colnames(Model_Module_List[[1]])[i],
              cex.main = 0.75,
              col  = sig_colors[Model_Module_List[[4]]],
              yaxt = 'n'
      )
      abline(v = 0, lwd = 1)
    }
  }
}

}


#
#' Plot_Model_Module_Simple
#' @name Plot_Model_Module
#' @description Creates a bar plot of pairwise comparisons between the traits of a model genome.
#' If a distance is less than the 99% quantile of the background distances for a module of similar size,
#' The bar is presented in dark grey.
#' @seealso \code{\link{Pre_process_input}} for the full list of parameters,
#' \url{https://github.com/Jorisvansteenbrugge/TcT/wiki/The-RNAseq.data-object}
#' for more information.
#' @param Model_Module_List
#' @param Model_Bin
#' @param Module_Names
#' @export
#' @author BO Oyserman

Plot_Model_Module <- function(Model_Module_List, Model_Bin, Module_Names, margins, sortbygenome) {

  num_genes <-  length( Model_Module_List[[4]])
  par(mfrow = margins,
      mar   = c(2.1, 1, 2.1, 1))

  # How it sorted? If by a particular genome then do the following
  if (length(sortbygenome)==1) {

    for (i in order(Model_Module_List$Model_Comparison_Matrix[sortbygenome,])) {
        sig_colors                       <- rep("white", length(Model_Module_List[[4]]))
        sig_colors[Model_Module_List[[3]][,i]] <- "gray0"

        barplot(Model_Module_List[[1]][Model_Module_List[[4]], i],
                xlim = c(-2,1), horiz = TRUE,
                main = colnames(Model_Module_List[[1]])[i],
                col  = sig_colors[Model_Module_List[[4]]],
                yaxt = 'n'
        )
        abline(v = 0, lwd = 1)
      }
    # How it sorted? If NOT by a particular genome then sort by most significant modules
  } else {

    for (i in rev(Model_Module_List[[5]])) {
        sig_colors                        <- rep("white", length(Model_Module_List[[4]]))
        sig_colors[Model_Module_List[[3]][,i]] <- "gray0"

        barplot(Model_Module_List[[1]][Model_Module_List[[4]], i],
                xlim = c(-2,1), horiz = TRUE,
                main = colnames(Model_Module_List[[1]])[i],
                cex.main = 0.75,
                col  = sig_colors[Model_Module_List[[4]]],
                yaxt = 'n'
        )
        abline(v = 0, lwd = 1)
      }
    }
  }



##' Automatically convert a vector of strings into a color for easy plotting
##'
##' @title Convert between strings to colors
##' @param string a vector of strings representing groups.
##' @param colors a vector of colors, one for each unique element in \code{string}.
##' @export
##' @return a vector of colors, one for each element in \code{string}
##' @aliases string.to.colors stringtocolor stringToColors string.to.color
##' @author Dustin Fife
##' @seealso \code{\link{number.to.colors}}
##' @examples
##' groups = sample(LETTERS[1:5], size=100, replace=TRUE)
##' plot(rnorm(100), rnorm(100), col=string.to.colors(groups))
##' plot(rnorm(100), rnorm(100), col=string.to.colors(groups),
##'    pch=as.numeric(string.to.colors(groups, colors=c(16:20))))
##' @note This function can also be used to specify pch values, cex values, or any other plotting values
##' the user may wish to differ across groups. See examples.
string.to.colors = function(string, colors=NULL){
  if (is.factor(string)){
    string = as.character(string)
  }
  if (!is.null(colors)){
    if (length(colors)!=length(unique(string))){
      break("The number of colors must be equal to the number of unique elements.")
    }
    else {
      conv = cbind(unique(string), colors)
    }
  } else {
    conv = cbind(unique(string), rainbow(length(unique(string))))
  }
  unlist(lapply(string, FUN=function(x){conv[which(conv[,1]==x),2]}))
}


##' Automatically convert a vector of numbers into a color for easy plotting
##'
##' @title Convert from numbers to colors
##' @param value a vector of numbers.
##' @param colors a vector of two or more colors representing the inflection points of the gradients, passed to \code{\link{colorRampPalette}}.
##' @param num The number of unique intervals for colors. Chose larger numbers for finer gradients (higher resolution).
##' @export
##' @return a vector of colors.
##' @aliases number.to.color numbers.to.colors integers.to.colors integer.to.colors numberToColors numberToColor
##' @author Dustin Fife
##' @seealso \code{\link{string.to.colors}} \code{\link{colorRampPalette}} \code{\link{gradient.legend}}
##' @examples
##' #### plot three variables, represent the third with a color
##' d = mvrnorm(100, mu=c(0,0,0), Sigma=matrix(c(1, .6, .6, .6, 1, .6, .6, .6, 1), ncol=3))
##' plot(d[,1:2], col=number.to.colors(d[,3], c("black", "pink")), pch=16)
number.to.colors = function(value, colors=c("red", "blue"), num=100){
  cols = colorRampPalette(colors)(num)
  cols = 	cols[findInterval(value, vec=seq(from=min(value), to=max(value), length.out=num))]
  cols
}



##' Create a gradiented legend
##'
##' @param y the variable used to create the gradient, typically in \code{\link{number.to.colors}}
##' @param yrange The range of y values. If y is supplied, it will pulls these from the actual y values.
##' @param cols The color gradients to be used that are passed to \code{\link{colorRampPalette}}.
##' @param location The location of the subplot, expressed in fractions of the entire plot (left x, right x,
##' bottom y, top y).
##' @param n the number of values used for the gradient. Higher numbers make a higher resolution
##' @param ... other arguments passed to image
##' @aliases gradient
##' @seealso \code{\link{number.to.colors}} \code{\link{colorRampPalette}}
##' @author Dustin Fife
##' @export
##' @examples
##' y = rnorm(100); x = .6*y + rnorm(100,0,sqrt(1-.6^2))
##' randnum = runif(100)
##' plot(x,y, col=number.to.colors(randnum), pch=16)
##' gradient.legend(randnum, xlab="", ylab="")
gradient.legend = function(y=NULL, yrange=NULL, cols = c("red", "blue"),location=c(.075,.3,.575,.975), n=100,...){

  #### if they don't supply a y, make sure they give range
  if (is.null(y) & is.null(yrange)){
    stop("You must supply either a y value or a y range.")
  }
  if (is.null(yrange)){
    yrange = range(y, na.rm=T)
  }

  #### set location
  par(fig=location, new=T, las=1, ps=9)
  z=matrix(1:n,nrow=1)
  x=1
  y=seq(yrange[1],yrange[2],len=n)
  my.colors = colorRampPalette(cols)
  image(x,y,z,col=my.colors(n),axes=FALSE,...)
  axis(2)
  par(fig=c(0,1,0,1))
}
Jorisvansteenbrugge/TcT documentation built on Sept. 26, 2022, 6:50 a.m.