ffl: Identifies Feed Forward Gene Networks

#file: DE-method/DEm-2-arm2-fn-v1.R

# arm2: TF-gene -----------------------------------------------------------


# arm2 prep ---------------------------------------------------------------

# #!!! comment out TF-gene databases for now
# ###########load TF-gene databases
# ###database 1: TRRUST
# #load
# trrust <- read.table(file = "/Users/than/Dropbox/research/miRNA/ffl/new/databases/TF-gene/trrust_rawdata.human.tsv", sep = '\t', header = FALSE)
# #rename columns
# colnames(trrust) <- c("TF", "gene", "TRRUST_Regulation", "TRRUST_References_PMID")
# #check class of df columns
# table(sapply(trrust, class))
# trrust[ , 1:4] <- apply(trrust[ , 1:4], 2, as.character) #convert all columns from factor to character
# table(sapply(trrust, class))
# #add TRRUST column: 1 for all rows (represents whether TF-gene pair is in the TRRUST database; used when in step when searching whether pos. corr. pairs are in database)
# trrust$TRRUST <- 1
#
# ###database 2: ENCODE
# #load
# encode <- read.table(file = "/Users/than/Dropbox/research/miRNA/ffl/new/databases/TF-gene/ENCODE_gene_attribute_edges.txt", sep = '\t', header = TRUE)
# #remove first row ("GeneSym, NA, GeneID...")
# encode <- encode[2:nrow(encode), ]
# #check class of df columns
# table(sapply(encode, class))
# encode[ , 1:6] <- apply(encode[ , 1:6], 2, as.character)
# encode$weight <- as.numeric(encode$weight)
# table(sapply(encode, class))
# #keep only 4 columns (other columns are na)
# #1. target: TFs (there are 181 unique "target" values and 181 TFs in the dataset, see website)
# #2. target_id
# #3. source: genes
# #4. source_id
# encode <- encode[ , c("target", "source", "target_id", "source_id")]
# colnames(encode) <- c("TF", "gene", "ENCODE_TF_ID", "ENCODE_Gene_ID")
# #add ENCODE column: 1 for all rows (represents whether TF-gene pair is in the ENCODE database; used when in step when searching whether pos. corr. pairs are in database)
# encode$ENCODE <- 1
# ###########fin
#
#
# ###########create list of TFs
# ###get a list of TFs from TRRUST db
# tf_trrust <- trrust$TF
# tf_trrust <- unique(tf_trrust)
# #788/795 unique TFs are in unique(isoformAnnot$GeneSymbol)
# sum(tf_trrust %in% unique(isoformAnnot$GeneSymbol))
#
# ###get a list of TFs from ENCODE db
# tf_encode <- encode$TF
# tf_encode <- unique(tf_encode)
# #181/181 unique TFs are in unique(isoformAnnot$GeneSymbol)
# sum(tf_encode %in% unique(isoformAnnot$GeneSymbol))
#
# ###130/181 ENCODE TFs are in TRRUST TFs
# length(intersect(tf_trrust, tf_encode))
#
# ###all TFs
# arm2_tf_list <- unique(c(tf_trrust, tf_encode))
# ###########fin
#


# arm2 function -----------------------------------------------------------


###########arm2_tf_gene function
#' @title DEm_arm2_tf_gene
#' @description identifies valid TF-gene pairs for FFL
#' and that are found in databases
#'
#' @param mrna_DE dataframe of DE mRNAs
#'
#' @return list of length 3:
#' 1) tf_DE: dataframe of DE TFs
#' 2) mRNA_DE: dataframe of DE genes
#' 3) db_hits: dataframe of TF-miRNA pairs that were found in databases

DEm_arm2_tf_gene <- function(mrna_DE){
  #####DE TFs
  tf_DE <- mrna_DE[row.names(mrna_DE) %in% arm2_tf_list, ]
  #for the DE columns, change "(gene)" to "(TF)"
  names(tf_DE)[names(tf_DE) == "log-ratio(gene)"] <- "log-ratio(TF)"
  names(tf_DE)[names(tf_DE) == "P-Value(gene)"] <- "P-Value(TF)"
  names(tf_DE)[names(tf_DE) == "P-adjust(gene)"] <- "P-adjust(TF)"
  names(tf_DE)[names(tf_DE) == "mean_case(gene)"] <- "mean_case(TF)"
  names(tf_DE)[names(tf_DE) == "mean_control(gene)"] <- "mean_control(TF)"

  #####DE mRNAs
  #use mrna_DE itself
  print("1/3 -- created DE TF & DE gene dataframes")

  #####find all possible TF-gene pairs
  #logFC: case - control
  #logFC pos: upregulated in case group
  #logFC neg: downregulated in case group
  #pairs: TF up, gene up
  tf_up <- row.names(tf_DE[tf_DE$`log-ratio` > 0, ]) #upregulated TFs
  mrna_up <- row.names(mrna_DE[mrna_DE$`log-ratio` > 0, ]) #upregulated genes
  tf_up_mrna_up_pairs <- expand.grid(tf_up, mrna_up) #find all possible pairs
  colnames(tf_up_mrna_up_pairs) <- c("TF", "gene")
  #pairs: TF down, gene down
  tf_down <- row.names(tf_DE[tf_DE$`log-ratio` < 0, ]) #downregulated TFs
  mrna_down <- row.names(mrna_DE[mrna_DE$`log-ratio` < 0, ]) #downregulated genes
  tf_down_mrna_down_pairs <- expand.grid(tf_down, mrna_down) #find all possible pairs
  colnames(tf_down_mrna_down_pairs) <- c("TF", "gene")
  #put all pairs together
  tf_gene_pairs <- rbind(tf_up_mrna_up_pairs, tf_down_mrna_down_pairs)
  tf_gene_pairs[ , 1:2] <- apply(tf_gene_pairs[ , 1:2], 2, as.character)
  #print info
  print("2/3 -- finished gathering potential TF-gene pairs")
  print(paste(dim(tf_gene_pairs)[1], "potential TF-gene pairs", sep = " "))

  #####search TF-gene pairs in database
  #1. search TRRUST database
  db_hits <- merge(tf_gene_pairs, trrust[ , c("TF", "gene", "TRRUST")], by = c("TF", "gene"), all.x = TRUE)
  #if TRRUST column is NA (i.e. TF-gene pair not in TRRUST database), change NA value to 0 (integers are used to calculate sum_db_hits_TFgene later)
  db_hits$TRRUST[is.na(db_hits$TRRUST)] <- 0
  #2. search ENCODE database
  db_hits <- merge(db_hits, encode[, c("TF", "gene", "ENCODE")], by = c("TF", "gene"), all.x = TRUE)
  #if ENCODE column is NA (i.e. TF-gene pair not in ENCODE database), change NA value to 0 (integers are used to calculate sum_db_hits_TFgene later)
  db_hits$ENCODE[is.na(db_hits$ENCODE)] <- 0
  #3. create sum_db_hits_TFgene column (indicates: TF-gene pair found in X databases)
  db_hits$sum_db_hits_TFgene <- db_hits$TRRUST + db_hits$ENCODE
  #filter by sum_db_hits_TFgene column
  db_hits_filtered <- db_hits[db_hits$sum_db_hits_TFgene >= 1, ]
  print(paste(dim(db_hits_filtered)[1], "/", dim(tf_gene_pairs)[1], "TF-gene pairs found in at least one database", sep = " "))
  print("3/3 -- finished database searches")

  #####add DE columns
  #add TF DE columns
  tf_DE_copy <- tf_DE
  tf_DE_copy$TF <- row.names(tf_DE_copy)
  db_hits_filtered <- merge(x = db_hits_filtered,
                            y = tf_DE_copy[c("TF", "log-ratio(TF)", "P-Value(TF)", "P-adjust(TF)", "mean_case(TF)", "mean_control(TF)")],
                            by = "TF",
                            all.x = TRUE)
  #add gene DE columns
  mrna_DE_copy <- mrna_DE
  mrna_DE_copy$gene <- row.names(mrna_DE_copy)
  db_hits_filtered <- merge(x = db_hits_filtered,
                            y = mrna_DE_copy[c("gene", "log-ratio(gene)", "P-Value(gene)", "P-adjust(gene)", "mean_case(gene)", "mean_control(gene)")],
                            by = "gene",
                            all.x = TRUE)

  return(list(tf_DE = tf_DE, mrna_DE = mrna_DE, db_hits = db_hits_filtered))
}
###########fin