#----------------------------------------------------------------------------------------------------
#' Merge Footprints for Lymphoblast Data
#'
#' Merge the HINT and Wellington footprints into the dataframe of FIMO motifs and ChIPSeq Data for
#' all chromosomes
#'
#' @param fimo.df A data frame containing FIMO motifs and ChIPSeq data, generally constructed
#' using the \code{constructLymphoblastDataset.R} function
#' @param seedNum A number (either 16 or 20) denoting which seed size to use for the merge
#' @param host A string denoting the database host (default = "localhost")
#' @param port A string denoting the database port (default = "5432")
#' @param verbose A Boolean value that determines whether the function should print output during
#' the merge (default = TRUE)
#'
#' @return The supplied FIMO/ChIPSeq data frame with added columns for HINT count, HINT maximum
#' score, Wellington count, and Wellington minimum score
#'
#' @export
mergeLymphoblastFootprints <- function(fimo.df, seedNum, host = "localhost", port = "5432", verbose = TRUE){
# Create database name strings
db.hint <- paste0("lymphoblast_hint_", seedNum)
db.well <- paste0("lymphoblast_wellington_", seedNum)
# Read data from lymphoblast
db_lymph_hint <- DBI::dbConnect(drv = RPostgreSQL::PostgreSQL(),
user="trena",
password="trena",
dbname = db.hint,
host = host,
port = port)
db_lymph_well <- DBI::dbConnect(drv = RPostgreSQL::PostgreSQL(),
user="trena",
password="trena",
dbname = db.well,
host = host,
port = port)
hint_regions <- dplyr::tbl(db_lymph_hint, "regions")
hint_hits <- dplyr::tbl(db_lymph_hint, "hits")
well_regions <- dplyr::tbl(db_lymph_well, "regions")
well_hits <- dplyr::tbl(db_lymph_well, "hits")
# Run the big function using a for loop
chromosomes <- as.character(c(1:22, "X","Y"))
big_list <- list()
counter <- 1
now.time <- Sys.time()
for (chr_str in chromosomes) {
# Print if it's verbose
if(verbose){
message(paste("working on chromosome", chr_str))
}
big_list[[counter]] <- mergeFootprintsOneChrom(chr_str,
fimo.df,
hint_regions,
hint_hits,
well_regions,
well_hits)
counter <- counter + 1
if(verbose){
message(paste("Time elapsed:", Sys.time() - now.time))
}
}
# Combine them
merged.df <- dplyr::bind_rows(big_list)
rm(big_list)
# Return them
return(merged.df)
} # mergeLymphoblastFootprints
#----------------------------------------------------------------------------------------------------
#' Merge Footprints for 1 Chromosome
#'
#' Merge HINT and Wellington footprints into the dataframe of FIMO motifs and ChipSeq for
#' 1 chromosome
#'
#' @param chrom_str A string representing one chromosome; must be one of c(1:22, "X", "Y")
#' @param fimo_tbl A dataframe of FIMO motifs and ChIPSeq data
#' @param hint_regions_tbl A database table of HINT regions
#' @param hint_hits_tbl A database table of HINT hits
#' @param well_regions_tbl A database table of Wellington regions
#' @param well_hits_tbl A database table of Wellington hits
#'
#' @return The original FIMO dataframe with added columns for footprint counts and the maximum or
#' minimum score
#'
#' @export
mergeFootprintsOneChrom <- function(chrom_str,
fimo_tbl,
hint_regions_tbl,
hint_hits_tbl,
well_regions_tbl,
well_hits_tbl
) {
# some tables use chr22 and some just use 22
chrom_long_str = paste("chr",chrom_str, sep="")
# select one chromosome from my data table
fimo_tbl %>%
dplyr::filter(chrom==chrom_str) %>%
dplyr::select(-empty) ->
chrom_all_tf_df
# Create an overlap function
getOverlap <- function(motif_start, motif_end, fp_start, fp_end){
# Create the 3 terms for the numerator
term1 <- abs(max(motif_end, fp_end) - min(motif_start, fp_start))
term2 <- abs(motif_end - fp_end)
term3 <- abs(motif_start - fp_start)
# Put terms together and return number
num <- term1 - term2 - term3
denom <- motif_end - motif_start
perc.overlap <- 100*num/denom
return(perc.overlap)
}
# select one chromosome from hint
hint_regions_tbl %>%
dplyr::filter(chrom==chrom_long_str) %>%
dplyr::left_join(hint_hits_tbl, by="loc") %>%
dplyr::select(start, endpos, strand, name,
fp_start, fp_end, h_score = score1) %>%
dplyr::group_by(start, endpos, name, strand) %>%
dplyr::mutate(h_count = n(), h_max_score = max(h_score)) %>%
dplyr::select(-h_score) %>%
tibble::as_tibble() ->
chrom_hint_all_tbl
# Grab only distinct ones
chrom_hint_all_tbl %>%
dplyr::rowwise() %>%
dplyr::mutate(h_percent_overlap = getOverlap(start, endpos,
fp_start, fp_end)) %>%
dplyr::select(-fp_start, -fp_end) %>%
dplyr::distinct(start, endpos, name, strand, .keep_all = TRUE) ->
chrom_hint_unique_tbl
rm(chrom_hint_all_tbl)
# select one chromosome from wellington
well_regions_tbl %>%
dplyr::filter(chrom==chrom_long_str) %>%
dplyr::left_join(well_hits_tbl, by="loc") %>%
dplyr::select(start, endpos, strand, name,
fp_start, fp_end, w_score = score1) %>%
dplyr::group_by(start, endpos, name, strand) %>%
dplyr::mutate(w_count = n(), w_min_score = min(w_score)) %>%
dplyr::select(-w_score) %>%
tibble::as_tibble() ->
chrom_well_all_tbl
# keep only min wellington score but count total nontrivial scores
chrom_well_all_tbl %>%
dplyr::rowwise() %>%
dplyr::mutate(w_percent_overlap = getOverlap(start, endpos,
fp_start, fp_end)) %>%
dplyr::select(-fp_start, -fp_end) %>%
dplyr::distinct(start, endpos, name, strand, .keep_all = TRUE) ->
chrom_well_unique_tbl
rm(chrom_well_all_tbl)
# merge hint and wellington into my table
chrom_all_tf_df %>%
dplyr::left_join(chrom_hint_unique_tbl,
by=c("start", "endpos", "strand","motifname"="name")) %>%
dplyr::left_join(chrom_well_unique_tbl,
by=c("start", "endpos", "strand", "motifname"="name")) %>%
tidyr::replace_na(list(h_count=0, w_count=0,
h_max_score=0, w_min_score=0,
h_percent_overlap = 0,
w_percent_overlap = 0)) ->
chrom_all_tf_df_merged
return(chrom_all_tf_df_merged)
}# mergeFootprintsOneChrom
#----------------------------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.