#----------------------------------------------------------------------------------------------------
#' Annotate the Footprint Data
#'
#' Annotate the footprint and FIMO dataframe by adding TF class, GC content, and TSS distance
#'
#' @param fp.df The data frame of FIMO motifs and footprints
#' @param chipseq.hits The dataframe of ChIPseq hits corresponding to the dataset
#'
#' @return The supplied data frame with added columns for the one-hot coded TF class data, GC
#' content, and asinh-transformed distance from the nearest transcription start site
#'
#' @export
annotateFootprintData <- function(fp.df, chipseq.hits, host = "localhost", port = "5432"){
# Create the TF-motif map using the function from "constructMotifDataset"
TFs.to.motifs <- mapTFsToMotifs(chipseq.hits)
# Get the motif data
motif_class_hot <- createMotifClassMap(fp.df, TFs.to.motifs)
# Add the motif class to the dataframe
annotated.df <- dplyr::left_join(fp.df,
motif_class_hot, by = "motifname")
# Add GC Content
annotated.df <- annotated.df %>%
dplyr::mutate("gc_content" = getGCContent(start,endpos,chrom))
# Add TSS distance
annotated.df <- addTSSDistance(annotated.df, host, port)
# Change counts to fracs
annotated.df %>%
dplyr::mutate(h_frac = h_count/max(h_count)) %>%
dplyr::mutate(w_frac = w_count/max(w_count)) %>%
dplyr::select(-one_of("h_count","w_count")) %>%
dplyr::select(motifname:w_percent_overlap,
h_frac, w_frac, gc_content,
asinh_tss_dist,
dplyr::everything()) ->
annotated.df
return(annotated.df)
} # annotateFootprintData
#----------------------------------------------------------------------------------------------------
#' Create Motif Class Map
#'
#' Create the one-hot form of the JASPAR motifs and their classes
#'
#' @param fp.df The data frame of FIMO motifs and footprints. It is used for filtering out only those
#' motifs that appear in the data frame.
#'
#' @return The one-hot data frame of motifs and their membership in different classes
#'
#' @export
createMotifClassMap <- function(fp.df, TFs.to.motifs){
# Load data on motif mapping
load(system.file(package="FPML", "extdata/motif_class_pairs.Rdata"))
# Make Jaspar translation table
jaspar.motifs <- MotifDb::subset(MotifDb::MotifDb, dataSource == "jaspar2016")
jaspar.df <- dplyr::data_frame(Long.Name = names(jaspar.motifs),
Short.Name = trimws(S4Vectors::values(jaspar.motifs)$providerName)
)
# Add long motif names to the Jaspar data
fixed.motif.class <- motif.class %>%
dplyr::left_join(jaspar.df, by = c("motif" = "Short.Name")) %>%
dplyr::select("motifname" = "Long.Name", class)
# Filter the motif class using the dataframe
filtered.motif.class <- dplyr::semi_join(fixed.motif.class,
fp.df,
by = "motifname")
# Make it into a one-hot form
filtered.motif.class %>%
# clean up and subset to only relevant motifs
dplyr::mutate_all(stringr::str_trim) %>%
# fix double classes
dplyr::mutate(class = stringr::str_split(class, "::")) %>%
tidyr::unnest(class) %>%
# create one-hot(ish, some double matches) version
dplyr::mutate(dummy_yesno = 1) %>%
dplyr::distinct() %>%
tidyr::spread(class, dummy_yesno, fill = 0) ->
motif_class_hot
return(motif_class_hot)
} # createMotifClassMap
#----------------------------------------------------------------------------------------------------
#' Add GC content
#'
#' Add GC content to a row of the data frame containing FIMO motifs and footprint data
#'
#' @param start_col The name of the column containing the starting location of each motif
#' @param end_col The name of the column containing the ending location of each motif
#' @param chrom_col The name of the column containing the chromosome for each motif
#' @param shoulder An integer indicating the distance on either side of the center of the sequence
#' to use as a window (default = 100)
#'
#' @return The GC content of the supplied region
#'
#' @export
getGCContent <- function(start_col, end_col, chrom_col, shoulder=100) {
window_center <- round((start_col + end_col)/2)
windows <- Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,
paste0("chr",chrom_col),
window_center-shoulder,
window_center+shoulder)
alph_freq <- Biostrings::alphabetFrequency(windows)
gc_content <- rowSums(alph_freq[,c("C","G")])/(2*shoulder+1)
return(gc_content)
} # getGCContent
#----------------------------------------------------------------------------------------------------
#' Add Transcription Start Site Distance
#'
#' Add transcription start site distance to a dataframe of FIMO motifs and footprints data
#'
#' @param annotated.df A dataframe containing FIMO motifs, ChIPSeq data, and footprints data
#' @param host A string indicating the location of the hg38 database (default = "localhost")
#' @param port A string indicating the port for the hg38 database (default = "5432")
#'
#' @return The dataframe with an added column containing the asinh-transformed distance from each
#' motif to the nearest transcription start site
#'
#' @export
addTSSDistance <- function(annotated.df, host = "localhost", port = "5432"){
db_gtf <- DBI::dbConnect(RPostgreSQL::PostgreSQL(),
user = "trena",
password = "trena",
dbname = "hg38",
host = host,
port = port)
query <- "select * from gtf where moleculetype='gene' and gene_biotype='protein_coding'"
tss_raw_table <- DBI::dbGetQuery(db_gtf,
query)[, c("chr", "gene_name", "start", "endpos","strand")]
tss_raw_table %>%
dplyr::mutate(ref = ifelse(strand == '+', start, endpos)) %>%
dplyr::select("chrom" = "chr", "ts_start" = "ref") %>%
dplyr::filter(chrom != 'chrMT') %>%
dplyr::mutate(chrom=stringr::str_sub(chrom, start = 4)) ->
tss_tbl
motif_gr <- GenomicRanges::makeGRangesFromDataFrame(annotated.df,
start.field="start",
end.field="endpos")
tss_gr <- GenomicRanges::makeGRangesFromDataFrame(tss_tbl,
start.field="ts_start",
end.field="ts_start")
dist_to_nearest_tss <- GenomicRanges::distanceToNearest(motif_gr,
tss_gr,
select="arbitrary")
tss_dists <- S4Vectors::mcols(dist_to_nearest_tss)[,1]
annotated.df %>%
dplyr::mutate(asinh_tss_dist = asinh(tss_dists)) ->
df.with.tss.dist
return(df.with.tss.dist)
} # addTSSDistance
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.