##################################################################################
## generate experiment data
#' Create sample information dataframe
#'
#' This is a utility function to prepare the sample information dataframe for sample
#' of interest using experiment information file. Experiment information file should
#' have columns: \code{sampleId, IP_tag, control, peakType}. \cr\cr
#' \strong{Possible values for compulsory columns and how they are handled:}
#' \itemize{
#' \item \strong{IP_tag:} One of \code{c("HA", "FLAG", "HIS", "MYC", "TAP", "TF", "polII")}.
#' This column value is used to determine whether the ChIPseq data is for polII ChIPseq
#' or transcription factor ChIPseq. If polII ChIPseq, \code{polIIExpFile, polIIExpMat}
#' columns are populated with appropriate file names for polII data.
#' \item \strong{peakType:} One of \code{c("narrowPeak", "broadPeak")}. This column value is used
#' to decide the macs2 peak output file name (i.e. narrowPeak or broadPeak).
#' \item \strong{control:} This column value is used to decide the suffix of macs2 output
#' files. If the value is blank or ".", \emph{withoutCtrl} is used in macs2 peak realated
#' file names and \emph{withCtrl} otherwise.
#' }
#'
#' @param exptInfoFile A tabular file with details for each sample. "sampleId" and "IP_tag" columns are must
#' @param dataPath Path where the data is stored
#' @param profileMatrixSuffix A character string to be used in the file name of profile
#' matrix file. Generating profile matrix takes long time and hence it is efficient to save
#' the profile matrix. If profile matrix is saved, this suffix is used to design the name of
#' profile matrix file. This will generate a file name "sample_ID.normalizedMatrix.tab.gz".
#' Different intutuve suffixes can be used to store profile matrix like
#' \emph{TSS_4kb_2kb_normalized, TSS_3kb_3kb_normalized, normalizedmatrix_5kb}, etc.
#' Default: \emph{normalizedmatrix}.
#' @param samples A vector of Sample IDs which are to be processed. Default: All samples are used
#' @param profileType Type of profile. This will be added as suffix to the profile name
#'
#' @return Data frame with columns \code{"sampleId", "IP_tag", "control", "peakType",
#' "sampleName", "profileName", "bwFile", "matFile", "clusterFile", "mergedDataFile",
#' "polIIExpFile", "polIIExpMat", "peakFile", "peakAnno", "peakTargetFile", "FE_bwFile"}.
#' @export
#'
#' @examples NA
get_sample_information <- function(exptInfoFile, samples = NULL, dataPath,
profileMatrixSuffix = "normalizedmatrix",
profileType = "profile"){
tfChipTags <- c("HA", "FLAG", "HIS", "MYC", "TAP", "TF")
## read the experiment sample details and select only those which are to be plotted
exptData <- suppressMessages(readr::read_tsv(file = exptInfoFile))
if(!is.null(samples)){
exptData <- dplyr::left_join(x = data.frame(sampleId = samples, stringsAsFactors = F),
y = exptData, by = "sampleId")
}
if (nrow(exptData) == 0) {
return(NULL)
}
exptData$sampleId <- factor(exptData$sampleId, levels = unique(exptData$sampleId))
if( !any("sampleName" %in% colnames(exptData)) ){
exptData$sampleName <- exptData$sampleId
}
if( !any("peakType" %in% colnames(exptData)) ){
warning("peakType column not found in sample information. Using \"narrowPeak\" for all samples")
exptData$peakType <- "narrowPeak"
}
if( !any("control" %in% colnames(exptData)) ){
exptData$control <- "."
}
exptData <- exptData[order(exptData$sampleId), ] %>%
dplyr::mutate_if(is.factor, as.character) %>%
dplyr::mutate(
IP_tag = toupper(IP_tag),
hasControl = if_else(condition = is.na(control) | control == ".",
true = FALSE, false = TRUE, missing = FALSE)
) %>%
## generic data
dplyr::mutate(
profileName = paste(sampleId, profileType, sep = "_"),
bwFile = paste(dataPath, "/", sampleId, "/", sampleId, "_normalized.bw", sep = ""),
matFile = paste(dataPath, "/", sampleId, "/", sampleId, ".",
profileMatrixSuffix,".tab.gz", sep = ""),
clusterFile = dplyr::if_else(
toupper(IP_tag) == "POLII",
"NA",
paste(dataPath, "/", sampleId, "/", sampleId, ".kmeans.clusters.txt", sep = "")
),
mergedDataFile = dplyr::if_else(
toupper(IP_tag) == "POLII",
"NA",
paste(dataPath, "/", sampleId, "/", sampleId, ".allGenes_clusters.tab", sep = "")
)
) %>%
## polII ChIPseq related data
dplyr::mutate(
polIIExpFile = dplyr::if_else(
toupper(IP_tag) == "POLII",
paste(dataPath, "/", sampleId, "/", sampleId, ".normalizedExpression.tab", sep = ""),
"NA"
),
polIIExpMat = dplyr::if_else(
toupper(IP_tag) == "POLII",
paste(dataPath, "/", sampleId, "/", sampleId, "_polii_expr.tab.rel.mat", sep = ""),
"NA"
)
) %>%
## TF ChIP related data
dplyr::mutate(
peakFile = dplyr::case_when(
IP_tag %in% !!tfChipTags & hasControl & peakType == "narrowPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withCtrl_peaks.narrowPeak", sep = ""),
IP_tag %in% !!tfChipTags & !hasControl & peakType == "narrowPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withoutCtrl_peaks.narrowPeak", sep = ""),
IP_tag %in% !!tfChipTags & hasControl & peakType == "broadPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withCtrl_peaks.broadPeak", sep = ""),
IP_tag %in% !!tfChipTags & !hasControl & peakType == "broad" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withoutCtrl_peaks.broadPeak", sep = "")
),
peakAnno = dplyr::case_when(
IP_tag %in% !!tfChipTags & hasControl & peakType == "narrowPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withCtrl.narrowPeak.annotation.tab", sep = ""),
IP_tag %in% !!tfChipTags & !hasControl & peakType == "narrowPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withoutCtrl.narrowPeak.annotation.tab", sep = ""),
IP_tag %in% !!tfChipTags & hasControl & peakType == "broadPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withCtrl.broadPeak.annotation.tab", sep = ""),
IP_tag %in% !!tfChipTags & !hasControl & peakType == "broadPeak" ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withoutCtrl.broadPeak.annotation.tab", sep = "")
),
peakTargetFile = dplyr::case_when(
IP_tag %in% !!tfChipTags & hasControl ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withCtrl_peaks.targets.tab", sep = ""),
IP_tag %in% !!tfChipTags & !hasControl ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withoutCtrl_peaks.targets.tab", sep = ""),
TRUE ~ "NA"
),
FE_bwFile = dplyr::case_when(
IP_tag %in% !!tfChipTags & hasControl ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withCtrl.FE.bw", sep = ""),
IP_tag %in% !!tfChipTags & !hasControl ~
paste(dataPath, "/", sampleId, "/", sampleId, ".withoutCtrl.FE.bw", sep = ""),
TRUE ~ "NA"
)
) %>%
dplyr::mutate_at(
.vars = c("polIIExpFile", "polIIExpMat", "clusterFile", "peakTargetFile", "mergedDataFile",
"peakFile", "peakAnno"),
.funs = list(~ dplyr::na_if(., "NA"))
) %>%
dplyr::select(-hasControl)
return(exptData)
}
##################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.