#' Clean text
#'
#' Remove erroneous text characters.
#'
#' @param text character vector
#' @param chars character vector of characters to remove from text
#' @param remove_empty remove empty elements from the vector.
#' @param remove_dup remove duplicated elements from the vector.
#' @param upper_case convert to upper case
#' @return A character vector
#' @examples
#' clean_text("\thello_world")
#' clean_text("\thello_world", upper_case = FALSE)
clean_text <- function(text,
chars = c("\t", "\r", "\n", "\\", "\ ", "\""),
remove_empty = TRUE,
remove_dup = TRUE,
upper_case = TRUE
) {
char_pattern <- paste(chars, collapse = "|")
char_pattern <- paste0("[", char_pattern, "]")
cleaned <- gsub(pattern = char_pattern, replacement = "", x = as.character(text))
if (remove_dup) cleaned <- remove_duplicates(cleaned)
if (upper_case) cleaned <- toupper(cleaned)
ifelse(remove_empty, return(cleaned[sapply(cleaned, nchar) > 0]), return(cleaned))
}
#' remove_duplicates
#'
#' remove duplicates from vector or data frame
#'
#' @keywords internal
#'
#' @param x vector or data frame to remove duplicates from
#' @param column If a data frame is passed in, the column that is used
#' for deduplicating (default = 1)
#'
#' @return A character vector or data frame.
#' @examples
#' hello_string <- c("h", "e", "l", "l", "o")
#' hello_string
#' remove_duplicates(hello_string)
#'
#' df <- data.frame(hello_string, 1:5)
#' df
#' remove_duplicates(df)
remove_duplicates <- function(x, column = 1) {
if (is.data.frame(x) | is.matrix(x)) {
return(x[!duplicated(x[,column]), ])
} else if (is.list(x)) {
stop("x must be a vector, data frame or matrix")
} else if (is.vector(x)) {
return(unique(x))
} else{
warning("data must be a vector, data frame or matrix")
}
}
#' Get GC content of genes.
#'
#' Looks up gene names from a dataframe and returns the GC content.
#' This will only work if there is a column containing the GC content, so if a gtf
#' file has been processed using the process_gtf function, it probably won't contain
#' the GC content as the gtf file doesn't contain this information.
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_gtf}},
#' containing genes and associated information including length of genes
#' @return A data frame containing the genes along with their GC content
#' \url{http://rstudio.com} some more text \link{process_gtf}
#' @keywords internal
#' @examples
#' get_GC()
get_GC <- function(query_names, gene_info, name_column = NULL, GC_column = NULL) {
query_names <- toupper(query_names)
if (!is.null(name_column) & !is.null(GC_column)) {
return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), GC_column]))
} else if (!is.null(gene_info$gene_name) & !is.null(gene_info$GC_content)) {
return(na.omit(gene_info$GC_content[match(query_names, toupper(gene_info$gene_name))]))
} else if (is.null(name_column) | is.null(GC_column)) {
stop("If the gene info dataset does not contain columns named 'gene_name' and 'GC_content'
then the column numbers must be supplied using the name_column and length_column arguments.")
} else warning("something unexpected here")
}
#' Get lengths of genes.
#'
#' Looks up gene names from a dataframe and returns the lengths
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_GMT}},
#' containing genes and associated information including length of genes
#' @return A data frame containing the gene sets of interest along with information on
#' gene lengths etc. \url{http://rstudio.com} some more text \link{process_GMT}
#' @keywords internal
#' @examples
#' get_lengths()
get_lengths <- function(query_names, gene_info, name_column = NULL, length_column = NULL) {
query_names <- toupper(query_names)
if (!is.null(name_column) & !is.null(length_column)) {
return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), length_column]))
} else if (!is.null(gene_info$gene_name) & !is.null(gene_info$length)) {
return(na.omit(gene_info$length[match(query_names, toupper(gene_info$gene_name))]))
} else if (is.null(name_column) | is.null(length_column)) {
stop("If the gene info dataset does not contain columns named 'gene_name' and 'length'
then the column numbers must be supplied using the name_column and length_column arguments.")
} else warning("something unexpected here")
}
#' get_chromosomes.
#'
#' Looks up gene names from a dataframe and returns the chromosome on which the gene is located
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_GMT}},
#' containing genes and associated information including location of genes
#' @return A data frame containing the gene sets of interest along with information on
#' chrs etc. \url{http://rstudio.com} some more text \link{process_GMT}
#' @keywords internal
#' @examples
#' get_chr()
get_chromosomes <- function(query_names, gene_info, name_column = NULL, chr_column = NULL) {
query_names <- toupper(query_names)
if (!is.null(name_column) & !is.null(chr_column)) {
return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), chr_column]))
} else if (!is.null(gene_info$gene_name) & !is.null(gene_info$chr)) {
return(na.omit(gene_info$chr[match(query_names, toupper(gene_info$gene_name))]))
} else if (is.null(name_column) | is.null(chr_column)) {
stop("If the gene info dataset does not contain columns named 'gene_name' and 'chr'
then the column numbers must be supplied using the name_column and chr_column arguments.")
} else warning("something unexpected here")
}
#' get_chr_percentage
#'
#' takes a named list (e.g. query and background) containing vectors of
#' chromosome names
#'
#' @param chr_list named list containing character vectors of chromosome names
get_chr_percentage <- function(chr_list) {
if (any(names(chr_list) == "")) stop("all components of the list must be named")
count_list <- lapply(chr_list, FUN = plyr::count)
# get all the chromosome names from the lists
all_chr <- levels(unlist(sapply(count_list, `[[`, 1)))
# order them numericlly
all_chr <- all_chr[order(as.integer(all_chr))]
# set up the data frame
df <- data.frame(chr = all_chr)
# calculate the percentages
chr_proportions <- lapply(count_list, function(x) {
proportions <- x$freq/sum(x$freq)
names(proportions) <- x$x
proportions
})
# match it all up
for (i in 1:length(chr_proportions)) {
x <- chr_proportions[[i]]
col_name <- names(chr_proportions)[i]
df[,col_name] <- x[match(all_chr, names(x))]
}
# replace NAs with 0s
df[is.na(df)] <- 0
# convert to matrix for the bar plot
chr_proportion_matrix <- as.matrix(df[,2:ncol(df)])
rownames(chr_proportion_matrix) <- df[,1]
chr_proportion_matrix*100
}
#' get_count_matrix.
#'
#' create a count matrix from a list
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_GMT}},
#' containing genes and associated information including location of genes
#' @return A data frame containing the gene sets of interest along with information on
#' chrs etc. \url{http://rstudio.com} some more text \link{process_GMT}
#' @keywords internal
#' @examples
#' get_chr()
get_count_matrix <- function(x, name_column = NULL, chr_column = NULL) {
query_names <- toupper(query_names)
if (!is.null(name_column) & !is.null(chr_column)) {
return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), chr_column]))
} else if (!is.null(gene_info$gene_name) & !is.null(gene_info$chr)) {
return(na.omit(gene_info$chr[match(query_names, toupper(gene_info$gene_name))]))
} else if (is.null(name_column) | is.null(chr_column)) {
stop("If the gene info dataset does not contain columns named 'gene_name' and 'chr'
then the column numbers must be supplied using the name_column and chr_column arguments.")
} else warning("something unexpected here")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.