R/utilities.R

Defines functions clean_text remove_duplicates get_GC get_lengths get_chromosomes get_chr_percentage get_count_matrix

Documented in clean_text get_chromosomes get_chr_percentage get_count_matrix get_lengths remove_duplicates

#' Clean text
#'
#' Remove erroneous text characters.
#'
#' @param text character vector
#' @param chars character vector of characters to remove from text
#' @param remove_empty remove empty elements from the vector.
#' @param remove_dup remove duplicated elements from the vector.
#' @param upper_case convert to upper case
#' @return A character vector
#' @examples
#' clean_text("\thello_world")
#' clean_text("\thello_world", upper_case = FALSE)

clean_text <- function(text,
                       chars = c("\t", "\r", "\n", "\\", "\ ", "\""),
                       remove_empty = TRUE,
                       remove_dup = TRUE,
                       upper_case = TRUE
  ) {
      char_pattern <- paste(chars, collapse = "|")
      char_pattern <- paste0("[", char_pattern, "]")
      cleaned <- gsub(pattern = char_pattern, replacement = "", x = as.character(text))
      if (remove_dup) cleaned <- remove_duplicates(cleaned)
      if (upper_case) cleaned <- toupper(cleaned)
      ifelse(remove_empty, return(cleaned[sapply(cleaned, nchar) > 0]), return(cleaned))
}

#' remove_duplicates
#'
#' remove duplicates from vector or data frame
#'
#' @keywords internal
#'
#' @param x vector or data frame to remove duplicates from
#' @param column If a data frame is passed in, the column that is used
#' for deduplicating (default = 1)
#'
#' @return A character vector or data frame.
#' @examples
#' hello_string <- c("h", "e", "l", "l", "o")
#' hello_string
#' remove_duplicates(hello_string)
#'
#' df <- data.frame(hello_string, 1:5)
#' df
#' remove_duplicates(df)
remove_duplicates <- function(x, column = 1) {
  if (is.data.frame(x) | is.matrix(x)) {
    return(x[!duplicated(x[,column]), ])
  } else if (is.list(x)) {
    stop("x must be a vector, data frame or matrix")
  } else if (is.vector(x)) {
    return(unique(x))
  } else{
    warning("data must be a vector, data frame or matrix")
  }
}



#' Get GC content of genes.
#'
#' Looks up gene names from a dataframe and returns the GC content.
#' This will only work if there is a column containing the GC content, so if a gtf
#' file has been processed using the process_gtf function, it probably won't contain
#' the GC content as the gtf file doesn't contain this information.
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_gtf}},
#' containing genes and associated information including length of genes

#' @return A data frame containing the genes along with their GC content
#'  \url{http://rstudio.com} some more text \link{process_gtf}
#' @keywords internal
#' @examples
#' get_GC()
get_GC <- function(query_names, gene_info, name_column = NULL, GC_column = NULL) {

  query_names <- toupper(query_names)

  if (!is.null(name_column) & !is.null(GC_column)) {
    return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), GC_column]))
  } else if (!is.null(gene_info$gene_name) & !is.null(gene_info$GC_content)) {
    return(na.omit(gene_info$GC_content[match(query_names, toupper(gene_info$gene_name))]))
  } else if (is.null(name_column) | is.null(GC_column)) {
    stop("If the gene info dataset does not contain columns named 'gene_name' and 'GC_content'
         then the column numbers must be supplied using the name_column and length_column arguments.")
  } else warning("something unexpected here")
}



#' Get lengths of genes.
#'
#' Looks up gene names from a dataframe and returns the lengths
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_GMT}},
#' containing genes and associated information including length of genes

#' @return A data frame containing the gene sets of interest along with information on
#' gene lengths etc. \url{http://rstudio.com} some more text \link{process_GMT}
#' @keywords internal
#' @examples
#' get_lengths()
get_lengths <- function(query_names, gene_info, name_column = NULL, length_column = NULL) {

  query_names <- toupper(query_names)

  if (!is.null(name_column) & !is.null(length_column)) {
    return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), length_column]))
  } else if (!is.null(gene_info$gene_name) & !is.null(gene_info$length)) {
    return(na.omit(gene_info$length[match(query_names, toupper(gene_info$gene_name))]))
  } else if (is.null(name_column) | is.null(length_column)) {
    stop("If the gene info dataset does not contain columns named 'gene_name' and 'length'
         then the column numbers must be supplied using the name_column and length_column arguments.")
  } else warning("something unexpected here")
}

#' get_chromosomes.
#'
#' Looks up gene names from a dataframe and returns the chromosome on which the gene is located
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_GMT}},
#' containing genes and associated information including location of genes

#' @return A data frame containing the gene sets of interest along with information on
#' chrs etc. \url{http://rstudio.com} some more text \link{process_GMT}
#' @keywords internal
#' @examples
#' get_chr()
get_chromosomes <- function(query_names, gene_info, name_column = NULL, chr_column = NULL) {
  query_names <- toupper(query_names)

  if (!is.null(name_column) & !is.null(chr_column)) {
    return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), chr_column]))
  } else if (!is.null(gene_info$gene_name) & !is.null(gene_info$chr)) {
    return(na.omit(gene_info$chr[match(query_names, toupper(gene_info$gene_name))]))
  } else if (is.null(name_column) | is.null(chr_column)) {
    stop("If the gene info dataset does not contain columns named 'gene_name' and 'chr'
         then the column numbers must be supplied using the name_column and chr_column arguments.")
  } else warning("something unexpected here")
}


#' get_chr_percentage
#'
#' takes a named list (e.g. query and background) containing vectors of
#' chromosome names
#'
#' @param chr_list named list containing character vectors of chromosome names
get_chr_percentage <- function(chr_list) {

  if (any(names(chr_list) == "")) stop("all components of the list must be named")

  count_list <- lapply(chr_list, FUN = plyr::count)

  # get all the chromosome names from the lists
  all_chr <- levels(unlist(sapply(count_list, `[[`, 1)))
  # order them numericlly
  all_chr <- all_chr[order(as.integer(all_chr))]

  # set up the data frame
  df <- data.frame(chr = all_chr)

  # calculate the percentages
  chr_proportions <- lapply(count_list, function(x) {
    proportions <- x$freq/sum(x$freq)
    names(proportions) <- x$x
    proportions
  })

  # match it all up
  for (i in 1:length(chr_proportions)) {
    x <- chr_proportions[[i]]
    col_name <- names(chr_proportions)[i]
    df[,col_name] <- x[match(all_chr, names(x))]
  }

  # replace NAs with 0s
  df[is.na(df)] <- 0

  # convert to matrix for the bar plot
  chr_proportion_matrix <- as.matrix(df[,2:ncol(df)])
  rownames(chr_proportion_matrix) <- df[,1]

  chr_proportion_matrix*100
}






#' get_count_matrix.
#'
#' create a count matrix from a list
#'
#' @param query_names character vector of gene names
#' @param gene_info data frame, probably generated by \code{\link{process_GMT}},
#' containing genes and associated information including location of genes

#' @return A data frame containing the gene sets of interest along with information on
#' chrs etc. \url{http://rstudio.com} some more text \link{process_GMT}
#' @keywords internal
#' @examples
#' get_chr()
get_count_matrix <- function(x, name_column = NULL, chr_column = NULL) {
  query_names <- toupper(query_names)

  if (!is.null(name_column) & !is.null(chr_column)) {
    return(na.omit(gene_info[match(query_names, toupper(gene_info[, name_column])), chr_column]))
  } else if (!is.null(gene_info$gene_name) & !is.null(gene_info$chr)) {
    return(na.omit(gene_info$chr[match(query_names, toupper(gene_info$gene_name))]))
  } else if (is.null(name_column) | is.null(chr_column)) {
    stop("If the gene info dataset does not contain columns named 'gene_name' and 'chr'
         then the column numbers must be supplied using the name_column and chr_column arguments.")
  } else warning("something unexpected here")
}
laurabiggins/GOcategoryStats documentation built on Oct. 27, 2019, 11:36 a.m.