R/CpG_summary.R

Defines functions CpG_summary

Documented in CpG_summary

#' @title CpG_summary
#'
#' @description CpG_summary function provides information on genes with CpG
#'   islands and GC content. The function checks genes against known CpG
#'   islands and provides various plots to assess emerging data features. The
#'   user can also specify if the plotting is necessary for location
#'   ("location") or protein class ("class"). Only genes with GC data are
#'   plotted.
#'
#' @details Reference tables (CpG islands, protein classes, subcellular
#'   locations) are downloaded at call time, so an internet connection is
#'   required. Genes are matched to the reference tables by symbol; when a
#'   symbol occurs several times in a reference table the first occurrence is
#'   used. Score columns that are absent from \code{data} are skipped with a
#'   warning instead of aborting the whole call.
#'
#' @param data Requires a data frame generated by score_genes; must contain a
#'   "Symbol" column. Class - data frame
#' @param type Requires to specify if plotting is performed for location or
#'   class types; default is "class". Alternatively, select "location".
#'   Class - string
#' @return multiple plots (class - plots) and the input data frame extended
#'   with the columns "CpG", "GC_content" and, depending on \code{type},
#'   "Class" or "Location" (class - data frame)
#' @importFrom RCurl getURL
#' @importFrom lattice densityplot
#' @importFrom lattice histogram
#' @importFrom ggExtra ggMarginal
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 element_text
#' @importFrom ggplot2 geom_col
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 labs
#' @importFrom RColorBrewer brewer.pal
#' @import methods
#' @import utils
#' @examples
#'  \dontrun{
#' path_to_test_data<- system.file("extdata", "test_data.tabular", package="OmicInt")
#' # basic usage of CpG_summary
#' df<-utils::read.table(path_to_test_data)
#' return_df<-CpG_summary(df)
#' head(return_df)
#' }
#' @export
CpG_summary <- function(data, type = "class") {

  # Validate arguments early so that nothing is downloaded for a bad call.
  type <- match.arg(type, c("class", "location"))
  if (!is.data.frame(data) || !"Symbol" %in% names(data)) {
    stop("`data` must be a data frame with a \"Symbol\" column.",
         call. = FALSE)
  }

  # Download a CSV reference table and parse it into a data frame.
  read_remote_csv <- function(url) {
    utils::read.csv(text = RCurl::getURL(url))
  }

  # Look up `keys` in `ref_keys` and return the aligned `ref_values`;
  # keys without a hit (or with a missing value) are reported as "NA".
  annotate <- function(keys, ref_keys, ref_values) {
    hits <- as.character(ref_values[match(keys, ref_keys)])
    hits[is.na(hits)] <- "NA"
    hits
  }

  # CpG data preparation
  cpg <- read_remote_csv(
    "https://gitlab.com/Algorithm379/databases/-/raw/main/CpG_data.csv"
  )
  cpg$region <- paste0(
    "chr", cpg$chromosome_name, ":", cpg$start_position, ":", cpg$end_position
  )

  # match() aligns each gene with its own reference row; a positional
  # ifelse() would hand out values belonging to unrelated rows.
  cpg_row <- match(data$"Symbol", cpg$"hgnc_symbol")
  data$"CpG" <- annotate(data$"Symbol", cpg$"hgnc_symbol", cpg$"region")
  data$"GC_content" <- cpg$"percentage_gene_gc_content"[cpg_row]

  # Grouping annotation (protein class or subcellular location)
  if (type == "class") {
    group_col <- "Class"
    classes <- read_remote_csv(
      "https://gitlab.com/Algorithm379/databases/-/raw/main/HS_protein_classes_curated.csv"
    )
    data$"Class" <- annotate(data$"Symbol", classes$"Gene", classes$"Class")
    hist_title <- "GC% distribution across protein classes"
    dens_title <- "CpG island gene distributions based on protein class"
    score_cols <- c("log2FoldChange", "LFCscore", "Specificity_score",
                    "Association_score", "Interactors")
  } else {
    group_col <- "Location"
    location_df <- read_remote_csv(
      "https://gitlab.com/Algorithm379/databases/-/raw/main/Subcellular.locationmerged_protein_data.csv"
    )
    data$"Location" <- annotate(data$"Symbol", location_df$"Symbol",
                                location_df$"Subcellular.location")
    hist_title <- "GC% distribution across protein cellular locations"
    dens_title <- "CpG island gene distributions based on protein location"
    score_cols <- c("Interactors", "LFCscore", "log2FoldChange",
                    "Association_score", "Specificity_score")
  }

  # Only genes with GC data are plotted. Subsetting after annotation keeps
  # every column row-aligned.
  df <- data[!is.na(data$"GC_content"), , drop = FALSE]
  if (nrow(df) == 0) {
    warning("No gene in `data` has GC content information; nothing to plot.",
            call. = FALSE)
    return(data)
  }

  # Local copies of the plotted variables: formulas and aes() resolve these
  # names, which avoids conflicts with columns or global objects.
  gene <- df$"Symbol"
  GC <- df$"GC_content"
  Group <- df[[group_col]]
  plot_df <- data.frame(gene = gene, GC = GC, Group = Group,
                        stringsAsFactors = FALSE)

  # Prepare colour palette: all qualitative Brewer colours (74 in total),
  # cut or recycled to the number of groups that are actually plotted.
  pal_info <- RColorBrewer::brewer.pal.info
  qual_col_pals <- pal_info[pal_info$"category" %in% "qual", ]
  col_vector <- unlist(
    Map(RColorBrewer::brewer.pal, qual_col_pals$"maxcolors",
        rownames(qual_col_pals)),
    use.names = FALSE
  )
  col_vector <- rep_len(col_vector, nlevels(factor(Group)))

  # lattice and ggExtra objects are only drawn when printed, and there is no
  # auto-printing inside a function, so every plot is printed explicitly.
  print(lattice::histogram(~GC,
                           type = "percent",
                           xlab = "GC%",
                           main = "GC% distribution"))

  print(lattice::histogram(~GC | Group,
                           type = "percent",
                           xlab = "GC%",
                           main = hist_title))

  print(lattice::densityplot(
    ~GC,
    groups = Group,
    plot.points = FALSE,
    auto.key = TRUE,
    xlab = "GC_content",
    par.settings = list(superpose.line = list(col = col_vector)),
    main = dens_title
  ))

  # Per-gene GC bar chart (class view only)
  if (type == "class") {
    bars <- ggplot2::ggplot(plot_df) +
      ggplot2::geom_col(ggplot2::aes(x = gene, y = GC, fill = Group)) +
      ggplot2::labs(fill = group_col) +
      ggplot2::theme(
        axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)
      )
    print(bars)
  }

  # GC content against each score, with marginal density/histogram panels
  for (score_col in score_cols) {
    if (!score_col %in% names(df)) {
      warning("Column \"", score_col, "\" not found in `data`; plot skipped.",
              call. = FALSE)
      next
    }
    Score <- df[[score_col]]
    plot_df$Score <- Score
    scatter <- ggplot2::ggplot(
      plot_df, ggplot2::aes(x = Score, y = GC, color = Group)
    ) +
      ggplot2::geom_point() +
      ggplot2::labs(x = score_col, color = group_col)
    print(ggExtra::ggMarginal(scatter, type = "densigram",
                              size = 3, fill = "lightblue"))
  }

  data
}

Try the OmicInt package in your browser

Any scripts or data that you put into this service are public.

OmicInt documentation built on Oct. 28, 2021, 5:09 p.m.