R/clustersFeatures.R

Defines functions clustersFeatures

Documented in clustersFeatures

#' @title A function to extract priority lists of genes from ANOVAlike output
#' @description This function extracts clusters specific gene lists from ANOVAlike output. IMPORTANT: This analysis works only on up modulated genes, i.e. genes more expressed in the clusters under analysis with respect to the reference cluster
#' @param group, a character string. Two options: sudo or docker, depending to which group the user belongs
#' @param fileLogFC, a character string indicating the absolute path to logFC_filtered_DE_ file generated by anovaLike function
#' @param fileCounts, a character string indicating the absolute path to reordered counts table file generated by anovaLike function
#' @param sep, separator used in count file, e.g. '\\t', ','
#' @param delta, the minimal distance between the max value of FC for a gene in a cluster of interest and the nearest other max FC in any of the other clusters. This value define the minimal distance with respect to the same gene in an other cluster to identify it as a cluster specific gene.
#' @author Raffaele Calogero, raffaele.calogero [at] unito [dot] it, University of Torino
#' @return  Three tab delimited files with prefix onlyUP\_, followed by the counts table name, i.e. the count table only containing the selected genes. onlyUP\_, followed by logFC_filtered_DE_,  the table containing logFC only for the selected genes, onlyUP_clusters_specific_genes.txt, which contains the list of specific genes associated with the corresponding cluster.
#'
#' @examples
#' \dontrun{
#' system("wget http://130.192.119.59/public/clusters.features.zip")
#' unzip("clusters.features.zip")
#' setwd("clusters.features")
#'
#'     clustersFeatures(group="docker", fileLogFC=paste(getwd(),
#'        "logFC_filtered_DE_annotated_setPace_10000_noC5_reordered.txt",sep="/"),
#'         fileCounts=paste(getwd(),"annotated_setPace_10000_noC5_reordered.txt",sep="/"), delta=0.5, sep="\t")
#' }
#'
#' @export
clustersFeatures <- function(group=c("sudo","docker"),fileLogFC,fileCounts, delta=0.5,sep){



data.folder=dirname(fileCounts)
positions=length(strsplit(basename(fileCounts),"\\.")[[1]])
matrixNameC=strsplit(basename(fileCounts),"\\.")[[1]]
counts.table=paste(matrixNameC[seq(1,positions-1)],collapse="")
  matrixName=counts.table
file.type=strsplit(basename(basename(fileCounts)),"\\.")[[1]][positions]

counts.table=matrixName




positions=length(strsplit(basename(fileLogFC),"\\.")[[1]])
matrixNameC=strsplit(basename(fileLogFC),"\\.")[[1]]
counts.table=paste(matrixNameC[seq(1,positions-1)],collapse="")
  matrixName=counts.table

logFC.table=matrixName
  #storing the position of the home folder
  home <- getwd()

  #running time 1
  ptm <- proc.time()
  #setting the data.folder as working folder
  if (!file.exists(data.folder)){
    cat(paste("\nIt seems that the ",data.folder, " folder does not exist\n"))
    return(2)
  }

  setwd(data.folder)

  #initialize status
  system("echo 0 > ExitStatusFile 2>&1")

  de <- read.table(paste(logFC.table,".",file.type,sep=""), sep=sep, header=T, row.names = 1)
  selected.set <- NULL
  for(i in 1:dim(de)[2]){
       selected.set <- c(selected.set, which(de[,i] >= 1))
  }
  de.selected <- de[unique(selected.set),]

  #selecting the top candidates for separation between clusters
  # delta in de minimal distance between the max in cluster of interest and the nearest other max removing the cluster of interest
  .clusterGenes <- function(de.up, delta){
    max.de <- apply(de.up, 1, max)
    specific <- list()
    for(i in 1:dim(de.up)[2]){
      tmp <- as.numeric(de.up[,i] - max.de)#if column i has the max the delata with max is 0
      de.tmp0 <- de.up[which(tmp==0),]#selecting the data set for column i with max in that column
      max0.de <- apply(de.tmp0, 1, max)#max in de.tmp0
      de.tmp <- de.tmp0[, ! names(de.tmp0) %in% names(de.tmp0)[i], drop = F]#removing the column used to define max
      max1.de <- apply(de.tmp, 1, max)
      tmp.max <- as.numeric(max0.de - max1.de)#defining the distance between the max in column i and the nearest other max removing the column of interest

      specific[[names(de.up)[i]]] <- rownames(de.tmp0)[which(tmp.max>=delta)]
    }
    return(specific)
  }
  #set of cluster specific genes
  specifics <- .clusterGenes(de.selected, delta=delta)
  ###
  de.reorg <- list()
  for(i in 1:length(specifics)){
    de.reorg[[i]] <-  de.selected[which(rownames(de.selected)%in%specifics[[i]]),]
  }
  df <- do.call("rbind", de.reorg)


  write.table(df, paste("onlyUP_",logFC.table, sep=""), sep="\t", col.names = NA)
  ###
  counts <- read.table(paste(counts.table,".",file.type,sep=""), sep=sep, header=T, row.names = 1)
  counts.reorg <- list()
  for(i in 1:length(specifics)){
    counts.reorg[[i]] <- counts[which(rownames(counts)%in%specifics[[i]]),]
  }
  df.counts <- do.call("rbind", counts.reorg)
  write.table(df.counts, paste("onlyUP_",counts.table, sep=""), sep="\t", col.names = NA)
  ###
  specifics.reorg <- list()
  for(i in 1:length(specifics)){
    specifics.reorg[[i]] <-  data.frame(rep(names(specifics)[i], length(specifics[[i]])), specifics[[i]])
  }
  df.specific <- do.call("rbind", specifics.reorg)
  df.specific <- as.data.frame(df.specific)
  names(df.specific) <- c("cluster","geneID")
  write.table(df.specific, "onlyUP_clusters_specific_genes.txt", sep="\t", row.names = F)
  df.specific.symbol <- strsplit(as.character(df.specific$geneID), ":")
  df.specific.symbol <- sapply( df.specific.symbol, function(x)x[2])
  zz <- file("onlyUP_clusters_specific_geneSYMBOLs.txt", "w")
  writeLines(df.specific.symbol, zz)
  close(zz)

    #running time 2
  ptm <- proc.time() - ptm
  dir <- dir(data.folder)
  dir <- dir[grep("run.info",dir)]
  if(length(dir)>0){
    con <- file("run.info", "r")
    tmp.run <- readLines(con)
    close(con)
    tmp.run[length(tmp.run)+1] <- paste("FastQC user run time mins ",ptm[1]/60, sep="")
    tmp.run[length(tmp.run)+1] <- paste("FastQC system run time mins ",ptm[2]/60, sep="")
    tmp.run[length(tmp.run)+1] <- paste("FastQC elapsed run time mins ",ptm[3]/60, sep="")
    writeLines(tmp.run,"run.info")
  }else{
    tmp.run <- NULL
    tmp.run[1] <- paste("FastQC user run time mins ",ptm[1]/60, sep="")
    tmp.run[length(tmp.run)+1] <- paste("FastQC system run time mins ",ptm[2]/60, sep="")
    tmp.run[length(tmp.run)+1] <- paste("FastQC elapsed run time mins ",ptm[3]/60, sep="")

    writeLines(tmp.run,"run.info")
  }

  setwd(home)
}
kendomaniac/rCASC documentation built on July 3, 2024, 6:05 a.m.