R/loadCountData.R

Defines functions loadCountData

Documented in loadCountData

#' Load count files
#'
#' Load count files
#'
#' @param target target \code{data.frame} of the project
#' @param rawDir path to the directory containing the count files
#' @param skip number of lines of the data file to skip before beginning to read data
#' @param sep column separator when reading the count files (tabulation by default)
#' @param versionName versionName of the project
#' @param featuresToRemove vector of features Id to remove from the counts
#' @return The \code{matrix} of raw counts
#' @author Marie-Agnes Dillies and Hugo Varet

# created Feb 6th, 2012
# modified May 2nd, 2012 (colnames -> target$label)
# modified Sept 19th, 2013 (DESeq not required as using DESeq2)
# modified Sept 20th, 2013 (keep only the 2 first columns)
# modified Oct 8th, 2013 (keep NAs in the data)
# modified Feb 14th, 2014 (use the first column for the labels & the second column for the files)
# modified Mar 05th, 2014 (replace NAs by 0, cancel the modification of Oct 8th, 2013)
# modified Mar 05th, 2014 (added info argument to add features with only null counts to the data)
# modified May 13th, 2014 (added skip argument)
# modified July 30th, 2014 (combined loadCountData, HTSeqClean, removeRRNA and raw2counts)
# modified Oct 27th, 2014 (removed merge(info,counts))
# modified Feb 9th, 2015 (order counts according to row names)
# modified Feb 18th, 2015 (remove HTSeq-count lines with featuresToRemove)
# modified March 17th, 2015 (print features removed and top/bottom of the matrix)
# modified March 23rd, 2015 (fixed a bug when printing features removed)
# modified April 10th, 2015 (now exports the names of the features removed from the data)
# modified June 15th, 2015 (check if there are any duplicated features)
# modified Sept 3rd, 2015 (removed info argument)
# modified April 11th, 2016 (added sep parameter)
# modified February 21st, 2017 (detect featureCounts or HTSeq-count input, remove header argument)
# modified February 23rd, 2017 (fixed a bug when determining if counts come from featureCounts or HTSeq-count)
# modified September 5th, 2018 (fixed a bug when determining if counts come from featureCounts or HTSeq-count)
# modified October 4th, 2018 (fixed a bug when determining if counts come from featureCounts or HTSeq-count)
# modified June 1th, 2019 (print duplicated ids if any)

loadCountData <- function(target, rawDir="raw", skip=0, sep="\t", versionName="",
                          featuresToRemove=c("alignment_not_unique", "ambiguous", "no_feature", "not_aligned", "too_low_aQual")){

  labels <- as.character(target[,1])
  files <- as.character(target[,2])
  
  # detect if input count files are from featureCounts or HTSeq-count
  f1 <- read.table(paste(rawDir,files[1],sep="/"), sep=sep, quote="\"", header=FALSE, skip=5, nrows=5, stringsAsFactors=FALSE)
  if (ncol(f1) >= 7 && is.numeric(f1[,7])){
    # counter featurecounts
    idCol <- 1
    countsCol <- 7
    header <- TRUE
  } else{
    if (ncol(f1) >= 2 && is.numeric(f1[,2])){
      # counter htseq-count
      idCol <- 1
      countsCol <- 2
      header <- FALSE
    } else{
      stop("Can't determine if count files come from HTSeq-count or featureCounts")
    }
  }

  rawCounts <- read.table(paste(rawDir,files[1],sep="/"), sep=sep, quote="\"", header=header, skip=skip, stringsAsFactors=FALSE)
  rawCounts <- rawCounts[,c(idCol, countsCol)]
  colnames(rawCounts) <- c("Id", labels[1])
  if (any(duplicated(rawCounts$Id))){
    stop("Duplicated feature names in ", files[1], ": ", 
         paste(unique(rawCounts$Id[duplicated(rawCounts$Id)]), collapse=", "))
  }
  cat(files[1],": ",length(rawCounts[,labels[1]])," rows and ",sum(rawCounts[,labels[1]]==0)," null count(s)\n",sep="")

  for (i in 2:length(files)){
    tmp <- read.table(paste(rawDir,files[i],sep="/"), sep=sep, quote="\"", header=header, skip=skip, stringsAsFactors=FALSE)
    tmp <- tmp[,c(idCol, countsCol)]
    colnames(tmp) <- c("Id", labels[i])
    if (any(duplicated(tmp$Id))){
      stop("Duplicated feature names in ", files[i], ": ", 
           paste(unique(tmp$Id[duplicated(tmp$Id)]), collapse=", "))
    }
    rawCounts <- merge(rawCounts, tmp, by="Id", all=TRUE)
    cat(files[i],": ",length(tmp[,labels[i]])," rows and ",sum(tmp[,labels[i]]==0)," null count(s)\n",sep="")
  }

  rawCounts[is.na(rawCounts)] <- 0

  # transformation de la data.frame en matrice
  counts <- as.matrix(rawCounts[,-1])
  rownames(counts) <- rawCounts[,1]
  
  # ordering according to row names
  counts <- counts[order(rownames(counts)),]
  
  # remove some features
  cat("\nFeatures removed:\n")
  featuresRemoved <- NULL
  for (f in featuresToRemove){
    match <- grep(f, rownames(counts))
    if (length(match)>0){
      cat(rownames(counts)[match], sep = "\n")
      featuresRemoved <- unique(c(featuresRemoved,rownames(counts)[match]))
      counts <- counts[-match,]
    }
  }
  if (is.null(featuresRemoved)) featuresRemoved <- "No feature has been removed"
  write.table(data.frame(featuresRemoved), file=paste0("tables/",versionName,".featuresRemoved.xls"), 
              sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
  
  cat("\nTop of the counts matrix:\n")
  print(head(counts))
  cat("\nBottom of the counts matrix:\n")
  print(tail(counts))
  
  return(counts)
}
biomics-pasteur-fr/RNADiff documentation built on Aug. 27, 2020, 12:44 a.m.