
Defines functions DataImport

Documented in DataImport

#' Data Import
#'
#' Reads the files and creates a list storing all the information.
#' @param TimeSeriesFile The name of the excel file storing the  growth evolution data. See \sQuote{Details}.
#' @param AnnotationFile The name of a txt/csv file storing  the annotation data. See \sQuote{Details}.
#' @return DataImport returns a list, called ConnectorList, with four elements: (i) Dataset: a data frame with three columns (i.e. ID, data and time values) encoding the growth data for each sample, (ii) LenCurv: the vector reporting the number of observations collected per sample, (iii) LabCurv: the data frame matching the samples with their annotations, (iv) TimeGrid: the vector storing all the sample time points (i.e. time grid). Furthermore, it prints a brief summary of the input data, i.e. the total number of curves (samples), the minimum and the maximum curve length.
#' @details Two files are requested to run the data analysis exploiting the CONNECTOR package:
#' \itemize{
#' \item the excel file, namely \emph{TimeSeriesFile}, reporting the growth evolution data,
#' \item the csv file, namely \emph{AnnotationFile}, containing the annotation information associated with the samples.
#' }
#' Hence, the growth data associated with an experiment must be stored in TimeSeriesFile as a table with two columns for each sample.
#' The first column, labeled \emph{Time}, contains the time points of a sample. The second column, labeled with the sample name, contains the observations collected over time.
#' Instead, the second file (i.e. AnnotationFile) stores the annotation information associated with the samples as a table in csv format, so that the number of rows equals the total number of samples.
#' @author Cordero Francesca, Pernice Simone, Sirovich Roberta
#' @examples
#' TimeSeriesFile <- "data/475dataset.xls"
#' AnnotationFile <- "data/475info.txt"
#' CONNECTORList <- DataImport(TimeSeriesFile,AnnotationFile)
#' @import readxl dplyr
#' @export
DataImport <- function(TimeSeriesFile,AnnotationFile) {
  ###Read Data File
  dataset <- read_excel(TimeSeriesFile,col_names=T)
  if( all(dataset[1,]==rep(0,length(dataset[1,]))) )
    warning("The first line of the TimeSeriesFile is zero.",immediate. = T)
  ### Read Target File
  labcurv  <- read.csv(file=AnnotationFile,header=TRUE)
  colnames(labcurv)= c("IDSample",colnames(labcurv)[-(1)] )
  labcurv$ID = 1:length(labcurv$IDSample)
  labcurveID <-  labcurv$ID
  names(labcurveID) <- labcurv$IDSample
  ###Check the column names
    warning("Number of columns in the excel file is different from the number of samples stored in the target file.")
      warning("SampleNames in the target file do not correspond to the names in the excel file.")
  ### Inizialize :
  ### vector for curves lenghts
  nvar       <- dim(dataset)[2]
  nobs       <- dim(dataset)[1]
  samplesize <- nvar/2
  lencurv    <- numeric(samplesize)
  ### vectors for times, observation and ID curves values
  TimeIndex  <- seq(1,nvar,2)
  ObsIndex   <- seq(2,nvar,2)
  lencurv.time<-sapply(TimeIndex, function(x){length(dataset[!is.na(dataset[,x]),x])})
  lencurv.obs<-sapply(ObsIndex, function(x){length(dataset[!is.na(dataset[,x]),x])})
  if(!all(lencurv.time==lencurv.obs)) # check if times are without observation
    for(i in 1:length(ind.diff) )
      na.obs<-is.na(dataset[,ind.diff[i]*2 ] )
      na.time<-is.na(dataset[,ind.diff[i]*2-1 ] )
      dataset[i.different,ind.diff[i]*2 ]<-NA
      dataset[i.different,ind.diff[i]*2-1 ]<-NA
    warning(paste(length(ind.diff),"time(s) without the corresponding observation (or viceversa) is(are) present.\n These cases will be deleted.\n") )
  lencurv<-sapply(ObsIndex, function(x){length(dataset[!is.na(dataset[,x]),x])})
  TimeValue  <- as.vector(dataset[,TimeIndex])
  TimeValue<- as.double(TimeValue[!is.na(TimeValue)])
  ObsValue  <- as.vector(dataset[,ObsIndex])
  ObsValue<-  as.double(ObsValue[!is.na(ObsValue)] )
  IDsample<- rep(c_names,times=lencurv)
  timegrid <- sort(unique(TimeValue))
  ### ID, observation and time data frame
  dataset <- data.frame(ID=IDsample,Observation=ObsValue,Time=TimeValue)
  dataset$ID = labcurveID[dataset$ID] 
  dataset = dataset %>% 
  check.times<-aggregate(dataset$Time, by = list(dataset$ID),function(x){
    if(length(unique(number.times))>1) return("Multiple Times")
    else return("Perfect")
    warning("Samples with multiple observations at the same time point are present. One observation per time point is required")
  alldata <- list(Dataset=dataset,LenCurv=lencurv,LabCurv=labcurv,TimeGrid=timegrid)
  cat("############################### \n######## Summary ##############\n")
  cat("\n Number of curves:",samplesize,";\n Min curve length: ",min(lencurv),"; Max curve length: ",max(lencurv),".\n")
  cat("############################### \n")
sysbioTurin/connector documentation built on April 9, 2024, 12:10 p.m.