GcxGctools: Tools for GCxGc data

Documented in precook_measures

#' Preprocess of raw data

#' The aim of precook_measures() is to avoid preprocessing raw data each time we have to operate with it. So we "precook" the raw data and save the result on a file we can read each time we need, avoiding long calculus each time.
#'
#' The function precook_measures() is used to read GcxGC raw data (all ncdf4 files of the directory), complete the measures (if any is bypassed) and obtain and altered TIC (without some chosen masses) of a particular second column time (t2) interval, The result contains also the non altered TIC (all the masses). The result is saved as the file preckd_measures.rda with the parameters used on the process saved in the file preckd_variables.rda
#' @export
#' @param cdf_folder directory with the ncdf4 files. The resulting files will be saved also there
#' @param massestoavoid vector with the names of the masses we don't want on the altered TIC
#' @param tmod modulation time
#' @param t2in Initial time of the second cromatographic column to start obtaining measures. By default is 0
#' @param t2out Final time of the second cromatographic column to end obtaining measures. By default is the modulation time
#' @param normalizationfactor vector with multipliers factors assigned to each sample so that all samples can be compared with each other. Numbers of factors related to all samples or files (in alphabetical order) are needed. e.g. normalizationfactor=c(0.043, 1.725,0.371,0.055,0.034,0.062)
#' @return There is no return. The results are saved on files
#'     preckd_measures.rda:
#'     preckd_variables.rda: Variables used during the precook (raw data filenames, massestoavoid, t2in, t2out, tmod)
#' @examples
#' precook_measures(cdf_folder="C:/esborrar_mesuresGcxGc/mesures_Berkely_Noelia", massestoavoid=c(43,61,70,73,88,207,208,209,210,281), normalizationfactor=c(0.043, 1.725,0.371,0.055,0.034,0.062), tmod=5, t2in=1, t2out=5)


precook_measures <- function(cdf_folder, massestoavoid=NULL,
                             normalizationfactor=NULL,tmod, t2in=0, t2out=NULL){

      if(!require(ncdf4)){install.packages("ncdf4"); require(ncdf4)}

      #Arxiu on desarem les mesures omplertes i les seves variables
      mesuresprecuinades<-"preckd_measures.rda"
      variablesprecuina<-"preckd_variables.rda"

      if(is.null(t2out)) t2out<-tmod

      print("Precuinant mesures...Obtenim els TICS de tots els arxius cdf")
      #nom?s cal carregar els noms de TOTS els arxius *.RDATA del directori
      filenames<-list.files(cdf_folder, pattern = "\\.cdf$")

      #inicialitzem la llista on desarem els resultats. Cada posicio te les mesures d'una mostra (i<-1)
      list_TICSdeSamples <- vector(mode = "list", length = length(filenames))
      variablessamples<-matrix(nrow=1,ncol=length(filenames),
                               dimnames = list(c("min_scanduration")))

      for(filenumber in 1:length(filenames)){
          print(paste("Omplim forats del fitxer",filenames[filenumber]))
          ncin <- nc_open(paste(cdf_folder,filenames[filenumber],sep="/"))


          # CHECKING TIME UNITS ----------------------------------------------------
          timeunit<-ncatt_get(ncin,varid=0,attname="units")$value
          if(tolower(timeunit)=="seconds"){cteconversion<-1
          } else if (tolower(timeunit)=="milliseconds"){ cteconversion<-1/1000
          } else if (tolower(timeunit)=="minutes"){cteconversion<-60
          } else if (tolower(timeunit)=="hours"){cteconversion<-3600
          } else {
              question1 <- readline(
                  paste("Raw data has the time unit \'", timeunit,"\'. The system is designed to work with the unit \'seconds\'. Would you like to continue and assume both units are equivalents?  (Y/N)",sep=""))

              if(!tolower(question1) %in% c("y","yes")) stop(paste("ERROR: Raw data has the time unit \'", timeunit, "\', not recognizable by the import function",sep=""))
          }

        # CHECKING OTHER UNITS ----------------------------------------------------
          variablestoread<-c("mass_values","intensity_values","total_intensity")
          expectedunits<-c("m/z","arbitrary intensity units","arbitrary intensity units")
          for(i in 1:length(variablestoread)){
              #read original unit of variable i
              unit<-ncatt_get(ncin,varid=variablestoread[i])$units
              #print(unit)
              #check if read unit fits the unit we expected
              if(tolower(unit)!=expectedunits[i]){
                  question1 <- readline(
                      paste("Raw data has the unit \'", unit, "\' for the variable \'",variablestoread[i],"\'. The system is designed to work with the unit \'",expectedunits[i],"\'. Would you like to continue and assume both units are equivalents?  (Y/N)",sep=""))

                  if(!tolower(question1) %in% c("y","yes")) stop(paste("Raw data has the nonrecognizable unit \'", unit, "\' for the variable \'", variablestoread[i],"\'. The system expected the unit \'",expectedunits[i],"\'.",sep=""))
              }

          }
            intensitatstotals <- ncvar_get(ncin,"total_intensity")
            #importing time in seconds
            temps <- cteconversion*ncvar_get(ncin,"scan_acquisition_time")
            masses <- ncvar_get(ncin,"mass_values")
            intensitats <- ncvar_get(ncin,"intensity_values")
            variablessamples[1,filenumber]<-min(ncvar_get(ncin,"scan_duration"))

            #MILLORAR. CALDRIA MIRAR la integritat de la rawdata de cada cdf
            #check_integritatmesures(temps,length(intensitats),masses)

            #oMPLIM FORATS I CREEM ESTRUCTURA INTENSITATS 2D
            matriuintensitatsplena<-complete_measures(temps,masses,intensitats,intensitatstotals)
            #Si hem de limitar-nos a les mesures que tinguin un t2 determinat
            if(!(t2in==0 && t2out==tmod)){
                  resoluciotemps<-min(diff(temps))
                  unitatsmod<-tmod/resoluciotemps

                  #passem de 2D a 3D (masses x t1 x t2) per poder treure els t2 que no volem
                  #abans desem els rownames
                  vectminim_masses<-rownames(matriuintensitatsplena)
                  matriuintensitatsplena<-aperm(array(matriuintensitatsplena,dim = c(nrow(matriuintensitatsplena),unitatsmod,ceiling(ncol(matriuintensitatsplena)/unitatsmod))),c(1,3,2))
                  #Al TIC inalterat tamb? li treiem els temps t2 que no volem
                  intensitatstotals<-matrix(intensitatstotals,nrow=unitatsmod)
                  #amb el temps emhem de fer el mateix
                  tempsescapcat<-matrix(temps,nrow=unitatsmod)
                  #Ens quedem amb els intervals de temps que ens interessen [t2in,t2out]
                  posiciot2in<-t2in/resoluciotemps;posiciot2out<-t2out/resoluciotemps
                  matriuintensitatsplena<-matriuintensitatsplena[,,posiciot2in:posiciot2out]

                  #tornem a posar la matriu en 2d (masses x temps)
                  matriuintensitatsplena<-matrix(aperm(matriuintensitatsplena,c(1,3,2)),nrow=nrow(matriuintensitatsplena))
                  #Al TIC inalterat tamb? li treiem els temps t2 que no volem
                  intensitatstotals<-as.vector(intensitatstotals[posiciot2in:posiciot2out,])
                  #i al temps tambe
                  tempsescapcat<-as.vector(tempsescapcat[posiciot2in:posiciot2out,])

                  #tornema posar el onm de les rows i les cols pq amb l'aper s'han perdut
                  names(intensitatstotals)<-tempsescapcat
                  colnames(matriuintensitatsplena)<-tempsescapcat
                  rownames(matriuintensitatsplena)<-vectminim_masses
            }

            #Eliminem les masses que no volem al TIC
            if(is.null(massestoavoid)){

                  #Per cada sample, desem el vector de TICS alterats i normals
                  list_TICSdeSamples[[filenumber]]<-rbind(colSums(matriuintensitatsplena),intensitatstotals)

            }else{
                  #1er mirem quina posicio cupen les masses que volem eliminar
                  rows_a_eliminar<-which(row.names(matriuintensitatsplena) %in% massestoavoid)

                  #Per cada sample, desem el vector de TICS alterats i normals
                  list_TICSdeSamples[[filenumber]]<-rbind(colSums(matriuintensitatsplena[-rows_a_eliminar,]),intensitatstotals)
            }
            filenamenoextension<-tools::file_path_sans_ext(filenames[filenumber])
            row.names(list_TICSdeSamples[[filenumber]])<-c(paste("altTIC",filenamenoextension,sep="#"),paste("TIC",filenamenoextension,sep="#"))
      }
      nc_close(ncin)

      rm(ncin, temps, masses, intensitats, intensitatstotals)


# Apply normalization factor --------------------------------------------------------
      if(!is.null(normalizationfactor) && length(normalizationfactor)!=length(filenames)) stop(paste("ERROR: El n?mero de factors de normalitzaci? NO coincideix amb el n?mero de mesures que consten com a precuinades a l'arxiu", variablesprecuina_withd))

      #si hi han factors de normalitzacio els fem servir
      if(!is.null(normalizationfactor)){
          #Multiplquem els TICS pels factors de normalitzacio
          for(i in 1:length(list_TICSdeSamples)){
              list_TICSdeSamples[[i]]<-round(list_TICSdeSamples[[i]]*normalizationfactor[i])
          }
      }

      print("gravant l'arxiu  de mesures precuinades....")
      #gravem les mesures precuinades
      save(list_TICSdeSamples, file=paste(cdf_folder,mesuresprecuinades,sep="/"))
      #gravem sota quines condicions es van precuinar les mesures
      save(filenames,variablessamples,normalizationfactor, massestoavoid,t2in, t2out, tmod, file=paste(cdf_folder,variablesprecuina,sep="/"))
}