R/align.R

Defines functions comp_peaks align

Documented in align comp_peaks

#' @title   Reference Batch Align
#'
#' @description  `align` aligns peaks from samples to a reference sample's
#' peaks.
#'
#' @details This function aligns the peaks from any number of samples. Peaks are
#' aligned to the retention times of the first peak. If aligning to a reference
#' or standard sample, this should be the first in the lists for data frames and
#' for the mass data. The function comp_peaks() is used to find the
#' corresponding peaks. This function  will return a new list of TIC data frames
#' and a list of mass data. The first sample's data is unchanged, used as the
#' reference. Then a TIC data frame and mass data for each of the given samples
#' containing the peaks and time coordinates of the aligned peaks. The time
#' coordinates are aligned to the first sample's peaks, the peak height and MS
#' is unchanged.
#'
#' @param data_list a \emph{list} object. Data extracted from each cdf file,
#' ideally the output from extract_data().
#' @param THR a \emph{float} object. Threshold for peak intensity. Should be a
#' number between the baseline value and the highest peak intensity. Default
#' is THR = 100000.
#'
#' @return A \emph{list} object. List of aligned data from each cdf file and a
#' list of peaks that were aligned for each file.
#'
#' @examples
#' file1 <- system.file("extdata","sample1.cdf",package="gcxgclab")
#' file2 <- system.file("extdata","sample2.cdf",package="gcxgclab")
#' file3 <- system.file("extdata","sample3.cdf",package="gcxgclab")
#' frame1 <- extract_data(file1,mod_t=.5)
#' frame2 <- extract_data(file2,mod_t=.5)
#' frame3 <- extract_data(file3,mod_t=.5)
#' aligned <- align(list(frame1,frame2,frame3))
#' plot_peak(aligned$Peaks$S1,aligned$S1,title="Reference Sample 1")
#' plot_peak(aligned$Peaks$S2,aligned$S2,title="Aligned Sample 2")
#' plot_peak(aligned$Peaks$S3,aligned$S3,title="Aligned Sample 3")
#'
#' @export
align <- function(data_list, THR=100000){
  # separating reference data to data to be aligned
  ref_df <- data_list[[1]]$TIC_df
  ref_df$'Overall Time Index' <- ref_df$'Overall Time Index'/60
  data_list[[1]]$TIC_df$`Overall Time Index` <- data_list[[1]]$TIC_df$`Overall Time Index`/60
  ref_ms <- data_list[[1]]$MS_df
  df_list <- list()
  ms_list <- list()
  for (i in 2:length(data_list)){
    df_list <- append(df_list,list(data_list[[i]]$TIC_df))
    df_list[[i-1]]$`Overall Time Index` <- df_list[[i-1]]$`Overall Time Index`/60
    data_list[[i]]$TIC_df$`Overall Time Index` <- data_list[[i]]$TIC_df$`Overall Time Index`/60
    ms_list <- append(ms_list,list(data_list[[i]]$MS_df))
  }
  # finding peaks to align to
  ref_peaks <- thr_peaks(ref_df,THR)
  ref_y <- length(which(ref_df$'RT1'==ref_df$'RT1'[1]))
  ref_x <- length(ref_df$'RT1')/ref_y
  if (length(df_list)==0 | length(ms_list)==0){
    stop('Must provide at least one sample to be aligned to the reference.')
  }
  aligned <- vector("list", (length(df_list)+2))
  naming <- c('S1')
  for (i in 1:length(df_list)){
    naming <- append(naming, paste0('S',i+1))
  }
  aligned[[1]] <- list(ref_df,ref_ms)
  names(aligned[[1]])<- c('TIC_df','MS_df')
  names(aligned) <- c(naming, 'Peaks')
  aligned$Peaks <- list(ref_peaks)

  # aligning the toluene peak first
  tol1 <- find_eic(data_list[[i]],92.1397,tolerance=10^(-3))
  lt1 <- which(tol1$intensity_values==max(tol1$intensity_values))
  ltt1 <- tol1$time_array[lt1]

  for (i in 1:length(df_list)){
    RT1 <- df_list[[i]]$RT1
    RT2 <- df_list[[i]]$RT2
    new_ms <- ms_list[[i]]
    message(paste("Aligning peaks of Sample",(i+1),"to the reference, Sample 1."))
    pb <- utils::txtProgressBar(min = 0, max = 100, style = 3, char = "=")

    df_y <- length(which(df_list[[i]]$'RT1'==df_list[[i]]$'RT1'[1]))
    df_x <- length(df_list[[i]]$'RT1')/df_y
    if (abs(ref_x-df_x)>1 | abs(ref_y-df_y)>1){
      stop(paste('Peak',i, 'cannot be aligned to the reference.
                           Dimensions do not match.'))
    }
    # aligning toluene peak first, if peak is present
    tol2 <- find_eic(data_list[[i+1]],92.1397,tolerance=10^(-3))
    lt2 <- which(tol2$intensity_values==max(tol2$intensity_values))
    ltt2 <- tol2$time_array[lt2]
    if (ltt1>8.8 & ltt2>8.8 & ltt1<9.4 & ltt2<9.4){
      df_list[[i]]<- phase_shift(data_list[[i+1]],(ltt1-ltt2))[[1]]
    }

    # peaks to align to ref
    peaks <- thr_peaks(df_list[[i]],THR)
    # finds closest peaks
    cp <- comp_peaks(ref_peaks,peaks)
    todel <- c()
    # checking that peaks are the same compound, compare MS
    for (j in 1:length(cp$Tref)){
      utils::setTxtProgressBar(pb, round(10/length(cp$Tref)*j))
      ms_r <- find_ms(data_list[[1]],(cp$Tref[j]*60),tolerance=10^-3)
      ms_a <- find_ms(data_list[[i+1]],(cp$Tal[j]*60),tolerance =10^-3)
      ms_r2 <- ms_r
      ms_a2 <- ms_a
      ms_r2$MZ <- round(ms_r$MZ)
      ms_a2$MZ <- round(ms_a$MZ)
      ms_r2 <- dplyr::arrange(ms_r2, ms_r2$MZ)
      ms_a2 <- dplyr::arrange(ms_a2, ms_a2$MZ)
      k <- 2
      while (k <= length(ms_r2$MZ)){
        if (ms_r2$MZ[k]==ms_r2$MZ[k-1]){
          ms_r2$Int[k-1]<- ms_r2$Int[k-1]+ms_r2$Int[k]
          ms_r2<-ms_r2[-k,]
        }
        else {
          k <- k+1
        }
      }
      k <- 2
      while (k <= length(ms_a2$MZ)){
        if (ms_a2$MZ[k]==ms_a2$MZ[k-1]){
          ms_a2$Int[k-1]<- ms_a2$Int[k-1]+ms_a2$Int[k]
          ms_a2<-ms_a2[-k,]
        }
        else {
          k <- k+1
        }
      }
      ms_r2$Int <- ms_r2$Int/max(ms_r2$Int)
      ms_a2$Int <- ms_a2$Int/max(ms_a2$Int)
      ms_r2 <- ms_r2[ms_r2$Int>.1,]
      ms_a2 <- ms_a2[ms_a2$Int>.1,]

      sum <- 0
      for (x in 1:length(ms_a2$MZ)){
        loc <- which(ms_r2$MZ==ms_a2$MZ[x])
        if (length(loc)>0){
          sum <- sum+ abs(ms_r2$Int[loc]-ms_a2$Int[x])
        }
        else {
          sum <- sum+ms_a2$Int[x]
        }
      }
      for (x in 1:length(ms_r2$MZ)){
        loc <- which(ms_a2$MZ==ms_r2$MZ[x])
        if (length(loc)>0){
          sum <- sum+ abs(ms_a2$Int[loc]-ms_r2$Int[x])
        }
        else {
          sum <- sum+ms_r2$Int[x]
        }
      }
      match_per <- 1-sum/(sum(ms_r2$Int)+sum(ms_a2$Int))
      if (match_per<.7){
        todel <- append(todel,j)
      }
    } #end loop over j

    # remove peaks which do not have matching MS
    if (length(todel)>0){
      cp2 <- cp[-todel,]
    }
    else{
      cp2 <- cp
    }
    frame <- df_list[[i]]
    # performing the alignment
    for (x in 1:length(cp2[,1])){
      utils::setTxtProgressBar(pb,round(10+89/length(cp2[,1])*x))
      j <- length(cp2[,1])-x+1
      if (abs(cp2$Tref[j]-cp2$Tal[j])>0.0001){
        # Shift TIC
        suppressWarnings(
        g <- gauss_fit(df_list[[i]], c(cp2$Xal[j],cp2$Yal[j]))
        )
        cat <- abs(g[[2]][3])
        # calculating how much data to shift and the distance
        l1 <- which.min(abs(cp2$Tref[j]-4*cat-df_list[[i]]$`Overall Time Index`))
        l2 <- which.min(abs(cp2$Tref[j]+4*cat-df_list[[i]]$`Overall Time Index`))
        if (is.na(l2)){
          l2<- length(df_list[[i]]$TIC)
        }
        cut <- df_list[[i]]$TIC[c(l1:l2)]
        ldiff1 <- which.min(abs(cp2$Tref[j]-df_list[[i]]$`Overall Time Index`))
        ldiff2 <- which.min(abs(cp2$Tal[j]-df_list[[i]]$`Overall Time Index`))
        ldiff <- ldiff1-ldiff2

        xdim <- length(unique(frame$RT1))
        ydim <- length(frame$RT2)/xdim
        sgn <- ldiff/abs(ldiff)*abs(round(ldiff/ydim))
        # shifting columns if needed
        if (abs(ldiff)>(ydim-10)){
          if ((l2+sgn*ydim)>length(frame$TIC)){
            len1 <- length(c(l1+sgn*ydim):length(frame$TIC))
            df_list[[i]]$TIC[c((l1+sgn*ydim):length(frame$TIC))]<- cut[1:len1]
            ldiff <- ldiff-sgn*ydim
          }
          else{
            df_list[[i]]$TIC[c((l1+sgn*ydim):(l2+sgn*ydim))]<- cut
            ldiff <- ldiff-sgn*ydim
          }
        }
        # shifting up and down in column
        if (ldiff>0){
          frame$TIC[c(l1:(l1+ldiff-1))]<-df_list[[i]]$TIC[c((l2+1):(l2+ldiff))]
          if ((l2+ldiff)>length(frame$TIC)){
            len1 <- length(c(l1+ldiff):length(frame$TIC))
            frame$TIC[c((l1+ldiff):length(frame$TIC))]<-cut[1:len1]
          }
          else{
            frame$TIC[c((l1+ldiff):(l2+ldiff))]<-cut
          }
        }
        else if (ldiff<0){
          frame$TIC[c((l2+ldiff+1):l2)]<-df_list[[i]]$TIC[c((l1+ldiff):(l1-1))]
          frame$TIC[c((l1+ldiff):(l2+ldiff))]<-cut
        }

        # Shift MS
        # calculating distance
        t1 <- df_list[[i]]$`Overall Time Index`[l1]
        t2 <- df_list[[i]]$`Overall Time Index`[l2]
        loc1 <- which.min(abs(ms_list[[i]]$time_array-t1))
        loc2 <- which.min(abs(ms_list[[i]]$time_array-t2))
        L1 <- which(ms_list[[i]]$time_array==ms_list[[i]]$time_array[loc1])[1]
        L2 <- which(ms_list[[i]]$time_array==ms_list[[i]]$time_array[loc2])
        L3 <- L2[length(L2)]

        tdiff <- cp2$Tref[j]-cp2$Tal[j]
        times <- unique(ms_list[[i]][L1:L3,c(1,2,3)])
        new_t <- c()
        ldiff <- ldiff1-ldiff2
        for (xx in 1:length(times$time_array)){
          tloc <- which.min(abs(ms_list[[i]]$time_array-times$time_array[xx]-tdiff))
          new_t <- rbind(new_t,ms_list[[i]][tloc,c(1,2,3)])
        }
        # shifting if column shift needed
        if (abs(ldiff)>(ydim-10)){
          new_t2 <- c()
          rt1cut <- ms_list[[i]]$RT1[L1]
          rt1s <- unique(ms_list[[i]]$RT1[sgn*ms_list[[i]]$RT1>sgn*rt1cut])
          if (sgn>0){
            rt1s <- rt1s[1]
          }
          else if (sgn<0){
            rt1s <- rt1s[length(rt1s)]
          }
          LL1 <- which(ms_list[[i]]$RT1==rt1s & ms_list[[i]]$RT2==ms_list[[i]]$RT2[L1])[1]
          LL2 <- which(ms_list[[i]]$RT1==rt1s & ms_list[[i]]$RT2==ms_list[[i]]$RT2[L3])
          LL3 <- LL2[length(LL2)]
          new_t2 <- ms_list[[i]][LL1:LL3,c(1,2,3)]
          new_t2 <- unique(new_t2)
          for (z in 1:length(times$time_array)){
            w1 <- which(times$time_array[z]==ms_list[[i]]$time_array)
            w2 <- which(new_t2$time_array[z]==ms_list[[i]]$time_array)
            ms_list[[i]][w1,c(1,2,3)] <- new_t2[z,]
            ms_list[[i]][w2,c(1,2,3)] <- times[z,]
          }
          ldiff <- ldiff-sgn*ydim
        }
        # shifting up and down in column
        if (ldiff>0){
          # shift high times down
          for (z in c((length(times$time_array)+1-ldiff):length(times$time_array))){
            lloc <- which(new_t$time_array[z]==ms_list[[i]]$time_array)
            zz <- z-length(times$time_array)+ldiff
            new_ms[lloc,c(1,2,3)] <- times[zz,]
          }
        }
        else if (ldiff<0){
          # shift low times up
          for (z in c(1:(-ldiff))){
            lloc <- which(new_t$time_array[z]==ms_list[[i]]$time_array)
            zz <- z+length(times$time_array)+ldiff
            new_ms[lloc,c(1,2,3)] <- times[zz,]
          }
        }
        for (y in c(1:length(times$time_array))){
          # shift peak times
          loca <- which(ms_list[[i]]$time_array==times$time_array[y])
          new_ms[loca,c(1,2,3)] <- new_t[y,]
        }

      }
    }

    utils::setTxtProgressBar(pb,100)

    # finalizing the list of data frames returned of the aligned data
    cp3 <- cp2[,c(1,2,3,8)]
    colnames(cp3) <- c('T','X','Y','Peak')
    frame$`Overall Time Index` <- frame$`Overall Time Index`*60
    frame <- dplyr::arrange(frame, frame$'Overall Time Index')
    frame$RT1 <- RT1
    frame$RT2 <- RT2
    aligned[[i+1]] <- list(frame,new_ms)
    names(aligned[[i+1]]) <- c('TIC_df','MS_df')
    cp3$T <- cp3$T*60
    aligned$Peaks <- append(aligned$Peaks,list(cp3))
    close(pb)
  } # end loop over i, samples being aligned

  names(aligned$Peaks) <- naming

  return(aligned)
}


#' @title   Compare Peaks
#'
#' @description  `comp_peaks` compares peaks of two samples.
#'
#' @details This function find compares the peaks from two samples and
#' correlates the peaks by determining the peaks closest to each other in the
#' two samples, within a certain reasonable distance. Then returns a data frame
#' with a list of the correlated peaks including each of their time coordinates.
#'
#' @param ref_peaks a \emph{data.frame} object. A data frame with 4 columns
#' (Time, X, Y, Peak), ideally the output from either top_peaks() or
#' thr_peaks().
#' @param al_peaks a \emph{data.frame} object. A data frame with 4 columns
#' (Time, X, Y, Peak), ideally the output from either top_peaks() or
#' thr_peaks().
#'
#' @return A \emph{data.frame} object. A data frame with 8 columns containing
#' the matched peaks from the two samples, with the time, x, y, and peak values
#' for each.
#'
#' @export
comp_peaks <- function(ref_peaks, al_peaks){
  N <- length(ref_peaks$'T')
  M <- length(al_peaks$'T')
  # Finding peak in second list with min dist to first list peak
  keep_2 <- c()
  keep_1 <- c()
  for (i in 1:N){
    min_d <- 1
    coord <- 0
    for (j in 1:M){
      distance <- 0.01*(ref_peaks$'X'[i]-al_peaks$'X'[j])^2+
        3*(ref_peaks$'Y'[i] -al_peaks$'Y'[j])^2
      min_d <-  min(min_d,distance)
      if (min_d == distance){coord <- j}
    }
    if (coord != 0){keep_2 <- append(keep_2, coord)
    keep_1 <- append(keep_1, i)
    }
  }

  # Finding peak in first list with min dist to second list peak
  keep_1b <- c()
  keep_2b <- c()
  for (i in 1:M){
    min_d <- 1
    coord <- 0
    for (j in 1:N){
      distance <- 0.01*(ref_peaks$'X'[j]-al_peaks$'X'[i])^2+3*(ref_peaks$'Y'[j]
                                                               -al_peaks$'Y'[i])^2
      min_d <-  min(min_d,distance)
      if (min_d == distance){coord <- j}
    }
    if (coord != 0){
      keep_1b <- append(keep_1b, coord)
      keep_2b <- append(keep_2b, i)
    }
  }

  # Keeping only matched peaks
  ordered1b <- ref_peaks[keep_1b, ]
  ordered2b <- al_peaks[keep_2b, ]

  all_peaks_data <- as.data.frame(cbind(ordered1b$'T', ordered1b$'X',
                                        ordered1b$'Y', ordered1b$'Peak',
                                        ordered2b$'T', ordered2b$'X',
                                        ordered2b$'Y', ordered2b$'Peak'))
  colnames(all_peaks_data) <- c('Tref', 'Xref', 'Yref', 'Peakref', 'Tal', 'Xal',
                                'Yal', 'Peakal')

  # Removing double(+) matched peaks, keeping highest matched peak
  all_peaks_data <- dplyr::arrange(all_peaks_data, 'Tal', dplyr::desc("Peakref"))
  Num <- length(all_peaks_data$'Tal')
  if (Num>2){
    for (j in 2:Num){
      i <- Num-j+2
      if (all_peaks_data$'Tal'[i-1]==all_peaks_data$'Tal'[i]){
        all_peaks_data <- all_peaks_data[-i, ]
      }
    }
    all_peaks_data <- dplyr::arrange(all_peaks_data, 'Tref', dplyr::desc("Peakal"))
    Num <- length(all_peaks_data$'Tref')
    for (j in 2:Num){
      i <- Num -j+2
      if (all_peaks_data$'Tref'[i-1]==all_peaks_data$'Tref'[i]){
        all_peaks_data <- all_peaks_data[-i, ]
      }
    }
  }
  return(all_peaks_data)
}

# ©2022 Battelle Savannah River Alliance, LLC
# Notice: These data were produced by Battelle Savannah River Alliance, LLC
# under Contract No 89303321CEM000080 with the Department of Energy. During the
# period of commercialization or such other time period specified by DOE, the
# Government is granted for itself and others acting on its behalf a
# nonexclusive, paid-up, irrevocable worldwide license in this data to
# reproduce, prepare derivative works, and perform publicly and display
# publicly, by or on behalf of the Government. Subsequent to that period, the
# Government is granted for itself and others acting on its behalf a
# nonexclusive, paid-up, irrevocable worldwide license in this data to
# reproduce, prepare derivative works, distribute copies to the public,
# perform publicly and display publicly, and to permit others to do so. The
# specific term of the license can be identified by inquiry made to Contract or
# DOE. NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR
# ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY
# LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
# USEFULNESS OF ANY DATA, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR
# REPRESENTS THAT ITS USE WORLD NOT INFRINGE PRIVATELY OWNED RIGHTS.

Try the gcxgclab package in your browser

Any scripts or data that you put into this service are public.

gcxgclab documentation built on May 29, 2024, 5:33 a.m.