R/rt_correction.R

Defines functions best_loess best_poly correct_rt

Documented in correct_rt

#' @title Correct RTs of peaks in peak table using internal standard list
#' @description Correct RTs of peaks in peak table using internal standard list.
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param reference_is_table_name Experiment internal standard table name.
#' @param query_is_table_name Database internal standard table name.
#' @param query_peak_table_name Peak table name. Column 1 is name (peak name), column 2 is mz
#' (mass to charge ratio), column 3 is rt. And remaining columns are intensity of samples.
#' @param method polyline or loess.
#' @param poly A numeric vector.
#' @param degree A numeric vector.
#' @param path Work directory.
#' @return A peak table.
#' @export

# sxtTools::setwd_project()
# setwd("example/rt_correction")
#
# reference_is_table_name = "reference_is_table.csv"
# query_is_table_name = "query_is_table.csv"
# query_peak_table_name = "query_peak_table.csv"
# method = "polyline"
# poly = c(1, 2, 3)
# path = "."
#
# correct_rt(reference_is_table_name = "reference_is_table.csv",
#            query_is_table_name = "query_is_table.csv",
#            query_peak_table_name = "query_peak_table.csv",
#            method = "loess", 
#            path = "example/rt_correction/")

correct_rt = function(reference_is_table_name,
                      query_is_table_name,
                      query_peak_table_name,
                      method = c("loess", "polyline"),
                      poly = c(1, 2, 3),
                      degree = c(1, 2),
                      path = ".") {
  dir.create(path, showWarnings = FALSE)
  method <- match.arg(method)
  
  ###check data
  if (all(dir(path = path) != reference_is_table_name)) {
    stop("No ", reference_is_table_name, " in ", path)
  } else{
    reference_is_table = readTable(file.path(path, reference_is_table_name)) %>%
      as.data.frame()
  }
  
  if (all(dir(path = path) != query_is_table_name)) {
    stop("No ", query_is_table_name, " in ", path)
  } else{
    query_is_table = readTable(file.path(path, query_is_table_name)) %>%
      as.data.frame()
  }
  
  if (all(dir(path = path) != query_peak_table_name)) {
    stop("No ", query_peak_table_name, " in ", path)
  } else{
    query_peak_table = readTable(file.path(path, query_peak_table_name)) %>%
      as.data.frame()
  }
  
  reference_is_table$RT = as.numeric(reference_is_table$RT)
  query_is_table$RT = as.numeric(query_is_table$RT)
  query_peak_table$rt = as.numeric(query_peak_table$rt)
  
  remain.idx <-
    which(!is.na(reference_is_table[, 3]) &
            !is.na(query_is_table[, 3]))
  reference_is_table <-
    reference_is_table[remain.idx, , drop = FALSE]
  query_is_table <-
    query_is_table[remain.idx, , drop = FALSE]
  
  if (nrow(reference_is_table) < 10) {
    poly = c(1, 2)
  }
  
  rt_table <- data.frame(
    reference = reference_is_table[, 3],
    query = query_is_table[, 3],
    stringsAsFactors = FALSE
  )
  
  rt1 <- rt_table$reference
  rt2 <- rt_table$query
  
  if (method == "polyline") {
    poly <- poly[which(poly < nrow(rt_table) - 1)]
    mse <- best_poly(
      x = rt_table$query,
      y = rt_table$reference,
      poly = poly,
      path = path
    )
    idx <- poly[which.min(mse)]
    
    model <- lm(rt1 ~ poly(rt2, idx))
  } else{
    result <- best_loess(
      x = rt_table$query,
      y = rt_table$reference,
      span.begin = 0.5,
      span.end = 1,
      span.step = 0.1,
      degree = degree,
      path = path
    )
    idx <- which.min(result[, 3])
    model <- loess(rt1 ~ rt2,
                   span = result[idx, 2],
                   degree = result[idx, 1])
    
  }
  
  raw_rt <- query_peak_table$rt
  
  predict_rt <- predict(object = model,
                        newdata = data.frame(rt2 = raw_rt))
  predict_rt[raw_rt < min(rt2)] <-
    raw_rt[raw_rt < min(rt2)]
  predict_rt[raw_rt > max(rt2)] <-
    raw_rt[raw_rt > max(rt2)]
  
  temp.data <- data.frame(raw_rt, predict_rt,
                          stringsAsFactors = FALSE)
  
  plot <-
    ggplot2::ggplot(data = temp.data, ggplot2::aes(x = raw_rt, y = predict_rt)) +
    ggplot2::geom_point(color = "black") +
    # ggplot2::geom_line() +
    ggplot2::geom_smooth(method = "loess",
                         color = "black",
                         fill = "gray") +
    ggplot2::geom_abline(intercept = 0,
                         slope = 1,
                         color = "red") +
    ggplot2::theme_bw()
  
  
  ggplot2::ggsave(
    filename = file.path(path, "raw_rt vs predict_rt plot.pdf"),
    plot = plot,
    width = 8,
    height = 6
  )
  
  query_peak_table <- data.frame(query_peak_table,
                                 "rt_correction" = predict_rt,
                                 stringsAsFactors = FALSE) %>%
    dplyr::select(name, mz, rt, rt_correction)
  
  write.csv(
    query_peak_table,
    file = file.path(path, "Feature_table_with_corrected_rt.csv"),
    row.names = FALSE
  )
}


###parameters optimization
best_poly = function(x,
                     y,
                     poly = c(1, 2, 3, 4),
                     path = ".") {
  pre.all <- list()
  for (i in seq_along(poly)) {
    # cat(i, " ")
    ##LOO
    pre <- NULL
    for (j in seq_along(x)) {
      # cat(j, " ")
      y1 <- y[-j]
      x1 <- x[-j]
      model <- lm(y1 ~ poly(x1, poly[i]))
      pre[j] <- predict(object = model,
                        newdata = data.frame(x1 = x[j]))
    }
    pre.all[[i]] <- pre
  }
  
  mse <-
    unlist(lapply(pre.all, function(x) {
      sum((y - x) ^ 2) / length(y)
    }))
  
  # y.lim1 <- 0.8*min(c(y, unlist(lapply(pre.all, min))))
  # y.lim2 <- 1.2*max(c(y, unlist(lapply(pre.all, max))))
  
  temp.data1 <- do.call(c, pre.all)
  temp.data1 <- data.frame(
    query.RT = rep(x, length(pre.all)),
    reference.RT = temp.data1,
    Poly.MSE = as.character(rep(paste(
      poly, round(mse, 3), sep = ": "
    ), each = length(x))),
    stringsAsFactors = FALSE
  )
  
  temp.data2 <- data.frame(
    query.RT = x,
    reference.RT = y,
    stringsAsFactors = FALSE
  )
  
  plot <-
    ggplot2::ggplot(data = temp.data2,
                    ggplot2::aes(x = query.RT, y = reference.RT)) +
    ggplot2::geom_point(color = "black") +
    ggplot2::geom_line() +
    ggplot2::geom_smooth(method = "loess",
                         color = "black",
                         fill = "gray") +
    ggplot2::geom_point(
      data = temp.data1,
      mapping = ggplot2::aes(x = query.RT, y = reference.RT, color = Poly.MSE)
    ) +
    ggplot2::geom_line(
      data = temp.data1,
      mapping = ggplot2::aes(x = query.RT, y = reference.RT, color = Poly.MSE)
    ) +
    ggplot2::geom_smooth(
      data = temp.data1,
      mapping = ggplot2::aes(
        x = query.RT,
        y = reference.RT,
        color = Poly.MSE,
        fill = Poly.MSE
      ),
      method = "loess"
    ) +
    # ggthemes::theme_pander() +
    ggplot2::theme_bw()
  ggplot2::ggsave(
    filename = file.path(path, "MSE.plot.pdf"),
    plot = plot,
    width = 8,
    height = 6
  )
  
  return(mse)
}



best_loess = function(x,
                      y,
                      span.begin = 0.5,
                      span.end = 1,
                      span.step = 0.1,
                      degree = c(1, 2),
                      path = ".") {
  span <- seq(span.begin, span.end, span.step)
  para <- NULL
  for (i in seq_along(degree)) {
    para <- rbind(para, cbind(degree[i], span))
  }
  colnames(para) <- c("degree", "span")
  
  y <- y[order(x)]
  x <- sort(x)
  
  pre.all <- list()
  
  for (i in 1:nrow(para)) {
    temp.degree <- para[i, 1]
    temp.span <- para[i, 2]
    pre <- NULL
    # for(j in 2:(length(x) - 1)) {
    for (j in seq_along(x)) {
      y1 <- y
      x1 <- x
      # y1 <- y[-j]
      # x1 <- x[-j]
      model <-
        loess(y1 ~ x1, span = temp.span, degree = temp.degree)
      pre[j] <- predict(object = model,
                        newdata = data.frame(x1 = x[j]))
    }
    pre.all[[i]] <- pre[!is.na(pre)]
  }
  
  mse <-
    unlist(lapply(pre.all, function(x) {
      sum((y[2:(length(y) - 1)] - x) ^ 2) / length(y)
    }))
  result <- data.frame(para, mse, stringsAsFactors = FALSE)
  
  
  temp.data1 <- do.call(c, pre.all)
  para <- paste(para[, 1], para[, 2], sep = ";")
  
  temp.data1 <- data.frame(
    query.RT = rep(x, length(pre.all)),
    reference.RT = temp.data1,
    Degree.Span.MSE = as.character(rep(paste(
      para, round(mse, 3), sep = ": "
    ), each = length(x))),
    stringsAsFactors = FALSE
  )
  
  
  temp.data2 <- data.frame(
    query.RT = x,
    reference.RT = y,
    stringsAsFactors = FALSE
  )
  
  plot <-
    ggplot2::ggplot(data = temp.data2,
                    ggplot2::aes(x = query.RT, y = reference.RT)) +
    ggplot2::geom_point(color = "black") +
    ggplot2::geom_line() +
    ggplot2::geom_smooth(method = "loess",
                         color = "black",
                         fill = "gray") +
    ggplot2::geom_point(
      data = temp.data1,
      mapping = ggplot2::aes(x = query.RT, y = reference.RT, color = Degree.Span.MSE)
    ) +
    ggplot2::geom_line(
      data = temp.data1,
      mapping = ggplot2::aes(x = query.RT, y = reference.RT, color = Degree.Span.MSE)
    ) +
    ggplot2::geom_smooth(
      data = temp.data1,
      mapping = ggplot2::aes(
        x = query.RT,
        y = reference.RT,
        color = Degree.Span.MSE,
        fill = Degree.Span.MSE
      ),
      method = "loess"
    ) +
    ggplot2::theme_bw()
  
  ggplot2::ggsave(
    filename = file.path(path, "MSE.plot.pdf"),
    plot = plot,
    width = 8,
    height = 6
  )
  return(result)
}
jaspershen/metflow2 documentation built on Aug. 15, 2021, 4:38 p.m.