R/preprocess.R

Defines functions pasre.categorical.data preprocess

Documented in pasre.categorical.data preprocess

#' (Internal function) Perform the pre-processing step of IPCAPS2
#'
#' @param files IPCAPS2 supports SNPs encoded as 0, 1 and 2 (dosage encoding).
#' Rows represent SNPs and columns represent subjects. Each column needs to be
#' separated by a space or a tab. A big text file should be divided into smaller
#' files to load faster. For instance, to input 3 files, use as: files=c(
#' 'input1.txt', 'input2.txt', 'input3.txt').
#' @param label.file An additional useful information (called "labels" in
#' IPCAPS2) related subject, for example, geographic location or disease
#' phenotype. These labels (one at a time) are used in displaying the clustering
#' outcome of IPCAPS2. A label file must contain at least one column. However,
#' it may contain more than one column in which case each column need to be
#' separated by a space or a tab.
#' @param lab.col The label in the label file to be used in the tree-like
#' display of IPCAPS2 clustering results.
#' @param rdata.infile In case of re-analysis, it is convenient to run IPCAPS2
#' using the file rawdata.RData generated by IPCAPS2. This file contains a matrix
#' of SNPs
#' (raw.data) and a vector of labels (label).
#' @param bed.infile A PLINK binary format consists of 3 files; bed, bim, and
#' fam. To generate these files from PLINK, use option –make-bed. See more
#' details at: \url{http://zzz.bwh.harvard.edu/plink/data.shtml}.
#' @param cate.list (Unimplemented) A list of categorical input file (text). For
#' instance, to input 3 files, use as: files=c('input1.txt', 'input2.txt',
#' 'input3.txt').
#' @param result.dir To set an absolute path for IPCAPS2 output. If the specified output
#' directory already exists, result files are saved in sub-directories
#' cluster_out, cluster_out1, cluster_out2, etc.
#' @param threshold Cutoff value for EigenFit.
#' @param min.fst Minimum Fst between a pair of subgroups.
#' @param max.thread (Require the parallelization patch) Maximum number of
#' concurrent threads.
#' @param reanalysis (Unimplemented) To specify whether it is re-analysis or
#' not. If TRUE, it is re-analysis, otherwise it is not. Default = FALSE.
#' @param method The internal clustering method. It can be set to "mix"
#' (rubikclust & mixmod), "mixmod" (Lebret et al., 2015), "clara" (R: Clustering
#' Large Applications), "pam" (R: Partitioning Around Medoids (PAM) Object),
#' "meanshift" (Wang, 2016), "apcluster" (Bodenhofer et al., 2016), and "hclust"
#' (R: Hierarchical Clustering). Default = "mix".
#' @param min.in.group Minimum number of individuals to constitute a cluster or
#' subgroup.
#' @param datatype To specify whether the input data are 'snp' or other type.
#' Defalut = 'snp'.
#' @param nonlinear (Unimplemented) To specify whether linear or non-linear
#' method is used for IPCAPS2 analysis. If TRUE, non-linear method is used,
#' otherwise linear method is used. Default = FALSE.
#' @param missing.char Symbol used for missing genotypes. Default = NA.
#' @param regression.file A file of covariates; one covariate per column. SNPs
#' can be adjusted for these covariates via regression modeling and residual
#' computation.
#' @param regression.col.first Refer to a covariate file, the first covariate to
#' be considered as confounding variable.
#' @param regression.col.last Refer to a covariate file, the last covariate to be
#' considered as confounding variable. All the variables in between the
#' cov.col.first and cov.col.last will be considered in the adjustment process.
#' @param reg.method (Fixed) Specify a method for regression analysis.
#' Default = 'linear'.
#' @param plot.as.pdf To export plots as PDF. When omitted, plots are saved as
#' PNG.
#' @param no.plot No plot is generated if this option is TRUE. This option is
#' useful when the system does not support X Windows in the unix based system.
#' Default = FALSE.
#' @param silence.mode To enable or disable silence mode. If silence mode is
#' enabled, the fuction is processed without printing any message on the screen,
#' and it is slightly faster. Default = TRUE.
#' @param max.thread To specify a number of threads in order to run an analysis
#' in parallel. If max.thread is specified more than the maximum number of CPU
#' cores, then the maximum number of CPU cores are used instead. If max.thread
#' is specified as floating point number, it will be rounded up using the
#' function round(). Default = 0, which the maximum number of CPU cores are
#' used.
#' @param seed To specify a seed number for random generator. Default = NA,
#' which means that a seed number is automatically chose.
#'
#' @return A data frame of input data.
#'
#' @include parallelization.R
#'

preprocess <-
  function(files,
           label.file,
           lab.col,
           rdata.infile,
           bed.infile,
           cate.list,
           result.dir,
           threshold,
           min.fst,
           reanalysis = FALSE,
           method = "mix",
           min.in.group = 20,
           datatype = "snp",
           nonlinear = FALSE,
           missing.char = NA,
           regression.file = NA,
           regression.col.first = NA,
           regression.col.last = NA,
           reg.method = "linear",
           plot.as.pdf = NA,
           no.plot = NA,
           silence.mode = FALSE,
           max.thread = 0,
           seed = NULL)
  {
    if (reanalysis == FALSE)
    {
      # New analysis
      if (file.exists(result.dir))
      {
        if (!silence.mode)
          cat(paste0("Note: the directory '",
                     result.dir, "' is existed."),
              "\n")
        sub.dir <- "cluster_output"
        i <- 0
        tmp.dir <- file.path(result.dir, sub.dir)
        while (file.exists(tmp.dir))
        {
          i <- i + 1
          tmp.dir <-
            file.path(result.dir, paste0(sub.dir, i))
        }
        result.dir <- tmp.dir
      } else
      {
        dir.create(file.path(result.dir),
                   showWarnings = FALSE,
                   recursive = TRUE)
      }


      if (!silence.mode)
        cat(paste0(
          "The result files will be saved to this directory:",
          result.dir,
          "\n"
        ))
      dir.create(file.path(result.dir), showWarnings = FALSE)
      img.dir <- file.path(result.dir, "images")
      dir.create(file.path(img.dir), showWarnings = FALSE)
      rdata.dir <- file.path(result.dir, "RData")
      dir.create(file.path(rdata.dir), showWarnings = FALSE)

      # create empty file
      leaf.node <- c()
      file.name <-
        file.path(result.dir, "RData", "leafnode.RData")
      save(leaf.node, file = file.name, compress = "bzip2")
      node.list <- c()

    } else
    {
      # Old analysis
      if (file.exists(result.dir))
      {
        if (!silence.mode)
          cat(paste0("Note:  directory '",
                     result.dir, "' is existed."),
              "\n")
      } else
      {
        cat(
          paste0(
            "Error:  directory '",
            result.dir,
            "' is not existed. Please refer to a result directory."
          ),
          "\n"
        )
        quit()
      }
    }

    # Raw data
    index <- NULL
    label <- NULL

    file.name <- file.path(result.dir, "RData", "rawdata.RData")
    if (!is.na(files))
    {
      raw.data <- NULL
      snp.info <- NULL
      ind.info <- NULL

      if (!file.exists(file.name))
      {
        # import label
        test.file <- file(label.file)
        filetype <- summary(test.file)$class
        close(test.file)
        if (filetype == "gzfile")
        {
          zz <- gzfile(label.file, "rt")
          label <- read.table(zz, header = FALSE)
          close(zz)
        } else
        {
          label <- read.table(file = label.file, header = FALSE)
        }

        label <- label[, lab.col]
        # load genotype files
        if (!silence.mode)
          cat(paste0("Loading the input files ... \n"))
        for (fname in files)
        {
          # test the separator
          test.file <- file(fname)
          filetype <- summary(test.file)$class
          close(test.file)
          if (filetype == "gzfile")
          {
            zz <- gzfile(fname, "rt")
            oneline <- read.table(zz,
                                  header = FALSE,
                                  sep = ",",
                                  nrows = 1)
            close(zz)
          } else
          {
            oneline <- read.table(
              file = fname,
              header = FALSE,
              sep = ",",
              nrows = 1
            )
          }
          # separated by comma
          if (dim(oneline)[2] > 1)
          {
            test.file <- file(fname)
            filetype <- summary(test.file)$class
            close(test.file)
            if (filetype == "gzfile")
            {
              zz <- gzfile(fname, "rt")
              tmp_geno <- read.table(zz,
                                     header = FALSE,
                                     sep = ",")
              close(zz)
            } else
            {
              tmp_geno <- read.table(file = fname,
                                     header = FALSE,
                                     sep = ",")
            }
          } else
          {
            # separated by white space
            test.file <- file(fname)
            filetype <- summary(test.file)$class
            close(test.file)
            if (filetype == "gzfile")
            {
              zz <- gzfile(fname, "rt")
              tmp_geno <-
                read.table(zz, header = FALSE)
              close(zz)
            } else
            {
              tmp_geno <- read.table(file = fname, header = FALSE)
            }
          }

          raw.data <- rbind(raw.data, tmp_geno)
        }
        # data needs to be like rows = individuals, columns = markers
        raw.data <- t(raw.data)

        index <- seq(1, length(raw.data[, 1]))
        label <- unlist(label)
        label <- as.factor(label)

        tmp <- as.numeric(raw.data)
        tmp1 <- matrix(tmp, nrow = length(raw.data[, 1]))
        tmp2 <- as.data.frame(tmp1)
        rownames(tmp2) <- index
        raw.data <- tmp2

      } else
      {
        if (!silence.mode)
          cat(paste0("Loading: the file '",
                     file.name, "' is existed."),
              "\n")
        # load rawdata file to prepare node 1
        load(file.name)
        index <- seq(1, length(raw.data[, 1]))
      }
    } else if (!is.na(rdata.infile))
    {
      load(rdata.infile)
      if (!exists("raw.data"))
      {
        cat(paste0("Error: Can't find 'raw.data' in file ",
                   rdata.infile,
                   "\n"))
        quit()
      }
      if (!exists("label"))
      {
        cat(paste0("Error: Can't find 'label' in file ",
                   rdata.infile,
                   "\n"))
        quit()
      }
      index <- seq(1, length(raw.data[, 1]))

    } else if (!is.na(bed.infile))
    {
      # load BED files
      prefix <- gsub(".bed", "", bed.infile)
      bed <- paste0(prefix, ".bed")
      bim <- paste0(prefix, ".bim")
      fam <- paste0(prefix, ".fam")
      bed.data <- read.bed(bed, bim, fam, only.snp = FALSE)
      raw.data <- bed.data$snp
      snp.info <- bed.data$snp.info
      ind.info <- bed.data$ind.info
      index <- seq(1, length(raw.data[, 1]))
      raw.data <- as.data.frame(raw.data)
      rownames(raw.data) <- index

      # import label
      test.file <- file(label.file)
      filetype <- summary(test.file)$class
      close(test.file)
      if (filetype == "gzfile")
      {
        zz <- gzfile(label.file, "rt")
        label <- read.table(zz, header = FALSE)
        close(zz)
      } else
      {
        label <- read.table(file = label.file, header = FALSE)
      }
      label <- label[, lab.col]
      label <- unlist(label)
      label <- as.factor(label)

    } else if (!is.na(cate.list))
    {
      # load CATegorical files
      raw.data <- pasre.categorical.data(cate.list)
      index <- seq(1, length(raw.data[, 1]))

      # import label
      test.file <- file(label.file)
      filetype <- summary(test.file)$class
      close(test.file)
      if (filetype == "gzfile")
      {
        zz <- gzfile(label.file, "rt")
        label <- read.table(zz, header = FALSE)
        close(zz)
      } else
      {
        label <- read.table(file = label.file, header = FALSE)
      }
      label <- label[, lab.col]
      label <- unlist(label)
      label <- as.factor(label)

    } else
    {
      # No proper input file
      cat(paste0(
        "Can not find the proper input files, ",
        "please check the examples of "
      ))
      cat(paste0("ipcaps2() in order to use the function properly.\n"))
      return(NULL)
    }


    # print(dim(raw.data)) Resolve missing value by median
    X.median <- apply(raw.data, 2, median, na.rm = TRUE)
    raw.data <-
      t(apply(
        raw.data,
        1,
        replace.missing,
        missing = missing.char,
        rep = X.median
      ))

    # regression
    if ((!is.na(regression.file)) &&
        (!is.na(regression.col.first)) &&
        !(is.na(regression.col.last)))
    {
      test.file <- file(regression.file)
      filetype <- summary(test.file)$class
      close(test.file)
      if (filetype == "gzfile")
      {
        zz <- gzfile(regression.file, "rt")
        PCs <- read.table(zz, header = FALSE, sep = "")
        close(zz)
      } else
      {
        PCs <- read.table(file = regression.file,
                          header = FALSE,
                          sep = "")
      }
      PCs <-
        data.matrix(PCs[, regression.col.first:regression.col.last])
      # print(dim(raw.data))
      if (!silence.mode)
        cat(paste0("Correct for covariates using ",
                   reg.method, " model\n"))
      raw.data <-
        apply(raw.data, 2, do.glm, PC = PCs, method = reg.method)
      # print(dim(raw.data)) raw.data = lm(as.matrix(raw.data) ~ PCs,
      # na.action=na.exclude)$residuals
    }

    if (exists("snp.info"))
    {
      save(raw.data,
           label,
           snp.info,
           ind.info,
           file = file.name,
           compress = "bzip2")
    } else
    {
      save(raw.data, label, file = file.name, compress = "bzip2")
    }
    no.marker <- dim(raw.data)[2]
    no.individual <- dim(raw.data)[1]
    if (!silence.mode)
      cat(paste0(
        "Input data: ",
        no.individual,
        " individuals, ",
        no.marker,
        " markers\n"
      ))
    # Save new experiment condition
    file.name <-
      file.path(result.dir, "RData", "condition.RData")
    # load some parameters to add more parameters
    save(
      threshold,
      min.fst,
      no.marker,
      no.individual,
      reanalysis,
      result.dir,
      method,
      min.in.group,
      datatype,
      nonlinear,
      plot.as.pdf,
      no.plot,
      file = file.name,
      compress = "bzip2",
      silence.mode,
      max.thread,
      seed
    )


    # Check if node 1 is existed
    file.name <- file.path(result.dir, "RData", "node1.RData")
    if (!file.exists(file.name))
    {
      #confident.list = NULL
      confident.list = rep(0, length(index))
      save(index,
           confident.list,
           file = file.name,
           compress = "bzip2")
    }

    return(result.dir)
  }

#' (Internal function) Manipulate categorical input files
#'
#' @param files ipcaps supports categorical data, which rows represent features
#' and columns represent subjects or individuals. Each column needs to be
#' separated by a space or a tab. A big text file should be divided into smaller
#' files to load faster. For instance, to input 3 files, use as: files=c(
#' 'input1.txt', 'input2.txt', 'input3.txt').
#'
#' @return A data frame of input data
#'
#' @import utils

pasre.categorical.data <- function(files)
{
  map.to.zero.one.list <- function(cate.data, uni.cate)
  {
    ret <- c()
    ar.uni.cate <- strsplit(uni.cate, "#@")[[1]]
    for (i in seq(1, length(ar.uni.cate)))
    {
      tmp <- rep(0, length(cate.data))
      tmp[which(cate.data == ar.uni.cate[i])] <- 1
      ret <- cbind(ret, tmp)
    }

    return(ret)
  }

  map.to.zero.one.matrix <- function(cate.data, uni.cate)
  {
    ret <- c()
    ar.uni.cate <- strsplit(uni.cate, "#@")[[1]]
    for (i in seq(1, length(ar.uni.cate)))
    {
      tmp <- rep(0, length(cate.data))
      tmp[which(cate.data == ar.uni.cate[i])] <- 1
      ret <- cbind(ret, tmp)
    }
    drop.col <- which(colSums(ret) == max(colSums(ret)))
    ret <- ret[,-c(drop.col)]
    return(list(test = ret))
  }

  silence.mode <- NULL

  raw.data <- NULL
  if (!silence.mode)
    cat(paste0("Loading the input files ... \n"))

  for (fname in files)
  {
    # test the separator
    test.file <- file(fname)
    filetype <- summary(test.file)$class
    close(test.file)
    if (filetype == "gzfile")
    {
      zz <- gzfile(fname, "rt")
      oneline <-
        read.table(zz,
                   header = FALSE,
                   sep = ",",
                   nrows = 1)
      close(zz)
    } else
    {
      oneline <- read.table(
        file = fname,
        header = FALSE,
        sep = ",",
        nrows = 1
      )
    }
    # separated by comma
    if (dim(oneline)[2] > 1)
    {
      test.file <- file(fname)
      filetype <- summary(test.file)$class
      close(test.file)
      if (filetype == "gzfile")
      {
        zz <- gzfile(fname, "rt")
        tmp_geno <-
          read.table(zz, header = FALSE, sep = ",")
        close(zz)
      } else
      {
        tmp_geno <- read.table(file = fname,
                               header = FALSE,
                               sep = ",")
      }
    } else
    {
      # separated by white space
      test.file <- file(fname)
      filetype <- summary(test.file)$class
      close(test.file)
      if (filetype == "gzfile")
      {
        zz <- gzfile(fname, "rt")
        tmp_geno <- read.table(zz, header = FALSE)
        close(zz)
      } else
      {
        tmp_geno <- read.table(file = fname, header = FALSE)
      }
    }

    raw.data <- rbind(raw.data, tmp_geno)
  }

  n.row <- dim(raw.data)[1]
  uni.cate <- apply(raw.data, 2, unique)
  if (is.list(uni.cate))
  {
    uni.cate <- mapply(paste, uni.cate, sep = "#@", collapse = "#@")
    raw.data <- mapply(map.to.zero.one.list,
                       cate.data = raw.data,
                       uni.cate = uni.cate)
  } else
  {
    uni.cate <- apply(uni.cate,
                      2,
                      paste,
                      sep = "#@",
                      collapse = "#@")
    raw.data <- mapply(map.to.zero.one.matrix,
                       cate.data = raw.data,
                       uni.cate = uni.cate)
  }

  raw.data <- unlist(raw.data)
  raw.data <- matrix(raw.data, nrow = n.row, byrow = FALSE)

  raw.data <- as.data.frame(raw.data)
  index <- seq(1, length(raw.data[, 1]))
  rownames(raw.data) <- index
  return(raw.data)
}
kridsadakorn/ipcaps2 documentation built on June 11, 2022, 8:35 p.m.