R/preprocess.R
In IPCAPS: Iterative Pruning to Capture Population Structure

Documented in pasre.categorical.data preprocess

#' (Internal function) Perform the pre-processing step of IPCAPS
#'
#' @param files IPCAPS supports SNPs encoded as 0, 1 and 2 (dosage encoding).
#' Rows represent SNPs and columns represent subjects. Each column needs to be
#' separated by a space or a tab. A big text file should be divided into smaller
#' files to load faster. For instance, to input 3 files, use as: files=c(
#' 'input1.txt', 'input2.txt', 'input3.txt').
#' @param label.file An additional useful information (called "labels" in
#' IPCAPS) related subject, for example, geographic location or disease
#' phenotype. These labels (one at a time) are used in displaying the clustering
#' outcome of IPCAPS. A label file must contain at least one column. However,
#' it may contain more than one column in which case each column need to be
#' separated by a space or a tab.
#' @param lab.col The label in the label file to be used in the tree-like
#' display of IPCAPS clustering results.
#' @param rdata.infile In case of re-analysis, it is convenient to run IPCAPS
#' using the file rawdata.RData generated by IPCAPS. This file contains a matrix
#' of SNPs
#' (raw.data) and a vector of labels (label).
#' @param bed.infile A PLINK binary format consists of 3 files; bed, bim, and
#' fam. To generate these files from PLINK, use option –make-bed. See more
#' details at: \url{http://zzz.bwh.harvard.edu/plink/data.shtml}.
#' @param cate.list (Unimplemented) A list of categorical input file (text). For
#' instance, to input 3 files, use as: files=c('input1.txt', 'input2.txt',
#' 'input3.txt').
#' @param result.dir To set an absolute path for IPCAPS output. If the specified output
#' directory already exists, result files are saved in sub-directories
#' cluster_out, cluster_out1, cluster_out2, etc.
#' @param threshold Cutoff value for EigenFit.
#' @param min.fst Minimum Fst between a pair of subgroups.
#' @param max.thread (Require the parallelization patch) Maximum number of
#' concurrent threads.
#' @param reanalysis (Unimplemented) To specify whether it is re-analysis or
#' not. If TRUE, it is re-analysis, otherwise it is not. Default = FALSE.
#' @param method The internal clustering method. It can be set to "mix"
#' (rubikclust & mixmod), "mixmod" (Lebret et al., 2015), "clara" (R: Clustering
#' Large Applications), "pam" (R: Partitioning Around Medoids (PAM) Object),
#' "meanshift" (Wang, 2016), "apcluster" (Bodenhofer et al., 2016), and "hclust"
#' (R: Hierarchical Clustering). Default = "mix".
#' @param min.in.group Minimum number of individuals to constitute a cluster or
#' subgroup.
#' @param datatype To specify whether the input data are 'snp' or other type.
#' Defalut = 'snp'.
#' @param nonlinear (Unimplemented) To specify whether linear or non-linear
#' method is used for IPCAPS analysis. If TRUE, non-linear method is used,
#' otherwise linear method is used. Default = FALSE.
#' @param missing.char Symbol used for missing genotypes. Default = NA.
#' @param regression.file A file of covariates; one covariate per column. SNPs
#' can be adjusted for these covariates via regression modeling and residual
#' computation.
#' @param regression.col.first Refer to a covariate file, the first covariate to
#' be considered as confounding variable.
#' @param regression.col.last Refer to a covariate file, the last covariate to be
#' considered as confounding variable. All the variables in between the
#' cov.col.first and cov.col.last will be considered in the adjustment process.
#' @param reg.method (Fixed) Specify a method for regression analysis.
#' Default = 'linear'.
#' @param plot.as.pdf To export plots as PDF. When omitted, plots are saved as
#' PNG.
#' @param no.plot No plot is generated if this option is TRUE. This option is
#' useful when the system does not support X Windows in the unix based system.
#' Default = FALSE.
#'
#' @return A data frame of input data.
#'
#' @include parallelization.R
#'

preprocess <- function( files, label.file, lab.col, rdata.infile, bed.infile, cate.list,
                        result.dir, threshold, min.fst, max.thread=NA, reanalysis=FALSE, method="mix",
                        min.in.group=20, datatype="snp", nonlinear = FALSE ,missing.char=NA,
                        regression.file=NA,regression.col.first=NA,regression.col.last=NA,
                        reg.method="linear", plot.as.pdf=NA,no.plot=NA){

  if (reanalysis==FALSE){
    #New analysis
    if (file.exists(result.dir)){
      cat(paste0("Note: the directory '",result.dir,"' is existed."),"\n")
      sub.dir = "cluster_output"
      i = 0
      tmp.dir = file.path(result.dir,sub.dir)
      while (file.exists(tmp.dir)){
        i = i + 1
        tmp.dir = file.path(result.dir,paste0(sub.dir,i))
      }
      result.dir = tmp.dir
    }else{
      dir.create(file.path(result.dir), showWarnings = FALSE, recursive = TRUE)
    }


    cat("The result files will be saved to this directory:",result.dir,"\n")
    dir.create(file.path(result.dir), showWarnings = FALSE)
    img.dir = file.path(result.dir,"images")
    dir.create(file.path(img.dir), showWarnings = FALSE)
    rdata.dir = file.path(result.dir,"RData")
    dir.create(file.path(rdata.dir), showWarnings = FALSE)

    #create empty file
    leaf.node = c()
    file.name = file.path(result.dir,"RData","leafnode.RData")
    save(leaf.node, file=file.name, compress = 'bzip2')
    node.list = c()

  }else{
    #Old analysis
    if (file.exists(result.dir)){
      cat(paste0("Note:  directory '",result.dir,"' is existed."),"\n")
    }else{
      cat(paste0("Error:  directory '",result.dir,"' is not existed. Please refer to a result directory."),"\n")
      quit()
    }
  }

  #Raw data
  index = NULL
  file.name = file.path(result.dir,"RData","rawdata.RData")
  if (!is.na(files)){
    label = NULL
    raw.data = NULL
    snp.info = NULL
    ind.info = NULL

    if (!file.exists(file.name)){
      #import label
      label = read.table(file=label.file, header=FALSE)
      label = label[,lab.col]
      #load genotype files
      cat(paste0("Loading the input files ... \n"))
      for (fname in files){
        #test the separator
        oneline = read.table(file=fname, header=FALSE, sep=',',nrows=1)
        #separated by comma
        if (dim(oneline)[2]>1){
          tmp_geno = read.table(file=fname, header=FALSE, sep=',')
        }else{
          #separated by white space
          tmp_geno = read.table(file=fname, header=FALSE)
        }

        raw.data = rbind(raw.data,tmp_geno)
      }
      # data needs to be like rows = individuals, columns = markers
      raw.data = t(raw.data)

      index = seq(1,length(raw.data[,1]))
      label = unlist(label)
      label = as.factor(label)

      tmp = as.numeric(raw.data)
      tmp1 = matrix(tmp,nrow=length(raw.data[,1]))
      tmp2 = as.data.frame(tmp1)
      rownames(tmp2) = index
      raw.data = tmp2

    }else{
      cat(paste0("Loading: the file '",file.name,"' is existed."),"\n")
      #load rawdata file to prepare node 1
      load(file.name)
      index = seq(1,length(raw.data[,1]))
    }
  }else if (!is.na(rdata.infile)){
    load(rdata.infile)
    if (!exists("raw.data")){
      cat(paste0("Error: Can't find 'raw.data' in file ",rdata.infile,"\n"))
      quit()
    }
    if (!exists("label")){
      cat(paste0("Error: Can't find 'label' in file ",rdata.infile,"\n"))
      quit()
    }
    index = seq(1,length(raw.data[,1]))

  }else if (!is.na(bed.infile)){
    #load BED files
    prefix = gsub('.bed','',bed.infile)
    bed = paste0(prefix,".bed")
    bim = paste0(prefix,".bim")
    fam = paste0(prefix,".fam")
    bed.data = read.bed(bed,bim,fam,only.snp=FALSE)
    raw.data = bed.data$snp
    snp.info = bed.data$snp.info
    ind.info = bed.data$ind.info
    index = seq(1,length(raw.data[,1]))
    raw.data = as.data.frame(raw.data)
    rownames(raw.data) = index

    #import label
    label = read.table(file=label.file, header=FALSE)
    label = label[,lab.col]
    label = unlist(label)
    label = as.factor(label)

  }else if (!is.na(cate.list)){
    #load CATegorical files
    raw.data = pasre.categorical.data(cate.list)
    index = seq(1,length(raw.data[,1]))

    #import label
    label = read.table(file=label.file, header=FALSE)
    label = label[,lab.col]
    label = unlist(label)
    label = as.factor(label)

  }else{
    #No proper input file
    cat(paste0("Not found proper input files, please check the examples of "))
    cat(paste0("ipcaps() in order to use the function properly.\n"))
    return(NULL)
  }


  #print(dim(raw.data))
  #Resolve missing value by median
  X.median = apply(raw.data,2,median,na.rm=TRUE)
  raw.data = t(apply(raw.data,1,replace.missing,missing=missing.char,rep=X.median))

  #regression
  if ((!is.na(regression.file)) && (!is.na(regression.col.first)) && !(is.na(regression.col.last))){
    PCs = read.table(file=regression.file, header=FALSE, sep='')
    PCs = data.matrix(PCs[,regression.col.first:regression.col.last])
    #print(dim(raw.data))
    cat(paste0("Correct for covariates using ",reg.method," model\n"))
    raw.data = apply(raw.data,2,do.glm,PC=PCs,method=reg.method)
    #print(dim(raw.data))
    #raw.data = lm(as.matrix(raw.data) ~ PCs, na.action=na.exclude)$residuals
  }

  if (exists("snp.info")){
    save(raw.data,label,snp.info,ind.info,file=file.name, compress = 'bzip2')
  }else{
    save(raw.data,label,file=file.name, compress = 'bzip2')
  }

  no.marker = dim(raw.data)[2]
  no.individual = dim(raw.data)[1]
  cat(paste0("Input data: ",no.individual," individuals, ",no.marker," markers\n"))
  #Save new experiment condition
  file.name = file.path(result.dir,"RData","condition.RData")
  #load some parameters to add more parameters
  save(threshold,min.fst,max.thread,no.marker,no.individual,reanalysis,result.dir,
       method,min.in.group,datatype,nonlinear,plot.as.pdf,no.plot,file=file.name,
       compress = 'bzip2')


  #Check if node 1 is existed
  file.name = file.path(result.dir,"RData","node1.RData")
  if (!file.exists(file.name)){
    save(index,file=file.name, compress = 'bzip2')
  }

  return(result.dir)
}

#' (Internal function) Manipulate categorical input files
#'
#' @param files IPCAPS supports categorical data, which rows represent features
#' and columns represent subjects or individuals. Each column needs to be
#' separated by a space or a tab. A big text file should be divided into smaller
#' files to load faster. For instance, to input 3 files, use as: files=c(
#' 'input1.txt', 'input2.txt', 'input3.txt').
#'
#' @return A data frame of input data
#'
#' @import utils

pasre.categorical.data <- function(files){

  map.to.zero.one.list <- function(cate.data,uni.cate){
    ret = c()
    ar.uni.cate=strsplit(uni.cate,'#@')[[1]]
    for (i in 1:length(ar.uni.cate)){
      tmp = rep(0,length(cate.data))
      tmp[which(cate.data == ar.uni.cate[i])] = 1
      ret = cbind(ret,tmp)
    }

    return(ret)
  }

  map.to.zero.one.matrix <- function(cate.data,uni.cate){
    ret = c()
    ar.uni.cate=strsplit(uni.cate,'#@')[[1]]
    for (i in 1:length(ar.uni.cate)){
      tmp = rep(0,length(cate.data))
      tmp[which(cate.data == ar.uni.cate[i])] = 1
      ret = cbind(ret,tmp)
    }
    drop.col = which(colSums(ret) == max(colSums(ret)))
    ret = ret[,-c(drop.col)]
    return(list('test'=ret))
  }

  raw.data = NULL
  cat(paste0("Loading the input files ... \n"))

  for (fname in files){
    #test the separator
    oneline = read.table(file=fname, header=FALSE, sep=',',nrows=1)
    #separated by comma
    if (dim(oneline)[2]>1){
      tmp_geno = read.table(file=fname, header=FALSE, sep=',')
    }else{
      #separated by white space
      tmp_geno = read.table(file=fname, header=FALSE)
    }

    raw.data = rbind(raw.data,tmp_geno)
  }

  n.row = dim(raw.data)[1]
  uni.cate = apply(raw.data,2,unique)
  if (is.list(uni.cate)){
    uni.cate = mapply(paste,uni.cate,sep="#@",collapse="#@")
    raw.data = mapply(map.to.zero.one.list,cate.data=raw.data,uni.cate=uni.cate)
  }else{
    uni.cate = apply(uni.cate,2,paste,sep="#@",collapse="#@")
    raw.data = mapply(map.to.zero.one.matrix,cate.data=raw.data,uni.cate=uni.cate)
  }

  raw.data = unlist(raw.data)
  raw.data = matrix(raw.data, nrow=n.row, byrow=F)

  raw.data = as.data.frame(raw.data)
  index = seq(1,length(raw.data[,1]))
  rownames(raw.data) = index
  return(raw.data)
}