R/top.discriminator.R

Defines functions top.discriminator

Documented in top.discriminator

#' Detecting top discriminators between two groups
#'
#' @description Detects top discriminators that contribute to
#' group separation based on the fixation index (Fst).
#'
#' @param cluster.obj The object which is returned from \code{\link{ipcaps}}.
#' This parameter is used when \code{use.path} is FALSE.
#' @param group1 To specify the first group number to be compared. (also see
#' \code{use.node.number})
#' @param group2 To specify the second group number to be compared. (also see
#' \code{use.node.number})
#' @param bim.file Option: In case that SNP information is not provided to
#' \code{\link{ipcaps}}, an absolute path of SNP information file is required.
#' It needs to be in PLINK format (bim). See more details at:
#' \url{http://zzz.bwh.harvard.edu/plink/data.shtml}.
#' @param use.node.number To specify whether a group number or a node number
#' is be used. If TRUE, a node nubmer is used instead. Default = FALSE.
#' @param num.top A number of top Fst SNPs to be returned. This parameter is
#' used when \code{percentile} is FALSE. Default = 100.
#' @param percentile A percentile for top Fst SNPs to be returned. This
#' parameter is used when \code{percentile} is TRUE. Default = 0.999.
#' @param use.percentile A logical value to indicate whether \code{percentile}
#' is used instead of \code{num.top}. This parameter is used when
#' \code{percentile} is TRUE. Default = FALSE.
#' @param use.path A logical value to indicate whether \code{result.path} is
#' used instead of \code{cluster.obj}. Importantly, \code{result.path} needs to
#' be set. This parameter only work with the ipcaps's result from version 1.1.7
#' onward. Default = FALSE.
#' @param result.path A path to an result directory of ipcaps. This parameter is
#' used when \code{use.path} is TRUE. This parameter only work with the ipcaps's
#' result from version 1.1.7 onward.
#'
#' @return The returned value is a data.frame of SNP information sorting by Fst
#' in descending order, which contains 7 columns, chr, SNP, centimorgans,
#' position, allele1, allele2, and Fst. The column 1-6 are SNP information from
#' the bim file. The column Fst contains estimated Fst between group1 and
#' group2.
#'
#' @export
#'
#' @examples
#'
#' # Importantly, bed file, bim file, and fam file are required
#' # Use the example files embedded in the package
#'
#' BED.file <- system.file("extdata",
#'                         "ipcaps_example.bed",
#'                         package = "IPCAPS.BIOC")
#'
#' LABEL.file <- system.file("extdata",
#'                           "ipcaps_example_individuals.txt.gz",
#'                           package = "IPCAPS.BIOC")
#'
#' my.cluster <- ipcaps(bed = BED.file,
#'                      label.file = LABEL.file,
#'                      lab.col = 2,
#'                      out = tempdir(),
#'                      max.thread = 1,
#'                      seed = 1234)
#'
#' table(my.cluster$cluster$group,my.cluster$cluster$label)
#' #   outlier6 pop1 pop2 pop3 pop4 pop5
#' # 1        2    0    0    0    0    0
#' # 2        0    0    0    0    0  200
#' # 3        2    0    0    0  200    0
#' # 4        0    0    0  199    0    0
#' # 5        0  199    4    0    0    0
#' # 6        0    1  196    1    0    0
#'
#' #Identify top discriminators between groups, for example, group 4 and group 5
#' top.snp1 <- top.discriminator(my.cluster, 4, 5)
#' head(top.snp1)
#' #or, specify the bim file
#' BIM.file <- system.file("extdata",
#'                         "ipcaps_example.bim",
#'                         package = "IPCAPS.BIOC")
#'
#' top.snp2 <- top.discriminator(my.cluster,
#'                               4,
#'                               5,
#'                               bim.file = BIM.file)
#' head(top.snp2)
#' # chr SNP centimorgans position allele1 allele2 Fst
#' #V5452 1 marker5452 0 54520000 A T 0.11337260
#' #V2348 1 marker2348 0 23480000 A T 0.11194490
#' #V8244 1 marker8244 0 82440000 A T 0.09556580
#' #V5972 1 marker5972 0 59720000 A T 0.08747794
#' #V3561 1 marker3561 0 35610000 A T 0.08725860
#' #V8419 1 marker8419 0 84190000 A T 0.08293494
#'
#' #Alternatively, specify the previous result directory of ipcaps and identify
#' #top discriminators between groups, for example, group 4 and group 5
#' previous.res.path <- my.cluster$output.dir
#' top.snp3 <-top.discriminator(result.path = previous.res.path,
#'                             use.path = TRUE,
#'                             group1 = 4,
#'                             group2 = 5)
#'
#' head(top.snp3)
#'
#' #Identify top discriminators between groups, for example, group 4 and group 5
#' top.snp4 <- top.discriminator(my.cluster,4,5)
#' head(top.snp4)
#' #or, specify the bim file
#' BIM.file <- system.file("extdata",
#'                         "ipcaps_example.bim",
#'                         package = "IPCAPS.BIOC")
#'
#' top.snp5 <- top.discriminator(my.cluster,
#'                              4,
#'                              5,
#'                              bim.file = BIM.file)
#'
#' dim(top.snp5)
#' head(top.snp5)

#'
#' #Here, it is possible to select the top Fst SNPs based on a percentile.
#' top.snp6 <-top.discriminator(my.cluster,
#'                             4,
#'                             5,
#'                             percentile = 0.9,
#'                             use.percentile = TRUE)
#'
#' dim(top.snp6)
#' head(top.snp6)
#'
#' #Identify top discriminators between groups, for example, node 7 and node 8
#' top.snp7 <-top.discriminator(my.cluster,
#'                              7,
#'                              8,
#'                              use.node.number = TRUE)
#'
#' head(top.snp7)
#'
#' # chr SNP centimorgans position allele1 allele2 Fst
#' #V5452 1 marker5452 0 54520000 A T 0.11337260
#' #V2348 1 marker2348 0 23480000 A T 0.11194490


top.discriminator <-
    function(cluster.obj = NULL,
            group1,
            group2,
            bim.file,
            use.node.number = FALSE,
            num.top = 100,
            percentile = 0.9,
            use.percentile = FALSE,
            use.path = FALSE ,
            result.path = NULL) {
        raw.data <- NULL
        used.path <- NULL

        if (is.null(cluster.obj) && is.null(result.path)) {
            cat(
                paste0(
                    "Incorrect parameter, please use the object ",
                    "returned from the ",
                    "function ipcaps as an input\n"
                )
            )
            return(NULL)
        }

        if (use.path == TRUE) {
            used.path = result.path
            if (!dir.exists(used.path)) {
                cat(paste0(
                    "The result path doesn't exist, please check result.path\n"
                ))
                return(NULL)
            }
            result.filename = file.path(used.path, "RData", "result.RData")
            load(result.filename)
        } else{
            used.path = cluster.obj$output.dir
        }


        raw.filename = file.path(used.path, "RData", "rawdata.RData")
        if (!file.exists(raw.filename)) {
            cat(paste0("Not found the rawdata file: ", raw.filename, "\n"))
            return(NULL)
        } else{
            load(raw.filename)
            if (is.null(snp.info)) {
                if (!file.exists(bim.file)) {
                    cat(paste0("Not found the bim file: ", bim.file, "\n"))
                    return(NULL)
                } else{
                    snp.info <-
                        read.table(
                            file = bim.file,
                            colClasses = c(
                                'factor',
                                'factor',
                                'factor',
                                'factor',
                                'factor',
                                'factor'
                            )
                        )
                }
            }
        }


        if (num.top < 0) {
            cat(paste0("num.top must be more than zero\n"))
            return(NULL)
        }

        if (use.node.number == FALSE) {
            index1 <-
                cluster.obj$cluster$row.number[
                    which(cluster.obj$cluster$group == group1)]
            index2 <-
                cluster.obj$cluster$row.number[
                    which(cluster.obj$cluster$group == group2)]
        } else{
            index1 <-
                cluster.obj$cluster$row.number[
                    which(cluster.obj$cluster$node == group1)]
            index2 <-
                cluster.obj$cluster$row.number[
                    which(cluster.obj$cluster$node == group2)]
        }

        all.fst <- fst.each.snp.hudson(raw.data, index1, index2)

        snp.with.fst <- cbind(snp.info, all.fst)
        colnames(snp.with.fst) <-
            c('chr',
            'SNP',
            'centimorgans',
            'position',
            'allele1',
            'allele2',
            'Fst')
        top.fst.snp <- snp.with.fst[order(-snp.with.fst$Fst),]

        if (use.percentile == FALSE) {
            ret <- head(top.fst.snp, n = num.top)
        } else{
            qfst = quantile(snp.with.fst$Fst,
                            c(as.double(percentile)),
                            na.rm = TRUE)
            ret = top.fst.snp[which(top.fst.snp$Fst > qfst),]
        }
        return(ret)
    }
kridsadakorn/ipcaps.bioc documentation built on Jan. 22, 2020, 11:18 p.m.