R/geno_012.R

############
# geno_012 #
############

#' Marker scores into 0, 1, 2 format
#'
#' Transform genotype marker score into 0, 1, 2 format. The score represents the
#' number of copies of the allele with the lowest marker allele frequency (MAF).
#'
#' @param mk.mat \code{Character} genotype  marker score \code{matrix}.
#' \strong{Marker scores must be coded using one letter for each allele.
#' For example, AA, CC, GG, TT, AC, AG, AT, CA, CG, CT, GA, GC, GT, TA, TC, TG.
#' Missing values must be coded NA.}
#'
#' @return Return:
#'
#' \code{List} with two matrices
#'
#' \item{geno012}{Marker \code{matrix} with marker scores coded as 0, 1, 2
#' corresponding to the number of copies of the least frequent SNP allele.}
#'
#' \item{all.ref}{\code{matrix} with reference allele scores. The first row
#' represents the allele with the highest MAF, the second the one with the
#' lowest MAF and the two others the heterozygous scores.}
#'
#' @author Vincent Garin
#'
#' @examples
#'
#' Later
#'
#' @export
#'


geno_012 <- function(mk.mat) {

  geno.names <- rownames(mk.mat)

  # 1. Establish the allele with the highest MAF
  ##############################################

  ### 1.1 function to apply along the matrix

  # The function return vector of four characters. The first one represent
  # the allele with the highest MAF. Then the second one and the two
  # heterozygous


  allele.MAF <- function(x) {

    x <- x[!is.na(x)] # remove missing values

    if (length(x) == 0){ # Only missing values

      return(c(NA, NA, NA, NA))

    } else {

      # split all marker score into single allele scores

      alleles <- c(vapply(X = x, FUN = function(x) unlist(strsplit(x, split = "")),
                          FUN.VALUE = character(2)))

      alleles.sum <- table(alleles)

      if (length(alleles.sum) == 1) { # monomorphic case

        return(c(NA, NA, NA, NA))

      } else {

        all.sc <- unlist(attr(sort(alleles.sum, decreasing = FALSE), "dimnames"))
        all.sc <- c(paste0(all.sc[1], all.sc[1]), paste0(all.sc[2], all.sc[2]),
                    paste0(all.sc[1], all.sc[2]), paste0(all.sc[2], all.sc[1]))

        return(all.sc)

      }

    }

  }

  ### 1.2 computation of the allele with the highest MAF

  all.class <- apply(X = mk.mat, MARGIN = 2, FUN = allele.MAF)


  # 2. transform the marker scores. Given the allele with the highest MAF
  #######################################################################

  trans012 <- function(x, ref) {

    if(is.na(ref[1])){

      x <- rep(NA, length(x))

    } else {

      x[is.na(x)] <- NA
      x[x == ref[1]] <- 2
      x[x == ref[2]] <- 0
      x[x == ref[3]] <- 1
      x[x == ref[4]] <- 1

    }

    return(as.numeric(x))

  }

  mk.mat.list <- data.frame(mk.mat, stringsAsFactors = FALSE)
  all.class.list <- data.frame(all.class, stringsAsFactors = FALSE)

  geno012 <- mapply(FUN = trans012, x = mk.mat.list, ref = all.class.list)
  rownames(geno012) <- geno.names

  results <- list(geno012 = geno012, all.ref = all.class)

  return(results)

}
vincentgarin/GWASToolBox documentation built on May 6, 2019, 8:59 p.m.