Siccuracy: Pipeline Package for AlphaImpute

# Phase-o-to-geno #####################

#' Convert phase files to genotype files.
#' 
#' Simply sums every pair of rows together.
#' 
#' A phase file has format similar to SNP files, expect the genotype on each allele are listed on two pairs of rows.
#' Each individual has therefore two rows, one for each allele, see following example:
#'
#' \code{Genotype file:}
#' \tabular{lccccc}{
#'  1003 \tab 0 \tab 1 \tab 1 \tab 2 \tab 0 \cr
#'  1004 \tab 1 \tab 1 \tab 0 \tab 1 \tab 0 \cr
#' }
#'
#' \code{Phase file:}
#' \tabular{lccccc}{
#'  1003 \tab 0 \tab 0 \tab 1 \tab 1 \tab 0 \cr
#'  1003 \tab 0 \tab 1 \tab 0 \tab 1 \tab 0 \cr
#'  1004 \tab 0 \tab 1 \tab 0 \tab 0 \tab 0 \cr
#'  1004 \tab 1 \tab 0 \tab 0 \tab 1 \tab 0 \cr
#' }
#'
#' By default assumes genotypes are whole integers (cf. \code{int} argument).
#' If numeric values are required, e.g. gene dosages from imputation, use \code{int=FALSE}. 
#' Use \code{format} to change from default 5 characters width and 2 decimals (\code{"5.2"}). 
#' NB! Function does not test for validity of this argument, so change with caution!
#'
#' \strong{Missing values:} Values after summing less than 0 or greater than 2 are assumed as missing and replaced with \code{na}.
#'
#' @param fn Filename of input file, every two rows are for same animal.
#' @param outfn Filename of intended output.
#' @param ncol Integer, number of SNP columns in files. When \code{NULL}, automagically detected with \code{get_ncols(phasefn)-1}.
#' @param nlines Integer, maximum number of pairs of lines to convert.
#' @param na Missing value.
#' @param int Logical (default \code{TRUE}), read and write integers.
#' @param format Character, Fortran edit descriptors for output. See \link{parseformat}.
#' @return Number of rows written.
#' @export
convert_phases <- function(fn, outfn, ncol=NULL, nlines=NULL, na=9, int=TRUE, format=NULL) {
  stopifnot(file.exists(fn))
  
  if (is.null(ncol)) ncol <- get_ncols(fn)-1
  if (is.null(nlines)) nlines <- 0
  
  format <- parse.format(format, int)
  
  # subroutine convert_phase(phasefn, genofn, ncol, nrow, na, int, lenfmt, userfmt)
  res <- .Fortran('convert_phase', 
                  phasefn=as_fortran_character(fn), 
                  genofn=as_fortran_character(outfn), 
                  ncol=as.integer(ncol), 
                  nrow=as.integer(nlines), 
                  na=as.integer(na), 
                  int=as.integer(int), 
                  lenfmt=as.integer(nchar(format)), userfmt=as.character(format))
  res$nrow
}


# Convert plink A format #########################

#' Convert PLINK recoded A to SNP file.
#' 
#' Facilitates converting a PLINK binary file to simplified SNP file format.
#' Requires using PLINK to recode it to the \code{A} format by using command line \code{plink -bfile <file stem> --recode A}.
#' This function then swiftly strips of first 6 columns (family ID, sample ID, paternal ID, maternal ID, sex, phenotypic record) 
#' and inserts an integer-based ID column. \code{NA}'s outputted from PLINK are replaced with \code{na} argument.
#' 
#' @param rawfn Plink output filename. Most likely \code{plink.raw} if PLINK command line argument \code{--out} is not used.
#' @param outfn Filename of new file.
#' @param newID Integer scalar (default \code{0}) for automatically assigning new IDs. See description for more. 
#' @param ncol Integer,number of SNP columns in \code{rawfn}. When \code{NULL}, automagically detected with \code{get_ncols(rawfn)-6}.
#' @param nlines Number of lines to process.
#' @param na Missing value, 
#' @inheritSection convert_plink Assigning new IDs
#' @return Data.frame with columns \code{famID}, \code{sampID}, and \code{newID}.
#' @references 
#' \itemize{
#'  \item PLINK. Purcell and Chang. \url{https://www.cog-genomics.org/plink2}
#'  \item Chang CC, Chow CC, Tellier LCAM, Vattikuti S, Purcell SM, Lee JJ (2015) Second-generation PLINK: rising to the challenge of larger and richer datasets. GigaScience, 4. doi: \href{https://doi.org/10.1186/s13742-015-0047-8}{10.1186/s13742-015-0047-8} \href{http://gigascience.biomedcentral.com/articles/10.1186/s13742-015-0047-8}{link}.
#' }
#' @export
#' @seealso 
#' \code{\link{convert_plink}} is a direct conversion that does not rely on PLINK.
convert_plinkA <- function(rawfn, outfn, newID=0, ncol=NULL, nlines=NULL, na=9) {
  stopifnot(file.exists(rawfn))
  
  if (is.data.frame(newID)) stopifnot(all(c('famID','sampID','newID') %in% names(newID)))
  #if (is.null(ncol)) ncol <- get_ncols(rawfn) - 6
  
  firstline <- scan(rawfn, what=character(), nlines=1, quiet=TRUE)
  if (is.null(ncol)) ncol <- length(firstline) - 6
  header <- all(firstline[1:3] == c('FID','IID','PAT'))
  
  firstcols <- get_firstcolumn(rawfn, class=list('character','character'), col.names=c('famID','sampID'), header=header)
  if (is.null(nlines)) {
    if (length(newID) == 1) {
      nlines <- nrow(firstcols)
    } else if (is.data.frame(newID)) {
      nlines = nrow(newID)
    } else {
      nlines = length(newID)
    }
  }
  
  if (length(newID) == 1) {
    .newID <- firstcols
    .newID$newID <- 1:nrow(.newID) + newID
  } else if (is.atomic(newID)) {
    .newID <- cbind(firstcols[1:nlines,], newID[1:nlines])
  } else {
    newID <- do.call(data.frame, append(lapply(newID, as.character), list(stringsAsFactors=FALSE)))
    .newID <- merge(firstcols, newID, by=c('famID','sampID'), sort=FALSE, all.x=TRUE, all.y=FALSE, stringsAsFactors=FALSE)
  }
  
  nlines <- min(nlines, nrow(.newID))
  newID <- .newID[1:nlines,]
  newID$newID <- as.integer(newID$newID)
  
  #subroutine convert_plinkA(rawfn, outputfn, newID, ncol, nrow, naval, stat) 
  res <- .Fortran('convertplinka', 
                  rawfn=as_fortran_character(rawfn), 
                  outputfn=as_fortran_character(outfn), 
                  newID=as.integer(newID$newID),
                  ncol=as.integer(ncol), 
                  nrow=as.integer(nlines), 
                  naval=as.integer(na), 
                  header=as.integer(header),
                  stat=integer(1), 
                  PACKAGE='Siccuracy', NAOK=TRUE)
  
  newID
}


# Convert PLINK binary files ########
# Our understanding of the different filetypes.
# The binary bim/bed/ped format lists for each loci (in bm-file) the major and minor allele.
# The corresponding bed file counts the *minor* allele.
#
# Using `PLINK --recode A` option probably counts the *major* allele; it is llisted in the header. 

#' Converts PLINK binary format to SNP formatted file.
#' 
#' @details 
#' 
#' \strong{\code{method}} \emph{simple} stores entire genotype matrix \emph{in memory}, as PLINK binary files are stored in locus-major mode, 
#' i.e. first \eqn{m} bits store first locus for all \eqn{n} animals.
#' Since we are interested in writing out all \eqn{m} loci for each animal, for efficiency we need to read the entire file.
#' \emph{lowmem} breaks the loci into smaller chunks (e.g. by chromosome), writes each chunk to a file, and merges them back as with \code{\link{cbind_SNPs}}.
#' \emph{dryrun} does not call the Fortran subroutine, but returns the treated arguments that would have been sent to the subroutine.
#'
#' For \code{method='lowmem'} use argument \code{fragment} to indicate how the loci are subdivided. 
#' When \code{fragment='chr'} (case unsensitive), loci are split according to 1st column of .bim file.
#' If \code{fragment} is a scalar integer, loci are split into this number of blocks.
#' If an integer vector of same length as \code{ncol}, it directly specifies which block a locus is sent to. \code{max(fragment)} specifies the number of blocks.
#'
#' @section Assigning new IDs: 
#' The new integer IDs can be supplied. If not, they will be made for you.
#' \code{newID} may be an integer vector and will be used as is.
#' If data.frame with columns \code{famID}, \code{sampID}, and \code{newID}, they will be reordered to match input file.
#'   
#' @section Filtering loci or samples:
#' Filters on loci or samples can be employed in a number of ways; filtering on loci and samples are handled independently.
#' Inclusion criteria (\code{extract} and \code{keep}) reduces the output to only those loci or samples that pass the criteria.
#' Exclusion criteria (\code{exclude} and \code{remove}) are applied \emph{after} inclusion criteria, and \emph{reduces} the output further.
#'
#' \code{extract} and \code{exclude} can be any combination of:
#' \describe{
#'  \item{Logical}{Vector of same length as loci in input file.}
#'  \item{Integer or numeric}{Indicates positional which loci to include or exclude. Numeric vectors are coerced to integer vectors.}
#'  \item{Character}{Matched against probe IDs, i.e. 2nd column of .bim file.}
#' }
#' For restricting the output to certain chromosomes, use \code{extract_chr}. The output is the intersect of \code{extract} and \code{exctract_chr}.
#' 
#' \code{keep} and \code{remove} are as \code{exctract} and \code{exclude} above, can be a combination of, and can additionally be:
#' \describe{
#'  \item{Character}{Matched against both famID or sampID, i.e. 1st and 2nd column of .fam file.}
#'  \item{List with named elements \code{famID} and/or \code{sampID}}{The named elements are matched against, respectively, the 1st and 2nd column of the .fam file.}
#' }
#' 
#' @section Fragment file names:
#' The argument \code{fragmentfns} is used for method \code{'lowmem'}, providing filenames 
#' (absolute or relative) for producing the final converted files and intermediate .bim files.
#' When \code{remerge=TRUE}, the argument \code{outfn} is ignored.
#' 
#' \code{fragmentfns} defaults to temporary files, created with \code{\link[base]{tempfile}}.
#' If a character vector, the first $n_f$ elements are filenames for $n_f$ fragments (e.g. chromosomes).
#' The following $n_f + 1 ... 2 n_f$ elements are for the intermediate .bim files.
#' The vector is automatically padding with temporary files to the required length.
#' 
#' If \code{fragmentfns} is a \emph{function}, it will be called with 0, 1, or 2
#' arguments. The first argument is a running number for the fragments, the second
#' is the maximum number of fragments.
#' 
#' 
#' @param bfile Filename of PLINK binary files, i.e. without extension.
#' @param outfn Filename of new file.
#' @param na Missing value.
#' @param newID Integer scalar (default \code{0}) for automatically assigning new IDs. See description for more. 
#' @param nlines Number of lines to process.
#' @param fam If binary files have different stems, specify each of them with \code{fam}, \code{bim}, \code{bed}, and set \code{bfile=NULL}.
#' @param bim See \code{fam}.
#' @param bed See \code{fam}.
#' @param countminor Logical: Should the output count minor allele (default), or major allele as \code{plink --recode A}.
#' @param maf Numeric, restrict SNPs to SNPs with this frequency. 
#' @param chr Vector of chromosomes to limit output to.
#' @param extract Extract only these SNPs, see Details.
#' @param exclude Do not extract these SNPs, see Details.
#' @param extract_chr Extract only these chromosomes, see Details.
#' @param keep Keep only these samples, see Details.
#' @param remove Removes these samples from output, see Details.
#' @param fragments \code{"chr"} or integer vector. Only used when \code{method='lowmem'}.
#' @param remerge Logical, whether to re-merge fragmented blocks. Only used when \code{method='lowmem'}.
#' @param fragmentfns Character vector or function for producing filenames.
#' @param method Character, which of following methods to use: \code{simple}, \code{lowmem}, or \code{drymem}. See Details.
#' @export
#' @references 
#' \itemize{
#'  \item PLINK v. 1.07 BED file format: \url{https://www.cog-genomics.org/plink/1.9/formats#bed}
#'  \item Shaun Purvell and Christopher Chang. \emph{PLINK v. 1.90} \url{https://www.cog-genomics.org/plink2}
#'  \item Chang CC, Chow CC, Tellier LCAM, Vattikuti S, Purcell SM, Lee JJ (2015) Second-generation PLINK: rising to the challenge of larger and richer datasets. GigaScience, 4. doi: \href{https://doi.org/10.1186/s13742-015-0047-8}{10.1186/s13742-015-0047-8} \href{http://gigascience.biomedcentral.com/articles/10.1186/s13742-015-0047-8}{link}.
#' }
#' @seealso 
#' \code{convert_plink} is a direct conversion that does not rely on PLINK.
#' See the alternate \code{\link{convert_plinkA}} which re-formats the output from \code{plink --recode A}.
convert_plink <- function(bfile, outfn, na=9, newID=0, nlines=NULL, fam=NULL, bim=NULL, bed=NULL, countminor=TRUE, maf=0.0, chr=NULL, extract=NULL, exclude=NULL, extract_chr=NULL, keep=NULL, remove=NULL, method='simple', fragments="chr", remerge=TRUE, fragmentfns=NULL) {
  
  # Get filenames
  if (!(is.null(bfile) | is.na(bfile))) {
    bfile <- sub_ext(bfile, c('fam','bim','bed'))
    fam <- bfile[1]
    bim <- bfile[2]
    bed <- bfile[3]
  }
  stopifnot(all(file.exists(fam, bim, bed)))

  # Detect subroutine:
  use.method <- pmatch(method, c('simple','lowmem','dryrun'))

  # Make up new IDs, as in convert_plinkA:
  if (is.data.frame(newID)) stopifnot(all(c('famID','sampID','newID') %in% names(newID)))
  #if (is.null(ncol)) ncol <- get_ncols(rawfn) - 6
  
  firstline <- scan(fam, what=character(), nlines=1, quiet=TRUE)
  if (is.null(ncol)) ncol <- length(firstline) - 6
  header <- all(firstline[1:3] == c('FID','IID','PAT'))
  
  firstcols <- get_firstcolumn(fam, class=list('character','character'), col.names=c('famID','sampID'), header=header)
  if (is.null(nlines)) {
    if (length(newID) == 1) {
      nlines <- nrow(firstcols)
    } else if (is.data.frame(newID)) {
      nlines = nrow(newID)
    } else {
      nlines = length(newID)
    }
  }
  
  if (length(newID) == 1) {
    .newID <- firstcols
    .newID$newID <- 1:nrow(.newID) + newID
  } else if (is.atomic(newID)) {
    .newID <- firstcols
    .newID$newID <- 0
    .newID$newID[1:length(newID)] <- newID
  } else {
    newID <- do.call(data.frame, append(lapply(newID[,c('famID','sampID','newID')], as.character), list(stringsAsFactors=FALSE)))
    .newID <- merge(firstcols, newID, by=c('famID','sampID'), sort=FALSE, all.x=TRUE, all.y=FALSE, stringsAsFactors=FALSE)
  }

  nlines <- nrow(.newID)
  newID <- .newID[1:nlines,]
  newID$newID <- as.integer(newID$newID)
  
  
  # Handle samples
  .keep <- newID$newID != 0
    
  if (!is.null(keep) | !is.null(remove)) {
    if (is.logical(keep)) {
      if (sum(!is.na(keep)) < nlines) stop('`keep` as logical must be at least same length as samples in input file, without NA\'s, and as `newID` (if not scalar).')
    }
    if (is.logical(remove)) {
      if (sum(!is.na(remove)) < nlines) stop('`keep` as logical must be at least same length as samples in input file, without NA\'s, and as `newID` (if not scalar).')
    }

    if (is.numeric(keep)) keep <- as.integer(keep)
    if (is.numeric(remove)) remove <- as.integer(remove)
   
    if (!is.logical(keep) & !is.integer(keep) & !is.list(keep)) keep <- as.character(keep)
    if (!is.logical(remove) & !is.integer(remove) & !is.list(remove)) remove <- as.character(remove)
    
    if (is.logical(keep)) {
      .keep <- keep
    } else if (length(keep) > 0) {
      .keep[] <- FALSE
      if (is.integer(keep)) {
        .keep[keep] <- TRUE
      } else if (is.character(keep)) {
        .keep[newID$famID %in% keep] <- TRUE
        .keep[newID$sampID %in% keep] <- TRUE
      } else if (is.list(keep)) {
        .keep[newID$famID %in% keep$famID] <- TRUE
        .keep[newID$sampID %in% keep$sampID] <- TRUE
      }
    }
    
    if (is.logical(remove)) {
      .keep <- .keep & !remove
    } else if (length(remove) > 0) {
      if (is.integer(remove)) {
        .keep[remove] <- FALSE
      } else if (is.character(remove)) {
        .keep[newID$famID %in% remove] <- FALSE
        .keep[newID$sampID %in% remove] <- FALSE
      } else if (is.list(remove)) {
        .keep[newID$famID %in% remove$famID] <- FALSE
        .keep[newID$sampID %in% remove$sampID] <- FALSE
      }      
    }
  }
  newID$keep <- .keep
  
  # Handle SNPS
  # Number of columns:
  ncol <- get_nlines(bim)
  .extract <- rep(TRUE, ncol)
  snps <- NULL
  
  if (!is.null(chr)) {
    snps <- get_firstcolumn(bim, class=c('character','character'), col.names=c('chr','rs'), stringsAsFactors=FALSE)
    .extract <- snps$chr %in% as.character(chr)
  }
  
  # Decide upon exclude/include
  if (!is.null(extract) | !is.null(exclude) | !is.null(extract_chr)) {
    .is.na <- function(x) {if (is.null(x)) return(logical(0)); is.na(x)}
    if (is.logical(extract) & sum(!.is.na(extract)) < ncol) stop('`extract` as logical must be same length as SNPs in input file and without NA\'s.')
    if (is.logical(exclude) & sum(!.is.na(exclude)) < ncol) stop('`exclude` as logical must be same length as SNPs in input file and without NA\'s.')
    
    if (is.numeric(extract)) extract <- as.integer(extract)
    if (is.numeric(exclude)) exclude <- as.integer(exclude)
    
    if (!is.logical(extract) & !is.integer(extract)) extract <- as.character(extract)
    if (!is.logical(exclude) & !is.integer(exclude)) exclude <- as.character(exclude)

    if ((is.character(extract) | is.character(exclude) | !is.null(extract_chr)) & is.null(snps)) {
      snps <- get_firstcolumn(bim, class=c('character','character'), col.names=c('chr','rs'), stringsAsFactors=FALSE)
    }    
  
    if (is.logical(extract)) {
      .extract <- extract
    } else if (length(extract) > 0) {
      .extract[] <- FALSE
      if (is.integer(extract)) {
        .extract[extract] <- TRUE
      } else {
        .extract[snps$rs %in% extract] <- TRUE
      }
    }
    
    if (!is.null(extract_chr)) {
      extract_chr <- as.character(extract_chr)
      .extract <- .extract & snps$chr %in% extract_chr
    }
    
    if (is.logical(exclude)) {
      .extract <- .extract & !exclude
    } else if (length(exclude) > 0) {
      if (is.integer(exclude)) {
        .extract[exclude] <- FALSE
      } else {
        .extract[snps$rs %in% exclude] <- FALSE
      }
    }
  }
  
  
  ##  Fragments
  if (use.method != 1) {
    if (is.character(fragments) & length(fragments) == 1 & tolower(fragments) == 'chr') {
      if (is.null(snps)) snps <- get_firstcolumn(bim, class=c('character','character'), col.names=c('chr','rs'), stringsAsFactors=FALSE)
      fragments <- cumsum(!duplicated(snps$chr))
    }
    if (!is.integer(fragments)) fragments <- as.integer(fragments)
    if (length(fragments) == 1) {
      fragments <- rep(1:fragments, each=ceiling(ncol/fragments))[1:ncol]
    }
    if (length(fragments) != ncol) stop('Argument `fragments` should be scalar or same length as number of loci.')
    
    
    if (is.function(fragmentfns)) {
      n <- length(formals(fragmentfns))
      if (n == 0) {
        tmpfiles <- replicate(max(fragments), fragmentfns(), simplify=TRUE)
      } else if (n == 1) {
        tmpfiles <- sapply(1:max(fragments), fragmentfns)
      } else {
        tmpfiles <- sapply( 1:max(fragments), fragmentfns, max(fragments))
      }
    } else {
      tmpfiles <- as.character(fragmentfns)
    }
    tmpfiles <- c(tmpfiles, as.character(replicate(2*max(fragments)-length(tmpfiles), tempfile())))
    
  }  
  
  if (use.method == 1) {
    #subroutine readplinksimple(bed, fnout, ncol, nlines, na, newID, minor, maf, extract, keep, status)
    res <- .Fortran('readplinksimple', 
                    bed=as_fortran_character(bed), 
                    fnout=as_fortran_character(outfn), 
                    ncol=as.integer(ncol), 
                    nlines=as.integer(nlines), 
                    na=as.integer(na), 
                    newID=as.integer(newID$newID), 
                    minor=as.integer(countminor), 
                    maf=as.numeric(maf), 
                    extract=as.integer(.extract), 
                    keep=as.integer(.keep), 
                    status=as.integer(0),
                    PACKAGE='Siccuracy')
  } else if (use.method == 2) {
    # The not-so-simple complex way to do stuff. Boy, do we get to have fun now!
    tmpfile <- tempfile()
    writeLines(tmpfiles, tmpfile)
    
    #subroutine convertplinkrwrapper(listfn, n, remerge, fragments, &
    #                       bed, fnout, ncol, nlines, na, newID, minor, maf, extract, keep, status)
    res <- .Fortran('convertplinkrwrapper', 
                    listfn=as_fortran_character(tmpfile), 
                    n=as.integer(length(tmpfiles)), 
                    remerge=as.integer(remerge), 
                    fragments=as.integer(fragments),
                    bed=as_fortran_character(bed), 
                    fnout=as_fortran_character(outfn), 
                    ncol=as.integer(ncol), 
                    nlines=as.integer(nlines),
                    na=as.integer(na), 
                    newID=as.integer(newID$newID),
                    minor=as.integer(countminor), 
                    maf=as.numeric(maf), 
                    extract=as.integer(.extract), 
                    keep=as.integer(.keep),
                    status=as.integer(0),
                    PACKAGE='Siccuracy')
    
    res$listfn <- NULL
    res$n <- NULL
    res$fragmentfns <- tmpfiles
  } else if (use.method == 3) {
    res <- list(bed=as.character(bed), fnout=as.character(outfn), 
                ncol=as.integer(ncol), nlines=as.integer(nlines), na=as.integer(na), newID=as.integer(newID$newID), minor=as.integer(countminor), 
                maf=as.numeric(maf), extract=as.integer(.extract), keep=as.integer(.keep),
                fragments=as.integer(fragments), fragmentfns=as.character(tmpfiles))
  } 
  res$newID=newID
  res
}