R/import.R

Defines functions fasta2DNAbin fasta2genlight read.PLINK extract.PLINKmap read.snp import2genind read.structure read.genepop read.fstat read.genetix df2genind

Documented in df2genind extract.PLINKmap fasta2DNAbin fasta2genlight import2genind read.fstat read.genepop read.genetix read.PLINK read.snp read.structure

###################################################################
## Fonctions designed to import files from other softwares
## into genind objects
##
## currently supported formats are :
## .gtx (GENETIX)
## .dat (Fstat)
## .gen (Genepop)
## .stru (STRUCTURE)
##
## Thibaut Jombart, avril 2006
## Revised March 2015
## t.jombart@imperial.ac.uk
##
##################################################################


######################
## Function df2genind
######################
#' Convert a data.frame of allele data to a genind object.
#'
#' The function \code{df2genind} converts a data.frame (or a matrix) into a
#' \linkS4class{genind} object. The data.frame must meet the following
#' requirements:
#' \itemize{
#' \item genotypes are in row (one row per genotype)
#' \item markers/loci are in columns
#' \item each element is a string of characters coding alleles, ideally
#' separated by a character string (argument \code{sep}); if no separator is
#' used, the number of characters coding alleles must be indicated (argument
#' \code{ncode}).}
#'
#' See \code{\link{genind2df}} to convert \linkS4class{genind} objects back to
#' such a data.frame.
#'
#' === Details for the \code{sep} argument ===\cr this character is directly
#' used in reguar expressions like \code{gsub}, and thus require some characters
#' to be preceeded by double backslashes. For instance, "/" works but "|" must
#' be coded as "\\|".
#'
#' @aliases df2genind
#' @param X a matrix or a data.frame containing allelle data only (see
#'   decription)
#' @param sep a character string separating alleles. See details.
#' @param ncode an optional integer giving the number of characters used for
#'   coding one genotype at one locus. If not provided, this is determined from
#'   data.
#' @param ind.names optinal, a vector giving the individuals names; if NULL,
#'   taken from rownames of X. If factor or numeric, vector is converted to
#'   character.
#' @param loc.names an optional character vector giving the markers names; if
#'   NULL, taken from colnames of X.
#' @param pop an optional factor giving the population of each individual.
#' @param NA.char a character string corresponding to missing allele (to be
#'   treated as NA)
#' @param ploidy an integer indicating the degree of ploidy of the genotypes.
#' @param type a character string indicating the type of marker: 'codom' stands
#'   for 'codominant' (e.g. microstallites, allozymes); 'PA' stands for
#'   'presence/absence' markers (e.g. AFLP, RAPD).
#' @param strata an optional data frame that defines population stratifications
#'   for your samples. This is especially useful if you have a hierarchical or
#'   factorial sampling design.
#' @param hierarchy a hierarchical formula that explicitely defines hierarchical
#'   levels in your strata. see \code{\link{hierarchy}} for details.
#' @param check.ploidy a boolean indicating if the ploidy should be checked (TRUE,
#' default) or not (FALSE). Not checking the ploidy makes the import much faster,
#' but might result in bugs/problems if the input file is misread or the ploidy is
#' wrong. It is therefore advised to first import and check a subset of data to 
#' see if everything works as expected before setting this option to false. 
#'
#' @return an object of the class \linkS4class{genind} for \code{df2genind}; a
#'   matrix of biallelic genotypes for \code{genind2df}
#'
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}, Zhian N. Kamvar
#'   \email{kamvarz@@science.oregonstate.edu}
#'
#' @seealso \code{\link{genind2df}}, \code{\link{import2genind}},
#'   \code{\link{read.genetix}}, \code{\link{read.fstat}},
#'   \code{\link{read.structure}}
#'
#' @keywords manip
#' @examples
#'
#' ## simple example
#' df <- data.frame(locusA=c("11","11","12","32"),
#' locusB=c(NA,"34","55","15"),locusC=c("22","22","21","22"))
#' row.names(df) <- .genlab("genotype",4)
#' df
#'
#' obj <- df2genind(df, ploidy=2, ncode=1)
#' obj
#' tab(obj)
#'
#'
#' ## converting a genind as data.frame
#' genind2df(obj)
#' genind2df(obj, sep="/")
#'
#' @export
#'
df2genind <- function(X, sep=NULL, ncode=NULL, ind.names=NULL, loc.names=NULL,
                      pop=NULL, NA.char="", ploidy=2, type=c("codom","PA"),
                      strata = NULL, hierarchy = NULL,
                      check.ploidy = getOption("adegenet.check.ploidy")){

    ## CHECKS ##
    if(is.data.frame(X)) X <- as.matrix(X)
    if (!inherits(X, "matrix")) stop ("X is not a matrix")
    res <- list()
    type <- match.arg(type)
    if (is.null(sep) && is.null(ncode) && any(ploidy > 1)){
        stop("Not enough information to convert data: please indicate the separator (sep=...) or the number of characters coding an allele (ncode=...)")
    }
    if(length(NA.char)>1) {
        warning("NA.char has several values; only the first one will be considered")
        NA.char <- NA.char[1]
    }

    # If by any chance provided ind.names are of class int/factor, they are (silently?) converted to characters.
    if (!is.character(ind.names) & !is.null(ind.names)) {
      ind.names <- as.character(ind.names)
    }


    ## TYPE-INDEPENDENT STUFF ##
    ## misc variables
    n <- nrow(X)
    nloc <- ncol(X)
    if (length(ploidy) < n){
      if (length(ploidy) == 1){
        ploidy <- rep(as.integer(ploidy), length=n)
      } else {
        undefined <- length(ploidy)/n
        msg <- paste0("\nPloidy is undefined for ",
                      undefined*100,
                      "% of data.\n",
                      "This must be a single integer indicating the ploidy of",
                      "the entire data set or vector of integers the same",
                      "length as the number of samples.")
        stop(msg)
      }

    }
    if(any(ploidy < 1L)) stop("ploidy cannot be less than 1")

    ## check individual labels
    if(is.null(ind.names)) ind.names <- rownames(X)
    if(is.null(ind.names)) ind.names <- .genlab("",n)
    if(any(duplicated(ind.names))){
        warning("duplicate labels detected for some individuals; using generic labels")
        ind.names <- .genlab("",n)
    }
    rownames(X) <- ind.names

    ## check locus labels
    if(is.null(loc.names)) loc.names <- colnames(X)
    if(is.null(loc.names)) loc.names <- .genlab("loc",nloc)
    if(any(duplicated(loc.names))){
        warning("duplicate labels detected for some loci; using generic labels")
        loc.names <- .genlab("loc",nloc)
    }
    if(length(grep("[.]", loc.names))>0L){
        warning("character '.' detected in names of loci; replacing with '_'")
        gsub("[.]","_", loc.names)
    }
    colnames(X) <- loc.names


    ## check alleles for periods
    if (length(grep("[.]", X)) > 0L){
      if (is.null(sep) || sep != "_"){
        warning("character '.' detected in names of loci; replacing with '_'")
        replacement <- "_"
      } else {
        warning("character '.' detected in names of loci; replacing with 'p'")
        replacement <- "p"
      }
      X <- apply(X, 2, function(i) gsub("[.]", replacement, i))
    }

    ## PRESENCE/ABSENCE MARKERS ##
    if(toupper(type)=="PA"){
        ## preliminary stuff
        rownames(X) <- ind.names
        colnames(X) <- loc.names

        ## Erase entierely non-typed loci
        temp <- colSums(is.na(X))==nrow(X)
        if(any(temp)){
            X <- X[,!temp]
            warning("Markers with no scored alleles have been removed")
        }

        ## Erase entierely non-type individuals
        temp <- rowSums(is.na(X))==ncol(X)
        if(any(temp)){
            X <- X[!temp,,drop=FALSE]
            if(!is.null(pop)) pop <- pop[!temp]
            ploidy <- ploidy[!temp]
            ind.names <- ind.names[!temp]
            warning("Individuals with no scored loci have been removed")
        }

        ## erase non-polymorphic loci
        temp <- apply(X, 2, function(loc) length(unique(loc[!is.na(loc)]))==1)
        if(any(temp)){
            X <- X[,!temp,drop=FALSE]
            loc.names <- loc.names[!temp]
            nloc <- ncol(X)
            warning("non-polymorphic marker(s) deleted")
        }

        prevcall <- match.call()

        res <- genind(tab=X, pop=pop, prevcall=prevcall, ploidy=ploidy,
                      type = "PA", strata = strata, hierarchy = hierarchy)

        return(res)
    } # end type PA


    ## CODOMINANT MARKERS ##
    ## make sure X is in character mode
    mode(X) <- "character"


    ## HANDLE MISSING SEPARATORS
    if(is.null(sep) && any(ploidy>1)){
        ## check that ncode is provided
        if(is.null(ncode)) stop("please indicate either the separator (sep) or the number of characters coding an allele (ncode).")

        ## add "/" as separator
        X <- gsub(paste0("([[:alnum:]]{",ncode,"})"), "\\1/", X)
        X <- sub("/$","",X)
        sep <- "/"
    }

    ## HANDLE NAs
    ## find all strings which are in fact NAs
    NA.list <- unlist(lapply(unique(ploidy), function(nrep) paste(rep(NA.char, nrep), collapse=sep)))
    NA.list <- unique(c(NA.list, NA.char))

    ## replace NAs
    X[X %in% NA.list] <- NA

    ## erase entirely non-type loci
    toRemove <- which(colSums(is.na(X))==nrow(X))
    if(length(toRemove) > 0){
        X <- X[,-toRemove, drop = FALSE]
        loc.names <- loc.names[-toRemove]
        warning("Markers with no scored alleles have been removed")
    }


    ## erase entierely non-type individuals
    toRemove <- which(rowSums(is.na(X))==ncol(X))
    if(length(toRemove) > 0){
        X <- X[-toRemove, , drop = FALSE]
        ind.names <- rownames(X)
        ploidy <- ploidy[-toRemove]
        if(!is.null(pop)) pop <- pop[-toRemove]
        warning("Individuals with no scored loci have been removed")
    }


    ## TRANSLATE DATA INTO ALLELE COUNTS ##
    ## get dimensions of X
    nloc <- ncol(X)
    nind <- nrow(X)

    ## unfold data for each cell of the table
    if (any(ploidy > 1)){
        allele.data <- strsplit(X, sep)
        n.items <- lengths(allele.data)
        locus.data <- rep(rep(loc.names, each = nind), n.items)
        ind.data <- rep(rep(ind.names,ncol(X)), n.items)
        allele.data <- unlist(allele.data)
    } else {
        n.items     <- rep(1, length(X))
        locus.data  <- rep(rep(loc.names, each=nind), n.items)
        ind.data    <- rep(rep(ind.names, ncol(X)), n.items)
        allele.data <- unlist(X)
    }


    ## identify NAs
    NA.posi <- which(is.na(allele.data))
    NA.ind <- ind.data[NA.posi]
    NA.locus <- locus.data[NA.posi]

    ## remove NAs
    if(length(NA.posi)>0){
        allele.data <- allele.data[-NA.posi]
        locus.data <- locus.data[-NA.posi]
        ind.data <- ind.data[-NA.posi]
    }

    ## get matrix of allele counts
    allele.data <- paste(locus.data, allele.data, sep=".")

    tmpfun <- function(x){
      tab <- tabulate(factor(x, levels = unique(x)))
      to <- cumsum(tab)
      from <- c(0L, to[-length(to)])+ 1L
      res <- mapply(seq, from, to, SIMPLIFY=FALSE)
      names(res) <- unique(x)
      res
    }    
    tmp <- unique(allele.data)
    ind <- match(tmp, allele.data)
    ## named list with position of the columns for each locus 
    pos <- tmpfun(locus.data[ind]) 
    
    allele.data <- factor(allele.data, levels=unique(allele.data))
    ind.data    <- factor(ind.data, levels=ind.names)
    
    out         <- table(ind.data, allele.data)
#    out         <- out[ind.names, , drop = FALSE] # table sorts alphabetically. This resets.

    ## force type 'matrix'
    class(out) <- NULL
    dimnames(out) <- list(rownames(out), colnames(out))

    ## restore NAs
    ##
    ## Thanks to Klaus Schliep for the proposed speedup:
    ##
    # if (length(NA.posi) > 0) {
    #     out.colnames <- colnames(out)
    #     NA.row <- match(NA.ind, rownames(out))
    #     loc <- paste0(NA.locus, "\\.")
    #     uloc <- unique(loc)
    #     loc.list <- lapply(uloc, grep, out.colnames)
    #     NA.col <- match(loc, uloc)
    #     out[cbind(rep(NA.row, unlist(lapply(loc.list, length))[NA.col]), unlist(loc.list[NA.col]))] <- NA
    #  }
    ## This one is modified from above to make everything more explicit.
    if (length(NA.posi) > 0) {
      NA.row <- match(NA.ind, rownames(out))
      uloc <- unique(NA.locus)
      loc.list <- pos[uloc] ## subset pos 
      NA.col <- match(NA.locus, uloc)

      # Coordinates for missing rows
      missing.ind <- vapply(loc.list, length, integer(1))[NA.col]
      missing.ind <- rep(NA.row, missing.ind)
      # Coordinates for missing columns
      missing.loc <- unlist(loc.list[NA.col], use.names = FALSE)

      missing_coordinates <- matrix(0L, nrow = length(missing.ind), ncol = 2L)
      missing_coordinates[, 1] <- missing.ind
      missing_coordinates[, 2] <- missing.loc
      #      [,1] [,2]
      # [1,]    2    1
      # [2,]    3    1
      # [3,]    4   13
      # [4,]    4   14

      out[missing_coordinates] <- NA

      #          X1401_25.33
      # A_KH1584           1
      # C_KH1059           1
      # M_KH1834           1
      # M_KH1837           1
    }
    if(check.ploidy){
      ploidmat <- vapply(pos, function(i) rowSums(out[, i, drop = FALSE], 
                         na.rm = TRUE), FUN.VALUE = double(nrow(out)))      
      if (max(ploidmat, na.rm = TRUE) > max(ploidy, na.rm = TRUE)) {
        oran <- paste(range(ploidmat, na.rm = TRUE), collapse = "-")
        eran <- paste(range(ploidy, na.rm = TRUE), collapse = "-")
        msg <- paste0("The observed allele dosage (", oran, ") ", 
                      "does not match the defined ploidy ", "(", eran, ").\n",
                      "Please check that your input parameters (ncode, sep) ",
                      "are correct.")
        warning(msg, immediate. = TRUE)
      }
    }
    ## call upon genind constructor
    prevcall <- match.call()
    out <- genind(tab=out, pop=pop, prevcall=prevcall, ploidy=ploidy, type=type,
                  strata = strata, hierarchy = hierarchy)

    return(out)
} # end df2genind







########################################
## Function read.genetix
## code based on previous ade4 functions
########################################
#'
#' Reading data from GENETIX
#'
#' The function \code{read.genetix} reads GENETIX data files (.gtx) and convert
#' them into a \linkS4class{genind} object.
#'
#' Note: \code{read.genetix} is meant for DIPLOID DATA ONLY. Haploid data with
#' the GENETIX format can be read into R using \code{read.table} or
#' \code{read.csv} after removing headers and 'POP' lines, and then converted
#' using \code{\link{df2genind}}.
#'
#' @param file a character string giving the path to the file to convert, with
#' the appropriate extension.
#' @param quiet logical stating whether a conversion message must be printed
#' (TRUE,default) or not (FALSE).
#' @return an object of the class \code{genind}
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso \code{\link{import2genind}}, \code{\link{df2genind}},
#' \code{\link{read.fstat}}, \code{\link{read.structure}},
#' \code{\link{read.genepop}}
#' @references Belkhir K., Borsa P., Chikhi L., Raufaste N. & Bonhomme F.
#' (1996-2004) GENETIX 4.05, logiciel sous Windows TM pour la genetique des
#' populations. Laboratoire Genome, Populations, Interactions, CNRS UMR 5000,
#' Universite de Montpellier II, Montpellier (France). \cr
#' @keywords manip
#' @examples
#'
#' obj <- read.genetix(system.file("files/nancycats.gtx",package="adegenet"))
#' obj
#'
#' @export read.genetix
read.genetix <- function(file=NULL,quiet=FALSE) {
    if(!quiet) cat("\n Converting data from GENETIX to a genind object... \n")


    ## read from file
    ## if(!file.exists(file)) stop("Specified file does not exist.") <- not needed

    if(toupper(.readExt(file)) != "GTX") stop("File extension .gtx expected")
    ## retrieve first infos
    nloc <- as.integer(scan(file,nlines=1,what="character",quiet=TRUE)[1])
    npop <- as.integer(scan(file,nlines=1,skip=1,what="character",quiet=TRUE)[1])
    txt <- scan(file,skip=2,what="character",sep="\n",quiet=TRUE)
    txt <- gsub("\t"," ",txt)
    ## check that nloc is consistent with actual nloc (bug-report 1.2-2.02)
    temp <- temp <- trimws(txt[length(txt)])
    nlocbis <- length(unlist(strsplit(temp, "[[:space:]]+")))-1
    if(nloc != nlocbis) {
        warning(paste("\n== Genetix file error == \n",
                      "Indicated number of locus (", nloc, ")\n",
                      "does not match actual number (", nlocbis, ").\n",
                      "Using ", nlocbis, " as number of locus.\n",
                      "Please check your file.", sep=""))
        nloc <- nlocbis
    }
    loc.names <- txt[seq(1,by=2,length=nloc)]
    txt <- txt[-(1:(nloc*2))]

    ## retrieve populations infos
    pop.names <- vector(mode="character",length=npop)
    pop.nind <- vector(mode="integer",length=npop)
    index <- 1
    temp <- vector(mode="integer",length=npop)
    for(i in 1:npop){
        pop.names[i] <- txt[index]
        pop.nind[i] <- as.numeric(txt[index+1])
        temp[i] <- index
        index <- index + pop.nind[i] + 2
    }
    pop.names <- trimws(pop.names)

    ## retrieve genotypes infos
    txt <- txt[-c(temp,temp+1)]
    txt <- trimws(txt)
    txt <- sapply(1:length(txt),function(i) unlist(strsplit(txt[i],"([[:space:]]+)|([[:blank:]]+)")) )
    X <- t(txt)
    if(ncol(X) == (nloc+1)){
        rownames(X) <- X[,1]
        X <- X[,-1]
    } else{
        rownames(X) <- 1:nrow(X)
    }

    colnames(X) <- loc.names

    ## pop is kept as character; treatment and conversion to a factor belongs to the constructor
    ## (otherwise there is potential for inconsistencies across different import functions
    pop <- as.character(rep(pop.names,pop.nind))

    ## pass X to df2genind
    res <- df2genind(X=X, ncode=3, pop=pop, ploidy=2, NA.char="000")
    res@call <- match.call()

    if(!quiet) cat("\n...done.\n\n")

    return(res)
} # end read.genetix





######################
## Function read.fstat
######################
#' Reading data from Fstat
#'
#' The function \code{read.fstat} reads Fstat data files (.dat) and convert
#' them into a \linkS4class{genind} object.
#'
#' Note: \code{read.fstat} is meant for DIPLOID DATA ONLY. Haploid data with
#' the Hierfstat format can be read into R using \code{read.table} or
#' \code{read.csv} after removing headers and 'POP' lines, and then converted
#' using \code{\link{df2genind}}.
#'
#' @param file a character string giving the path to the file to convert, with
#' the appropriate extension.
#' @param quiet logical stating whether a conversion message must be printed
#' (TRUE,default) or not (FALSE).
#' @return an object of the class \code{genind}
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso \code{\link{import2genind}}, \code{\link{df2genind}},
#' \code{\link{read.genetix}}, \code{\link{read.structure}},
#' \code{\link{read.genepop}}
#' @references Fstat (version 2.9.3). Software by Jerome Goudet.
#' http://www2.unil.ch/popgen/softwares/fstat.htm\cr
#' @keywords manip
#' @examples
#'
#' obj <- read.fstat(system.file("files/nancycats.dat",package="adegenet"))
#' obj
#'
#' @export read.fstat
read.fstat <- function(file, quiet=FALSE){
    ##if(!file.exists(file)) stop("Specified file does not exist.") <- not needed
    if(toupper(.readExt(file)) != "DAT") stop("File extension .dat expected")

    if(!quiet) cat("\n Converting data from a FSTAT .dat file to a genind object... \n\n")

    call <- match.call()
    txt <- scan(file,what="character",sep="\n",quiet=TRUE)
    txt <- gsub("\t"," ",txt)

    ## read length of allele
    ncode <- as.integer(unlist(strsplit(txt[1], " "))[4])
    NA.char <- paste(rep("0",ncode),collapse="")

    ## read first infos
    info <- unlist(strsplit(txt[1],"([[:space:]]+)"))
    ## npop <- as.numeric(info[1]) ## no longer used
    nloc <- as.numeric(info[2])

    loc.names <- txt[2:(nloc+1)]

    ## build genotype matrix
    txt <- txt[-(1:(nloc+1))]
    txt <- trimws(txt)
    txt <- sapply(1:length(txt),function(i) unlist(strsplit(txt[i],"([[:space:]]+)|([[:blank:]]+)")) )
    X <- t(txt)
    pop <- as.character(X[,1])
    if(length(unique(pop)) == 1 ) pop <- NULL
    X <- X[,-1]

    colnames(X) <- loc.names
    rownames(X) <- 1:nrow(X)

    ## replace all possible missing data coding by NA.char
    allNAs <- sapply(1:8, function(i) paste(rep("0",i),collapse=""))
    X[X %in% allNAs] <- NA.char

    ## call df2genind
    res <- df2genind(X=X, pop=pop, ploidy=2, ncode=ncode, NA.char=NA.char)
    res@call <- call

    if(!quiet) cat("\n...done.\n\n")

    return(res)

} # end read.fstat





##########################
## Function read.genepop
##########################
#' Reading data from Genepop
#'
#' The function \code{read.genepop} reads Genepop data files (.gen) and convert
#' them into a \linkS4class{genind} object.
#'
#' Note: \code{read.genepop} is meant for DIPLOID DATA ONLY. Haploid data with
#' the Genepop format can be read into R using \code{read.table} or
#' \code{read.csv} after removing headers and 'POP' lines, and then converted
#' using \code{\link{df2genind}}.
#'
#' @param file a character string giving the path to the file to convert, with
#' the appropriate extension.
#' @param ncode an integer indicating the number of characters used to code an allele.
#' @param quiet logical stating whether a conversion message must be printed
#' (TRUE,default) or not (FALSE).
#' @return an object of the class \code{genind}
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso \code{\link{import2genind}}, \code{\link{df2genind}},
#' \code{\link{read.fstat}}, \code{\link{read.structure}},
#' \code{\link{read.genetix}}
#' @references Raymond M. & Rousset F, (1995). GENEPOP (version 1.2):
#' population genetics software for exact tests and ecumenicism. \emph{J.
#' Heredity}, \bold{86}:248-249 \cr
#' @keywords manip
#' @examples
#'
#' obj <- read.genepop(system.file("files/nancycats.gen",package="adegenet"))
#' obj
#'
#' @export read.genepop
read.genepop <- function(file, ncode=2L, quiet=FALSE){
    ## if(!file.exists(file)) stop("Specified file does not exist.") <- not needed
    if(toupper(.readExt(file)) != "GEN") stop("File extension .gen expected")

    if(!quiet) cat("\n Converting data from a Genepop .gen file to a genind object... \n\n")

    prevcall <- match.call()

    txt <- scan(file,sep="\n",what="character",quiet=TRUE)
    if(!quiet) cat("\nFile description: ",txt[1], "\n")
    txt <- txt[-1]
    txt <- gsub("\t", " ", txt)
    NA.char <- paste(rep("0",ncode), collapse="")

    ## two cases for locus names:
    ## 1) all on the same row, separated by ","
    ## 2) one per row
    ## ! spaces and tab allowed
    ## a bug was reported by S. Devillard, occuring
    ## when the two cases occur together,
    ## that is:
    ## loc1,
    ## loc2,
    ## ...


    ## new strategy (shorter): isolate the 'locus names' part and then parse it.
    locinfo.idx <- 1:(min(grep("POP",toupper(txt)))-1)
    locinfo <- txt[locinfo.idx]
    locinfo <- paste(locinfo,collapse=",")
    loc.names <- unlist(strsplit(locinfo,"([,]|[\n])+"))
    loc.names <- trimws(loc.names)
    nloc <- length(loc.names)
    txt <- txt[-locinfo.idx]

    ## locus names have been retreived

    ## build the pop factor
    ## and correct the genotypes splited on more than 1 line
    pop.idx <- grep("^([[:space:]]*)POP([[:space:]]*)$",toupper(txt))
    npop <- length(pop.idx)
    ## correction for splited genotype
    ## isolated by the absence of comma on a line not containing "pop"
    nocomma <- which(! (1:length(txt)) %in% grep(",",txt))
    splited <- nocomma[which(! nocomma %in% pop.idx)]
    if(length(splited)>0){
        for(i in sort(splited,decreasing=TRUE)){
            txt[i-1] <- paste(txt[i-1],txt[i],sep=" ")
        }
        txt <- txt[-splited]
    }
    ## end correction

    ## reevaluate pop index
    pop.idx <- grep("^([[:space:]]*)POP([[:space:]]*)$",toupper(txt))

    txt[length(txt)+1] <- "POP"
    nind.bypop <- diff(grep("^([[:space:]]*)POP([[:space:]]*)$",toupper(txt)))-1
    pop <- factor(rep(1:npop,nind.bypop))

    txt <- txt[-c(pop.idx,length(txt))]

    temp <- sapply(1:length(txt),function(i) strsplit(txt[i],","))
    ## temp is a list with nind elements, first being ind. name and 2nd, genotype

    ind.names <- vapply(temp, function(e) e[1], character(1))
    ind.names <- trimws(ind.names)
    ## individuals' name are now clean

    vec.genot <- vapply(temp, function(e) e[2], character(1))
    vec.genot <- trimws(vec.genot)

    ## X is a individual x locus genotypes matrix
    X <- matrix(unlist(strsplit(vec.genot,"[[:space:]]+")),ncol=nloc,byrow=TRUE)

    # If there are any duplicate names, make them unique and issue a warning. Else
    # use existing individual names.
    if (any(duplicated(ind.names))) {
      rownames(X) <- .genlab("", nrow(X))
      warning("Duplicate individual names detected. Coercing them to be unique.")
    } else {
      rownames(X) <- ind.names
    }
    colnames(X) <- loc.names

    ## give right pop names
    ## beware: genepop takes the name of the last individual of a sample as this sample's name
    pop.names.idx <- cumsum(table(pop))
    pop.names <- ind.names[pop.names.idx]
    levels(pop) <- pop.names

    ## check that data are consistent with NCODE and ploidy=2
    if(!all(unique(nchar(X))==(ncode*2))) stop(paste("some alleles are not encoded with", ncode,
                                                     "characters\nCheck 'ncode' argument"))

    res <- df2genind(X=X, pop=as.character(pop), ploidy=2, ncode=ncode, NA.char=NA.char)
    res@call <- prevcall

    if(!quiet) cat("\n...done.\n\n")

    return(res)

} # end read.genepop





############################
## Function read.structure
############################
#' Reading data from STRUCTURE
#'
#' The function \code{read.structure} reads STRUCTURE data files (.str ou
#' .stru) and convert them into a \linkS4class{genind} object. By default, this
#' function is interactive and asks a few questions about data content. This
#' can be disabled (for optional questions) by turning the 'ask' argument to
#' FALSE. However, one has to know the number of genotypes, of markers and if
#' genotypes are coded on a single or on two rows before importing data.
#'
#' Note: \code{read.structure} is meant for DIPLOID DATA ONLY. Haploid data
#' with the STRUCTURE format can easily be read into R using \code{read.table}
#' or \code{read.csv} and then converted using \code{\link{df2genind}}.
#'
#' @param file a character string giving the path to the file to convert, with
#' the appropriate extension.
#' @param n.ind an integer giving the number of genotypes (or 'individuals') in
#' the dataset
#' @param n.loc an integer giving the number of markers in the dataset
#' @param onerowperind a STRUCTURE coding option: are genotypes coded on a
#' single row (TRUE), or on two rows (FALSE, default)
#' @param col.lab an integer giving the index of the column containing labels
#' of genotypes. '0' if absent.
#' @param col.pop an integer giving the index of the column containing
#' population to which genotypes belong. '0' if absent.
#' @param col.others an vector of integers giving the indexes of the columns
#' containing other informations to be read. Will be available in @@other of the
#' created object.
#' @param row.marknames an integer giving the index of the row containing the
#' names of the markers. '0' if absent.
#' @param NA.char the character string coding missing data. "-9" by default.
#' Note that in any case, series of zero (like "000") are interpreted as NA
#' too.
#' @param pop an optional factor giving the population of each individual.
#' @param sep a character string used as separator between alleles.
#' @param ask a logical specifying if the function should ask for optional
#' informations about the dataset (TRUE, default), or try to be as quiet as
#' possible (FALSE).
#' @param quiet logical stating whether a conversion message must be printed
#' (TRUE,default) or not (FALSE).
#' @return an object of the class \code{genind}
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso \code{\link{import2genind}}, \code{\link{df2genind}},
#' \code{\link{read.fstat}}, \code{\link{read.genetix}},
#' \code{\link{read.genepop}}
#' @references Pritchard, J.; Stephens, M. & Donnelly, P. (2000) Inference of
#' population structure using multilocus genotype data. \emph{Genetics},
#' \bold{155}: 945-959
#' @keywords manip
#' @examples
#'
#' obj <- read.structure(system.file("files/nancycats.str",package="adegenet"),
#'   onerowperind=FALSE, n.ind=237, n.loc=9, col.lab=1, col.pop=2, ask=FALSE)
#'
#' obj
#'
#' @export read.structure
read.structure <- function(file, n.ind=NULL, n.loc=NULL,  onerowperind=NULL,
                           col.lab=NULL, col.pop=NULL, col.others=NULL,
                           row.marknames=NULL, NA.char="-9", pop=NULL,
                           sep=NULL, ask=TRUE, quiet=FALSE){

    ## if(!file.exists(file)) stop("Specified file does not exist.") <- not needed
    if(!toupper(.readExt(file)) %in% c("STR","STRU")) stop("File extension .stru expected")

    ## set defaults for non optional arguments without default values
    if(!ask){
        if(is.null(col.lab)) col.lab <- as.integer(0)
        if(is.null(col.pop)) col.pop <- as.integer(0)
        if(is.null(row.marknames)) row.marknames <- as.integer(0)
    }

    ## required questions
    if(is.null(n.ind)){
        cat("\n How many genotypes are there? ")
        n.ind <- as.integer(readLines(con = getOption('adegenet.testcon'), n = 1))
    }

    if(is.null(n.loc)){
        cat("\n How many markers are there? ")
        n.loc <- as.integer(readLines(con = getOption('adegenet.testcon'), n = 1))
    }

    if(is.null(col.lab)){
        cat("\n Which column contains labels for genotypes ('0' if absent)? ")
        col.lab <- as.integer(readLines(con = getOption('adegenet.testcon'), n = 1))
    }

    if(is.null(col.pop)){
        cat("\n Which column contains the population factor ('0' if absent)? ")
        col.pop <- as.integer(readLines(con = getOption('adegenet.testcon'), n = 1))
    }

    if(is.null(col.others) & ask){
        cat("\n Which other optional columns should be read (press 'return' when done)? ")
        col.others <- scan(quiet=TRUE)
        if(length(col.others) == 0)  col.others <- NULL
    }

    if(is.null(row.marknames)){
        cat("\n Which row contains the marker names ('0' if absent)? ")
        row.marknames <- as.integer(readLines(con = getOption('adegenet.testcon'), n = 1))
    }

    if(is.null(onerowperind)){
        cat("\n Are genotypes coded by a single row (y/n)? ")
        onerowperind <- toupper(readLines(con = getOption('adegenet.testcon'), n = 1))
        if(onerowperind == "Y") {
            onerowperind <- TRUE
        } else {
            onerowperind <- FALSE
        }
    }

    if(is.null(NA.char)){
        cat("\n What is the code for missing data (default is '-9')? ")
        NA.char <- as.character(readLines(con = getOption('adegenet.testcon'), n = 1))
    }

    ## message to console
    if(!quiet) cat("\n Converting data from a STRUCTURE .stru file to a genind object... \n\n")

    ## read the file
    txt <- scan(file,sep="\n",what="character",quiet=TRUE)

    ## remove empty lines and spaces/tabs at the end of a line
    temp <- grep("^[[:space:]]*$",txt)
    if(length(temp) > 0) {
        txt <- txt[-temp]
    }

    txt <- gsub("([[:blank:]]+)$","",txt)

    ## isolate each useful component of the file
    ## matrix of data
    if(onerowperind) {
        n <- n.ind
        p <- 2*n.loc
    } else{
        n <- 2*n.ind
        p <- n.loc
    }

    lastline <- length(txt)
    mat <- txt[(lastline-n+1):lastline]
    mat <- t(as.data.frame(strsplit(mat,"[[:blank:]]+")))
    rownames(mat) <- 1:n
    gen <- mat[, (ncol(mat)-p+1):ncol(mat)]


    ## markers names
    if(row.marknames != 0) {
        loc.names <- trimws(txt[row.marknames])
        loc.names <- unlist(strsplit(loc.names,"[[:blank:]]+"))
    } else {
        loc.names <- .genlab("L",n.loc)
    }

    ## genotypes labels
    if(col.lab !=0) {
        ind.names <- mat[, col.lab]
    } else {
        ind.names <- .genlab("",n.ind)
    }

    ## population factor
    if(col.pop !=0) {
        pop <- as.character(mat[, col.pop])
    } else {
        pop <- NULL
    }

    ## other variables
    if(!is.null(col.others)){
        X.other <- mat[,col.others]
    }

    ## transformations if onerowperind is FALSE
    if(!onerowperind) {
        temp <- seq(1,n,by=2)
        ind.names <- ind.names[temp]
        if(length(ind.names) < n.ind) warning("Duplicated identifier for genotypes")
        pop <- pop[temp]
        if(exists("X.other")) X.other <- X.other[temp]

        ## make sur that all strings in gen have the same number of characters
        ncode <- max(nchar(gen))
        keepCheck <- any(nchar(gen) < ncode)

        while(keepCheck){
            mat0 <- matrix("", ncol=ncol(gen), nrow=nrow(gen))
            mat0[nchar(gen) < ncode] <- "0"
            gen <-  matrix(paste(mat0, gen, sep=""), nrow=nrow(mat0))
            keepCheck <- any(nchar(gen) < ncode)
        }

        ## reorder matrix of genotypes
        X <- t(sapply(temp, function(i) paste(gen[i,],gen[i+1,],sep="") ))

    } else { # if onerowperind
        temp <- seq(1,p-1,by=2)
        X <- paste(gen[,temp] , gen[,temp+1], sep="/")
        X <- matrix(X, nrow=n.ind)
        sep <- "/"
    }

    ## replace missing values by NAs
    X <- gsub(NA.char,NA,X)
    rownames(X) <- ind.names
    colnames(X) <- loc.names

    res <- df2genind(X=X, pop=pop, ploidy=2, sep=sep, ncode=ncode)

    res@call <- match.call()

    if(exists("X.other")) {res@other <- list(X=X.other)}

    return(res)

}




#########################
## Function import2genind
#########################
#'
#' Importing data from several softwares to a genind object
#'
#' Their are several ways to import genotype data to a \linkS4class{genind}
#' object: i) from a data.frame with a given format (see
#' \code{\link{df2genind}}), ii) from a file with a recognized extension, or
#' iii) from an alignement of sequences (see \code{\link{DNAbin2genind}}).\cr
#'
#' The function \code{import2genind} detects the extension of the file given in
#' argument and seeks for an appropriate import function to create a
#' \code{genind} object.\cr Current recognized formats are :\cr - GENETIX files
#' (.gtx) \cr - Genepop files (.gen) \cr - Fstat files (.dat) \cr - STRUCTURE
#' files (.str or .stru) \cr
#'
#' Beware: same data in different formats are not expected to produce exactly
#' the same \code{genind} objects.\cr For instance, conversions made by GENETIX
#' to Fstat may change the the sorting of the genotypes; GENETIX stores
#' individual names whereas Fstat does not; Genepop chooses a sample's name
#' from the name of its last genotype; etc.
#'
#' @aliases import2genind
#' @param file a character string giving the path to the file to convert, with
#' the appropriate extension.
#' @param quiet logical stating whether a conversion message must be printed
#' (TRUE,default) or not (FALSE).
#' @param \dots other arguments passed to the appropriate 'read' function
#' (currently passed to \code{read.structure})
#' @return an object of the class \code{genind}
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso \code{\link{import2genind}}, \code{\link{read.genetix}},
#' \code{\link{read.fstat}}, \code{\link{read.structure}},
#' \code{\link{read.genepop}}
#' @references Belkhir K., Borsa P., Chikhi L., Raufaste N. & Bonhomme F.
#' (1996-2004) GENETIX 4.05, logiciel sous Windows TM pour la genetique des
#' populations. Laboratoire Genome, Populations, Interactions, CNRS UMR 5000,
#' Universite de Montpellier II, Montpellier (France). \cr
#'
#' Pritchard, J.; Stephens, M. & Donnelly, P. (2000) Inference of population
#' structure using multilocus genotype data. \emph{Genetics}, \bold{155}:
#' 945-959
#'
#' Raymond M. & Rousset F, (1995). GENEPOP (version 1.2): population genetics
#' software for exact tests and ecumenicism. \emph{J. Heredity},
#' \bold{86}:248-249 \cr
#'
#' Fstat (version 2.9.3). Software by Jerome Goudet.
#' http://www2.unil.ch/popgen/softwares/fstat.htm\cr
#'
#' Excoffier L. & Heckel G.(2006) Computer programs for population genetics
#' data analysis: a survival guide \emph{Nature}, \bold{7}: 745-758
#' @keywords manip
#' @examples
#'
#' import2genind(system.file("files/nancycats.gtx",
#' package="adegenet"))
#'
#' import2genind(system.file("files/nancycats.dat",
#' package="adegenet"))
#'
#' import2genind(system.file("files/nancycats.gen",
#' package="adegenet"))
#'
#' import2genind(system.file("files/nancycats.str",
#' package="adegenet"), onerowperind=FALSE, n.ind=237, n.loc=9, col.lab=1, col.pop=2, ask=FALSE)
#'
import2genind <- function(file, quiet=FALSE, ...){
    ## if(!file.exists(file)) stop("Specified file does not exist.") <- not needed
    ext <- .readExt(file)
    ext <- toupper(ext)

    if(ext == "GTX")
        return(read.genetix(file,quiet=quiet))

    if(ext == "DAT")
        return(read.fstat(file, quiet=quiet))

    if(ext == "GEN")
        return(read.genepop(file, quiet=quiet, ...))

    if(ext %in% c("STR","STRU"))
        return(read.structure(file, quiet=quiet, ...))

    ## evaluated only if extension is not supported
    cat("\n File format (",ext,") not supported.\n")
    cat("\nSupported formats are:\nGENETIX (.gtx) \nFSTAT (.dat) \nGenepop (.gen)\n \nSTRUCTURE (.str)\n")

    return(invisible())
}







#######################
## Function read.snp
#######################
#' Reading Single Nucleotide Polymorphism data
#'
#' The function \code{read.snp} reads a SNP data file with extension '.snp' and
#' converts it into a \linkS4class{genlight} object. This format is devoted to
#' handle biallelic SNP only, but can accommodate massive datasets such as
#' complete genomes with considerably less memory than other formats.
#'
#' The function reads data by chunks of a few genomes (minimum 1, no maximum)
#' at a time, which allows one to read massive datasets with negligible RAM
#' requirements (albeit at a cost of computational time). The argument
#' \code{chunkSize} indicates the number of genomes read at a time. Increasing
#' this value decreases the computational time required to read data in, while
#' increasing memory requirements.
#'
#' A description of the .snp format is provided in an example file distributed
#' with adegenet (see example below).
#'
#' === The .snp format ===
#'
#' Details of the .snp format can be found in the example file distributed with
#' adegenet (see below), or on the adegenet website (type \code{adegenetWeb()}
#' in R).
#'
#' @param file a character string giving the path to the file to convert, with
#' the extension ".snp".
#' @param quiet logical stating whether a conversion messages should be printed
#' (TRUE,default) or not (FALSE).
#' @param chunkSize an integer indicating the number of genomes to be read at a
#' time; larger values require more RAM but decrease the time needed to read
#' the data.
#' @param parallel a logical indicating whether multiple cores -if available-
#' should be used for the computations (TRUE, default), or not (FALSE);
#' requires the package \code{parallel} to be installed (see details).
#' @param n.cores if \code{parallel} is TRUE, the number of cores to be used in
#' the computations; if NULL, then the maximum number of cores available on the
#' computer is used.
#' @param \dots other arguments to be passed to other functions - currently not
#' used.
#' @return an object of the class \code{"\linkS4class{genlight}"}
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso - \code{?genlight} for a description of the class
#' \code{"\linkS4class{genlight}"}.
#'
#' - \code{\link{read.PLINK}}: read SNPs in PLINK's '.raw' format.
#'
#' - \code{\link{fasta2genlight}}: extract SNPs from alignments with fasta
#' format.
#'
#' - \code{\link{df2genind}}: convert any multiallelic markers into adegenet
#' \code{"\linkS4class{genlight}"}.
#'
#' - \code{\link{import2genind}}: read multiallelic markers from various
#' software into adegenet.\cr
#' @keywords manip
#' @examples
#'
#' \dontrun{
#' ## show the example file ##
#' ## this is the path to the file:
#' system.file("files/exampleSnpDat.snp",package="adegenet")
#'
#' ## show its content:
#' file.show(system.file("files/exampleSnpDat.snp",package="adegenet"))
#'
#'
#' ## read the file
#' obj <-
#' read.snp(system.file("files/exampleSnpDat.snp",package="adegenet"), chunk=2)
#' obj
#' as.matrix(obj)
#' ploidy(obj)
#' alleles(obj)
#' locNames(obj)
#' }
#'
#' @export read.snp
#'
read.snp <- function(file, quiet=FALSE, chunkSize=1000,
                     parallel=FALSE, n.cores=NULL, ...){
    ext <- .readExt(file)
    ext <- toupper(ext)
    if(ext != "SNP") warning("wrong file extension - '.snp' expected")
    if(!quiet) cat("\n Reading biallelic SNP data file into a genlight object... \n\n")
    ## if(parallel && !require(parallel)) stop("parallel package requested but not installed")
    if(parallel && is.null(n.cores)){
        n.cores <- parallel::detectCores()
    }

    call <- match.call()

    ## HANDLE THE COMMENTS ##
    if(!quiet) cat("\n Reading comments... \n")

    count <- 0L
    i <- 0
    while(count < 2L){
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=i, nmax=1, blank.lines.skip=FALSE)
        if(length(grep(">>>>", txt)>0)){
            count <- count + 1L
        }
        i <- i+1
        if(count==0L && i>10){
            warning("No comment section at the beginning of the file. Format may be wrong.")
            i <- 0
            break
        }
    }

    lines.to.skip <- i


    ## READ GENERAL DATA (>>) ##
    if(!quiet) cat("\n Reading general information... \n")

    misc.info <- list()

    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=1)

    while(length(grep(">>", txt))>0){
        itemName <- gsub(">>","", txt)
        itemName <- gsub("(^[[:space:]]+)|([[:space:]]+$)", "", itemName)
        misc.info[itemName] <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip + 1, nmax=1)
        lines.to.skip <-lines.to.skip + 2
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=1)
    }

    ## transform each character string into a vector
    misc.info <- sapply(misc.info, function(e) unlist(strsplit(e,"[[:space:]]+")))

    ## READ GENOTYPE DATA ##
    ## one genotype is read/converted at a time to spare RAM
    if(!quiet) cat("\n Reading",ifelse(is.null(misc.info$population),"",length(misc.info$population)), "genotypes... \n")


    res <- list() # this will be a list of SNPbin objects

    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize*2)

    ID.INDIV <- grep(">", txt)

    COUNT <- 0 # used to count the nb reads

    while(length(ID.INDIV)>0){
        COUNT <- COUNT + 1
        if(!quiet) {
            if(COUNT %% 5 == 0){
                cat(length(res)+length(ID.INDIV))
            } else {
                cat(".")
            }
        }

        ind.lab <- gsub(">","", txt[ID.INDIV])
        ind.lab <- gsub("(^[[:space:]]+)|([[:space:]]+$)", "", ind.lab)
        temp <- strsplit(txt[ID.INDIV+1], "")
        temp <- lapply(temp, function(e) suppressWarnings(as.integer(e)))
        if(parallel){
            res <- c(res, parallel::mclapply(temp, function(e) new("SNPbin", e),
                                   mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE) )
        } else {
            res <- c(res, lapply(temp, function(e) new("SNPbin", e)) )
        }
        names(res)[(length(res)-length(ID.INDIV)+1):length(res)] <- ind.lab
        lines.to.skip <-lines.to.skip + length(txt)
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize*2)
        ID.INDIV <- grep(">", txt)
    }


    ## CHECK CONSISTENCY ##
    if(!quiet) cat("\n Checking consistency... \n")

    n.loc <- unique(sapply(res, nLoc))
    n.ind <- length(res)
    other <- list()

    if(length(n.loc)>1) {
        print(n.loc)
        warning("!!! Differing numbers of loci detected between individuals !!!")
    }

    if(!is.null(misc.info$position) && length(misc.info$position)!=n.loc) {
        other$position <- misc.info$position
        misc.info$position <- NULL
        warning("vector of positions of the SNPs does not match the number of SNPs - storing this information in @other")
    }
    if(!is.null(misc.info$allele) && length(misc.info$allele)!=n.loc) {
        other$allele <- misc.info$allele
        misc.info$allele <- NULL
        warning("vector of alleles of the SNPs does not match the number of SNPs - storing this information in @other")
    }
    if(!is.null(misc.info$chromosome) && length(misc.info$chromosome)!=n.loc) {
        other$chromosome <- misc.info$chromosome
        misc.info$chromosome <- NULL
        warning("vector of chromosomes of the SNPs does not match the number of SNPs - storing this information in @other")
    }
    if(!is.null(misc.info$population) && length(misc.info$population)!=n.ind) {
        other$population <- misc.info$population
        misc.info$population <- NULL
        warning("vector of population of the individuals does not match the number of individuals - storing this information in @other")
    }
    if(!is.null(misc.info$ploidy) && length(misc.info$ploidy)>1 && length(misc.info$ploidy)!=n.ind) {
        other$ploidy <- misc.info$ploidy
        misc.info$ploidy <- NULL
        warning("vector of ploidy of the individuals has more than one value but does not match the number of individuals - storing this information in @other")
    }


    ## BUILD OUTPUT ##
    if(!quiet) cat("\n Building final object... \n")

    ind.names <- names(res)
    if(!is.null(misc.info$chromosome)){
        other <- list(chromosome = misc.info$chromosome)
    }

    res <- new("genlight", gen=res, ind.names=ind.names, position=misc.info$position, loc.all=misc.info$allele, ploidy=misc.info$ploidy, pop=misc.info$population, other=other, parallel=parallel)

    if(!quiet) cat("\n...done.\n\n")

    return(res)

} # end read.snp







####################
## extract.PLINKmap
####################
#' @export
#' @rdname read.PLINK
#' @aliases extract.PLINKmap
#'
#' @param x an optional object of the class \code{"\linkS4class{genlight}"}, in which
#' the information read is stored; if provided, information is matched against
#' the names of the loci in \code{x}, as returned by \code{locNames(x)}; if not
#' provided, a list of two components is returned, containing chromosome and
#' position information.
#'
extract.PLINKmap <- function(file, x = NULL){
    ## CHECK EXTENSION ##
    ext <- .readExt(file)
    ext <- toupper(ext)
    if(ext != "MAP") warning("wrong map.file extension - '.map' expected")


    ## READ FILE ##
    ## find nb of columns
    txt <- scan(file, what = "character", sep = "\n", quiet = TRUE,  nlines = 1)
    nb.col <- length( unlist(strsplit(txt,"[[:blank:]]+")))

    ## read file
    txt <- scan(file, what = "character", sep= "\n", quiet = TRUE)
    txt <- unlist(strsplit(as.vector(txt), split = "[[:blank:]]"))
    txt <- matrix(txt, ncol=4, byrow=TRUE)


    ## EXTRACT INFO AND RETURN OBJECT ##
    ## return a genlight
    if (!is.null(x)) {
        if(!inherits(x, "genlight")) stop("x is not a genlight object")

        ## match data: we need remove the potential alleles added to locus names
        marker_id <- sub("_.*$", "", locNames(x))
        ord <- match(marker_id, txt[,2])

        chromosome(x) <- factor(txt[ord, 1])
        position(x) <- as.integer(txt[ord, 4])

        return(x)
    }

    ## return a list
    res <- list(chromosome = factor(txt[ord, 1]),
                position = as.integer(txt[ord, 4]))

    return(res)
} # end extract.PLINKmap







########################
## Function read.PLINK
########################
#' Reading PLINK Single Nucleotide Polymorphism data
#'
#' The function \code{read.PLINK} reads a data file exported by the PLINK
#' software with extension '.raw' and converts it into a \code{"\linkS4class{genlight}"}
#' object. Optionally, information about SNPs can be read from a ".map" file,
#' either by specifying the argument \code{map.file} in \code{read.PLINK}, or
#' using \code{extract.PLINKmap} to add information to an existing
#' \code{"\linkS4class{genlight}"} object.
#'
#' The function reads data by chunks of several genomes (minimum 1, no maximum)
#' at a time, which allows one to read massive datasets with negligible RAM
#' requirements (albeit at a cost of computational time). The argument
#' \code{chunkSize} indicates the number of genomes read at a time. Increasing
#' this value decreases the computational time required to read data in, while
#' increasing memory requirements.
#'
#' See details for the documentation about how to export data using PLINK to
#' the '.raw' format.
#'
#' === Exporting data from PLINK ===
#'
#' Data need to be exported from PLINK using the option "--recodeA" (and NOT
#' "--recodeAD"). The PLINK command should therefore look like: \code{plink
#' --file data --recodeA}. For more information on this topic, please look at
#' this webpage: \url{http://zzz.bwh.harvard.edu/plink/}
#'
#' @aliases read.PLINK read.plink
#' @param file for \code{read.PLINK} a character string giving the path to the
#' file to convert, with the extension ".raw"; for \code{extract.PLINKmap}, a
#' character string giving the path to a file with extension ".map".
#' @param map.file an optional character string indicating the path to a ".map"
#' file, which contains information about the SNPs (chromosome, position). If
#' provided, this information is processed by \code{extract.PLINKmap} and
#' stored in the \code{@@other} slot.
#' @param quiet logical stating whether a conversion messages should be printed
#' (TRUE,default) or not (FALSE).
#' @param chunkSize an integer indicating the number of genomes to be read at a
#' time; larger values require more RAM but decrease the time needed to read
#' the data.
#' @param parallel a logical indicating whether multiple cores -if available-
#' should be used for the computations (TRUE, default), or not (FALSE);
#' requires the package \code{parallel} to be installed (see details).
#' @param n.cores if \code{parallel} is TRUE, the number of cores to be used in
#' the computations; if NULL, then the maximum number of cores available on the
#' computer is used.
#' @param \dots other arguments to be passed to other functions - currently not
#' used.
#'
#' @return - read.PLINK: an object of the class \code{"\linkS4class{genlight}"}
#'
#' - extract.PLINKmap: if a \code{"\linkS4class{genlight}"} is provided as argument
#' \code{x}, this object incorporating the new information about SNPs in the
#' \code{@@other} slot (with new components 'chromosome' and 'position');
#' otherwise, a list with two components containing chromosome and position
#' information.
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#' @seealso - \code{?genlight} for a description of the class
#' \code{"\linkS4class{genlight}"}.
#'
#' - \code{\link{read.snp}}: read SNPs in adegenet's '.snp' format.
#'
#' - \code{\link{fasta2genlight}}: extract SNPs from alignments with fasta
#' format.
#'
#' - other import function in adegenet: \code{\link{import2genind}},
#' \code{\link{df2genind}}, \code{\link{read.genetix}}
#' \code{\link{read.fstat}}, \code{\link{read.structure}},
#' \code{\link{read.genepop}}.
#'
#' - another function \code{read.plink} is available in the package
#' \code{snpMatrix}.
#' @keywords manip
#' @export
#' @rdname read.PLINK
read.PLINK <- function(file, map.file=NULL, quiet=FALSE, chunkSize=1000,
                       parallel=require("parallel"), n.cores=NULL, ...){
    ## HANDLE ARGUMENTS ##
    ext <- .readExt(file)
    ext <- toupper(ext)
    if(ext != "RAW") warning("wrong file extension - '.raw' expected")
    if(!quiet) cat("\n Reading PLINK raw format into a genlight object... \n\n")
    ## if(parallel && !require(parallel)) stop("parallel package requested but not installed")
    if(parallel && is.null(n.cores)){
        n.cores <- parallel::detectCores()
    }


    ## READ NAMES OF LOCI ##
    if(!quiet) cat("\n Reading loci information... \n")

    loc.names <- scan(file,what="character",sep=" ",quiet=TRUE,  nlines=1, blank.lines.skip=FALSE)
    n.loc <- length(loc.names) - 6
    misc.info <- lapply(1:6,function(i) NULL)
    names(misc.info) <- loc.names[1:6]
    loc.names <- loc.names[7:length(loc.names)]
    loc.names <- gsub("_[1-9]$","",loc.names)

    ## READ GENOTYPES ##
    if(!quiet) cat("\n Reading and converting genotypes... \n")

    res <- list() # this will be a list of SNPbin objects

    ## initialize reading
    lines.to.skip <- 1
    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize)
    txt <- lapply(txt, function(e) unlist(strsplit(e,"[[:blank:]]+") ))

    COUNT <- 0 # used to count the nb reads

    while(length(txt)>0){
        COUNT <- COUNT + 1
        if(!quiet) {
            if(COUNT %% 5 == 0){
                cat(length(res)+length(txt))
            } else {
                cat(".")
            }
        }


        ## handle misc info
        temp <- lapply(txt, function(e) e[1:6])
        for(i in 1:6){
            misc.info[[i]] <- c(misc.info[[i]], unlist(lapply(temp, function(e) e[[i]])) )
        }


        ## build SNPbin objects
        txt <- lapply(txt, function(e) suppressWarnings(as.integer(e[-(1:6)])))

        if(parallel){
            res <- c(res, parallel::mclapply(txt, function(e) new("SNPbin", snp=e, ploidy=2L),
                                   mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE) )
        } else {
            res <- c(res, lapply(txt, function(e) new("SNPbin", snp=e, ploidy=2L)) )
        }

        lines.to.skip <-lines.to.skip + length(txt)

        ## read lines
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize)
        txt <- lapply(txt, function(e) unlist(strsplit(e,"[[:blank:]]+") ))
    }


    ## MAKE A FEW CHECKS ##
    if(!all(sapply(res, nLoc)==n.loc)) stop(paste("some individuals do not have",n.loc,"SNPs."))


    ## BUILD FINAL OBJECT ##
    if(!quiet) cat("\n Building final object... \n")

    res <- new("genlight",res, ploidy=2L, parallel=parallel)
    indNames(res) <- misc.info$IID
    pop(res) <- misc.info$FID
    locNames(res) <- loc.names
    misc.info <- misc.info[c("SEX", "PHENOTYPE", "PAT","MAT")]
    names(misc.info) <- tolower(names(misc.info))
    misc.info$sex[misc.info$sex==1] <- "m"
    misc.info$sex[misc.info$sex==2] <- "f"
    misc.info$sex <- factor(misc.info$sex)
    misc.info$phenotype[misc.info$phenotype==1] <- "control"
    misc.info$phenotype[misc.info$phenotype==2] <- "case"
    misc.info$phenotype <- factor(misc.info$phenotype)
    other(res) <- misc.info


    ## HANDLE MAP FILE INFO ##
    if(!is.null(map.file)){
        res <- extract.PLINKmap(map.file, res)
    }


    ## RETURN OUTPUT ##
    if(!quiet) cat("\n...done.\n\n")

    return(res)
} # end read.PLINK







###########################
## Function fasta2genlight
###########################
fasta2genlight <- function(file, quiet=FALSE, chunkSize=1000, saveNbAlleles=FALSE,
                       parallel=require("parallel"), n.cores=NULL, ...){
    ## HANDLE ARGUMENTS ##
    ext <- .readExt(file)
    ext <- toupper(ext)
    if(!ext %in% c("FASTA", "FA", "FAS")) warning("wrong file extension - '.fasta', '.fa' or '.fas' expected")
    if(!quiet) cat("\n Converting FASTA alignment into a genlight object... \n\n")
    ## if(parallel && !require(parallel)) stop("parallel package requested but not installed")
    if(parallel && is.null(n.cores)){
        n.cores <- parallel::detectCores()
    }


    ## PRIOR CHECKS ##
    ## find nb of lines per genome
    lines.to.skip <- 0
    txt <- scan(file,what="character",sep="\n",quiet=TRUE, nmax=1)

    while(length(grep("^>.+", txt))<2){
        lines.to.skip <- lines.to.skip + 1
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, nmax=lines.to.skip)
    }

    LINES.PER.IND <- lines.to.skip-1


    ## find length of a genome
    NLOC <- sum(nchar(txt[2:LINES.PER.IND]))


    ## SCAN ALL POSITIONS AND IDENTIFY SNPs ##
    if(!quiet) cat("\n Looking for polymorphic positions... \n")

    ## read all genomes by chunks
    ## initialize
    lines.to.skip <- 0
    IND.LAB <- NULL
    POOL <- as.list(rep("-", NLOC))
    COUNT <- 0 # used to count the nb reads

    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)

    ## read and process chunks
    while(length(txt)>0){
        COUNT <- COUNT + 1
        if(!quiet) {
            for(i in 1:(COUNT*chunkSize)) cat(".")
        }


        nb.ind <- length(grep("^>", txt))
        IND.LAB <- c(IND.LAB, sub(">","",txt[grep("^>", txt)])) # find individuals' labels
        txt <- split(txt, rep(1:nb.ind, each=LINES.PER.IND)) # split per individuals
        if(parallel){
            txt <- parallel::mclapply(txt, function(e) strsplit(paste(e[-1], collapse=""), split=""),
                            mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE) # each genome -> one vector
        } else {
            txt <- lapply(txt, function(e) strsplit(paste(e[-1], collapse=""), split="")) # each genome -> one vector
        }

        ## POOL contains all alleles of each position
        temp <- as.list(apply(matrix(unlist(txt), byrow=TRUE, nrow=length(txt)),2,unique)) # alleles current genomes
        POOL <- mapply(function(x,y) unique(c(x,y)), POOL, temp, SIMPLIFY=FALSE) # update global pool

        lines.to.skip <- lines.to.skip + nb.ind*LINES.PER.IND
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)
    }


    ## analyse pool of alleles
    letterOK <- c("a","g","c","t","A","G","C","T")
    POOL <- lapply(POOL, function(e) e[e %in% letterOK]) # keep only proper letters
    ## POOL <- lapply(POOL, setdiff, "-")
    nb.alleles <- lengths(POOL)
    snp.posi <- nb.alleles==2
    if(all(!snp.posi)){
        warning("No polymorphism in the alignment - returning empty object")
        return(new("genlight"))
    }
    sec.all <- unlist(lapply(POOL[snp.posi], function(e) e[2]))



    ## RE-READ DATA, CONVERT SNPs TO GENLIGHT ##
    if(!quiet) cat("\n Extracting SNPs from the alignment... \n")

    ## initialize
    lines.to.skip <- 0
    COUNT <- 0 # used to count the nb reads
    res <- list()

    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)

    ## returns a vector of nb of second alleles or NAs
    f1 <- function(vec){
        out <- as.integer(vec==sec.all)
        out[!vec %in% letterOK] <- NA
        return(out)
    }

    ## read and process chunks
    while(length(txt)>0){
        COUNT <- COUNT + 1
        if(!quiet) {
            for(i in 1:(COUNT*chunkSize)) cat(".")
        }


        ## read SNPs
        nb.ind <- length(grep("^>", txt))
        txt <- split(txt, rep(1:nb.ind, each=LINES.PER.IND)) # split per individuals
        if(parallel){
            txt <- parallel::mclapply(txt, function(e) strsplit(paste(e[-1], collapse=""), split="")[[1]][snp.posi],
                                        mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE) # each genome -> one SNP vector
        } else {
            txt <- lapply(txt, function(e) strsplit(paste(e[-1], collapse=""), split="")[[1]][snp.posi]) # each genome -> one SNP vector
        }

        ## convert to genlight
        ##res <- c(res, lapply(txt, function(e) new("SNPbin", as.integer(e==sec.all))))
        res <- c(res, lapply(txt, function(e) new("SNPbin", f1(e))))

        lines.to.skip <- lines.to.skip + nb.ind*LINES.PER.IND
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)
    }



    ## BUILD FINAL OBJECT ##
    if(!quiet) cat("\n Building final object... \n")

    res <- new("genlight",res, ploidy=1, parallel=parallel)
    indNames(res) <- IND.LAB
    alleles(res) <- sapply(POOL[snp.posi], paste, collapse="/")
    position(res) <- which(snp.posi)
    if(saveNbAlleles) other(res) <- list(nb.all.per.loc=nb.alleles)


    ## RETURN OUTPUT ##
    if(!quiet) cat("\n...done.\n\n")

    return(res)
} # end fasta2genlight








###########################
## Function fasta2DNAbin
###########################
fasta2DNAbin <- function(file, quiet=FALSE, chunkSize=10, snpOnly=FALSE){

    ## HANDLE ARGUMENTS ##
    if (!is(file, "connection")) {
        ext <- .readExt(file)
        ext <- toupper(ext)
        if(!ext %in% c("FASTA", "FA", "FAS")) warning("wrong file extension - '.fasta', '.fa' or '.fas' expected")
    }
    if(!quiet) cat("\n Converting FASTA alignment into a DNAbin object... \n\n")


    ## PRIOR CHECKS ##
    ## find nb of lines per genome ##

    ## find length of a single line of sequence
    if(!quiet) cat("\n Finding the size of a single genome... \n\n")
    lines.to.skip <- 0
    txt <- scan(file,what="character",sep="\n",quiet=TRUE, nmax=1)

    while(length(grep("^>.+", txt))==1){
        lines.to.skip <- lines.to.skip + 1
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, n=1)
    }

    char.per.line <- nchar(txt)
    nbOfLinesToRead <- max(round(1e6/char.per.line),1)#

    ## read file to find genome size, 1e6 characters at a time
    nblocks <- 1
    txt <- scan(file,what="character",sep="\n",quiet=TRUE, n=nbOfLinesToRead*nblocks)

    while(length(grep("^>.+", txt))<2){
        nblocks <- nblocks+1
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, n=nbOfLinesToRead*nblocks)
    }

    ## this is the nb of lines for one genome
    ## including the first line of annotation
    LINES.PER.IND <- diff(grep("^>.+", txt))[1]

    ## this is the length of a genome single
    GENOMESIZE <- sum(nchar(txt[2:LINES.PER.IND]))
    if(!quiet) cat("\n genome size is:", format(GENOMESIZE, big.mark=","), "nucleotides \n")
    if(!quiet) cat("\n(",format(LINES.PER.IND, big.mark=","), " lines per genome )\n")


    ## START READING / CONVERTING GENOMES ##
    if(!quiet) cat("\n Importing sequences... \n")

    ## read all genomes by chunks
    ## initialize
    lines.to.skip <- 0
    IND.LAB <- NULL
    COUNT <- 0 # used to count the nb reads

    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, n=LINES.PER.IND*chunkSize)
    res <- raw()

    ## read and process chunks
    while(length(txt)>0){
        ## clean memory ##
        invisible(gc())

        ## progression... ##
        COUNT <- COUNT + 1
        if(!quiet) {
            for(i in 1:(COUNT*chunkSize)) cat(".")
        }

        ## process txt ##
        nb.ind <- length(grep("^>", txt))
        IND.LAB <- c(IND.LAB, sub(">","",txt[grep("^>", txt)])) # find individuals' labels
        txt <- split(txt, rep(1:nb.ind, each=LINES.PER.IND)) # split per individuals
        txt <- lapply(txt, function(e) unlist(strsplit(tolower(paste(e[-1], collapse="")), split=""))) # each genome -> one vector

        ## convert character vectors to DNAbin output
        res <- c(res, unlist(lapply(txt, as.DNAbin)))

        ## ## POOL contains all alleles of each position
        ## temp <- as.list(apply(matrix(unlist(txt), byrow=TRUE, nrow=length(txt)),2,unique)) # alleles current genomes
        ## POOL <- mapply(function(x,y) unique(c(x,y)), POOL, temp, SIMPLIFY=FALSE) # update global pool

        ## scan file further ##
        lines.to.skip <- lines.to.skip + nb.ind*LINES.PER.IND
        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, n=LINES.PER.IND*chunkSize)
    }

    ## GET FINAL OBJECT ##
    if(!quiet) cat("\n Forming final object... \n")

    ## form matrix ##
    res <- matrix(res, nrow=length(IND.LAB), byrow=TRUE)
    class(res) <- "DNAbin"
    rownames(res) <- IND.LAB

    ## extract snps if needed ##
    if(snpOnly){
        if(!quiet) cat("\n Extracting SNPs... \n")
        snp.posi <- seg.sites(res)
        if(length(snp.posi)==0) warning("no polymorphic site in the sequences")
        res <- res[,seg.sites(res),drop=FALSE]
        colnames(res) <- snp.posi
    }

    ## RETURN OUTPUT ##
    if(!quiet) cat("\n...done.\n\n")

    return(res)
} # end fasta2DNAbin

Try the adegenet package in your browser

Any scripts or data that you put into this service are public.

adegenet documentation built on July 18, 2021, 1:06 a.m.