R/dataimport.R
In polysat: Tools for Polyploid Microsatellite Analysis

Documented in read.ATetra read.GeneMapper read.GenoDive read.POPDIST read.SPAGeDi read.STRand read.Structure read.Tetrasat

read.GeneMapper<-function(infiles, forceInteger=TRUE){
    # read files and get names of samples and loci
    samples<-c()
    loci<-c()
    locusdata<-list()
    length(locusdata)<-length(infiles)
    cc <- c("character","character")
    names(cc) <- c("Sample.Name","Marker")
    for(i in 1:length(infiles)){
        locusdata[[i]]<-read.table(infiles[i],sep="\t",header=TRUE,
                                   colClasses=cc, stringsAsFactors=FALSE,
                                   na.strings = c("", "NA"))
        samples<-c(samples,locusdata[[i]][["Sample.Name"]])
        loci<-c(loci,locusdata[[i]][["Marker"]])
    }
    samples<-unique(samples)
    loci<-unique(loci)

    # set up genambig object to put data into
    object <- new("genambig", samples, loci)

    # extract and insert genotypes
    for(m in 1:length(locusdata)){
        alleleindex<-grep("Allele",names(locusdata[[m]]),value=FALSE)
        if(forceInteger){
          for(a in alleleindex){
            locusdata[[m]][[a]] <- as.integer(locusdata[[m]][[a]])
          }
        }
        if(any(is.na(locusdata[[m]][[alleleindex[1]]]))){
            stop("Rows with missing data should be deleted or have -9 in the first allele slot.")
        }
        if(any(locusdata[[m]][,alleleindex[-1]] == -9, na.rm=TRUE)){
            stop("Only the first allele column should have -9 values; use blank space or NA in remaining columns.")
        }
        for(j in 1:length(locusdata[[m]][["Sample.Name"]])){
                untrimmedgenotype<-locusdata[[m]][j,alleleindex]
                #Gives an error if there are loci or samples not in the arguments
                Genotype(object,locusdata[[m]][j, "Sample.Name"],
                              fixloci(locusdata[[m]][j, "Marker"], warn = FALSE))<-
                    unique(untrimmedgenotype[!is.na(untrimmedgenotype)])
        }
    }

    validObject(object)
    return(object)
}

## should read.GenoDivd find the ploidy of each genotype?
read.GenoDive <- function(infile){
    rawdata<-readLines(infile)
    #information about number of samples, number of loci, etc. is in second line
    datainfo<-as.integer(strsplit(rawdata[[2]],"\t")[[1]])
    numsam<-datainfo[1] # number of individuals
    numpop<-datainfo[2] # number of populations
    numloc<-datainfo[3] # number of loci
    digits<-datainfo[5] # number of digits used to code each allele

    # get the data header with column names, including loci
    colheader<-strsplit(rawdata[[3+numpop]],"\t")[[1]]
    lastloc<-length(colheader) # index of the last locus column
    firstloc<-lastloc-numloc+1 # index of the first locus column (3 or 4)
    loci<-colheader[firstloc:lastloc] # get the locus names
    # get the column containing sample names
    indcol <- firstloc - 1

    # set up genotype object
    object <- new("genambig", samples=1:numsam, loci = loci)
    PopNames(object) <- rawdata[3:(2+numpop)]
    Description(object) <- rawdata[1]
    # fill in sample data
    for(s in 1:numsam){
        # get data from the row for this sample
        sampledata<-strsplit(rawdata[[3+numpop+s]],"\t")[[1]]
        # extract name and population
        PopInfo(object)[s]<-as.integer(sampledata[1])
        Samples(object)[s]<-sampledata[indcol]
        # extract alleles
        for(L in 1:numloc){
            rawalleles<-sampledata[(firstloc:lastloc)[L]]
            while(nchar(rawalleles)%%digits !=0) {
                # If leading zeros were removed
                rawalleles<-paste("0",rawalleles,sep="")
            }
            # convert the character string to an integer vector
            myfirst<-seq(length=nchar(rawalleles)/digits,from=1,by=digits)
            mylast<-seq(length=nchar(rawalleles)/digits,from=digits,by=digits)
            thesealleles<-as.integer(substring(rawalleles,myfirst,mylast))
            # get rid of duplicate alleles
            thesealleles<-unique(thesealleles)
            # insert the missing data symbol as appropriate
            if(length(thesealleles)==1 && thesealleles==0){
                thesealleles<-Missing(object)
            }
            # get rid of "missing alleles" if the whole genotype is not missing
            thesealleles<-thesealleles[thesealleles !=0]
            # write the allele vector to the list
            Genotype(object, s, L) <-thesealleles
        }
    }
    validObject(object) # check validity, give errors if invalid
    # return the object
    return(object)
}

# make extracols include the popinfo column
# popinfocol is column number not including labels, or NA
read.Structure<-function(infile,ploidy,missingin=-9,sep="\t",markernames=TRUE,
                         labels=TRUE, extrarows=1, popinfocol=1, extracols=1,
                         getexcols=FALSE, ploidyoutput="one"
                         ){
    # error checking
    if(length(ploidyoutput) != 1 ||
       !ploidyoutput %in% c("one","samplemax","matrix"))
        stop("ploidy output must be \"one\", \"samplemax\", or \"matrix\".")
    # read the file
    rawdata<-read.table(infile,header=markernames,sep=sep)
    # get an index of samples and a column labeling the samples
    if(labels){
        # if row labels are used, get those as the sample names
        samples<-unique(rawdata[[1]])
        samples<-samples[!is.na(samples)]
        samples<-as.character(samples)
        samples<-samples[samples != ""]
        names(rawdata)[1]<-"Samples"
    } else {
        # make an integer vector to represent samples
        samples<-1:((dim(rawdata)[1]-extrarows)/ploidy)
        samindex<-c(rep(0,times=extrarows),rep(samples, each=ploidy))
        rawdata[length(rawdata)+1]<-samindex
        names(rawdata)[length(rawdata)]<-"Samples"
    }
    # get an index of loci (will be V2 etc. if loci not named)
    loci<-names(rawdata)
    loci<-loci[loci != "Samples"]
    loci<-loci[(extracols+1):length(loci)]
    # mark the column containing popinfo
    if(!is.na(popinfocol)){
        names(rawdata)[popinfocol+ifelse(labels,1,0)] <- "PopInfo"
    }

    # set up the object to store genotypes
    object <- new("genambig", samples, loci)
    # set up ploidy
    if(ploidyoutput=="one"){
        object <- reformatPloidies(object, output="one", erase=TRUE)
        Ploidies(object) <- ploidy
    }

    # fill the genotypes and popinfo
    for(s in samples){
        for(L in loci){
            rawalleles<-rawdata[rawdata$Samples==s,L]
            # process missing data and get unique alleles
            thesealleles<-unique(rawalleles)
            if(length(thesealleles)==1 && thesealleles[1]==missingin){
                thesealleles<-Missing(object)
            } else {
                thesealleles<-thesealleles[thesealleles != missingin]
                # get ploidy
                if(ploidyoutput!="one"){
                    Ploidies(object)[s,fixloci(L, warn = FALSE)] <-
                       length(rawalleles[rawalleles != missingin])
                }
            }
            Genotype(object,s,fixloci(L, warn = FALSE))<-thesealleles
        }
        # get popinfo
        if(!is.na(popinfocol)){
            PopInfo(object)[s] <- rawdata[match(s, rawdata$Samples),"PopInfo"]
        }
    }
#    object <- reformatPloidies(object, output="collapse", na.rm=TRUE)
#    Loci(object) <- gsub(".", "-", loci, fixed=TRUE) # not needed with fixloci function
    if(ploidyoutput=="samplemax"){
        maxpl <- apply(Ploidies(object), 1, max, na.rm=TRUE)
        object <- reformatPloidies(object, output="sample", erase=TRUE)
        Ploidies(object) <- maxpl
    }


    # extract the extra columns, if needed
    if(getexcols){
        Extracol<-data.frame(row.names=samples)
        excolindex <- 1:extracols
        if(!is.na(popinfocol)){
            excolindex <- excolindex[-popinfocol]
        }
        if(labels){
            excolindex <- excolindex + 1
        }
        for(x in excolindex){
            # look up values by sample in this column
            # add the column to the data frame
            Extracol<-data.frame(Extracol,rawdata[[x]][seq(1+extrarows,
                                                        length(rawdata[[1]])+
                                                           1-ploidy,
                                                        by=ploidy)])
            names(Extracol)[length(Extracol)] <- x
        }
    }

    validObject(object)
    # return extra columns and genotypes
    if(getexcols){
        return(list(ExtraCol=Extracol, Dataset=object))
    } else {
        return(object)
    }
}

read.SPAGeDi<-function(infile, allelesep="/", returnspatcoord=FALSE){
    # get all lines from the file, and eliminate comment lines
    Lines<-readLines(infile)
    Lines<-Lines[Lines != ""]
    first2<-sapply(Lines, substring, first=1, last=2)
    Lines<-Lines[first2 != "//"]
    # get data from the first line
    fileinfo<-as.integer(strsplit(Lines[1], "\t")[[1]])
    numind<-fileinfo[1] # number of samples
    numcat<-fileinfo[2] # number of categories
    numsc<-fileinfo[3] # number of spatial coordinates
    numloc<-fileinfo[4] # number of loci
    digits<-fileinfo[5] # number of digits to represent alleles

    # Is there a column of categories?
    if(numcat==0){
        catpres<-0
    } else {
        catpres<-1
    }
    # Is latitude and longitude used instead of Cartesian coordinates?
    if(numsc==-2){
        numsc<-2
    }

    # read the rest of the file as a table
    tmp <- tempfile()
    cat(Lines[3:(3+numind)], sep="\n", file=tmp)
    gentable <- read.table(tmp, sep="\t", header=TRUE,
                           row.names=1,
                           colClasses=c("character",rep(NA,catpres+numsc),
                           rep("character",numloc)))
    # get sample and locus names
    samples<-row.names(gentable)
    loci<-names(gentable)[(length(gentable)-numloc+1):length(gentable)]

    # set up genambig object
    object <- new("genambig", samples=samples, loci=loci)
    # take PopInfo from categories if present
    if(catpres==1){
        PopNames(object) <- unique(as.character(gentable[[1]]))
        PopInfo(object) <- match(gentable[[1]], PopNames(object))
    }

    # If there is no separation of alleles, count digits off with substring
    if(allelesep==""){
        for(s in samples){
            #set up a list to contain genotypes
            thesegenotypes<-list(0)
            length(thesegenotypes)<-length(loci)
            names(thesegenotypes)<-loci

            for(L in loci){
                # add leading zeros if necessary
                while(nchar(gentable[s,L])%%digits !=0){
                    gentable[s,L]<-paste("0",gentable[s,L],sep="")
                }
                # split into alleles and convert to integers
                thesegenotypes[[L]]<-as.integer(substring(gentable[s,L],
                                               first=seq(1,
                                               nchar(gentable[s,L])-digits+1,
                                               by=digits),
                                               last=seq(digits,
                                               nchar(gentable[s,L]),by=digits)))
                # if genotype only has zeros, write missing data symbol
                if(length(unique(thesegenotypes[[L]]))==1 && thesegenotypes[[L]][1]==0){
                    thesegenotypes[[L]]<-Missing(object)
                }
                # otherwise remove zeros on left
                while(thesegenotypes[[L]][1]==0){
                    thesegenotypes[[L]]<-thesegenotypes[[L]][-1]
                }

                # get ploidy
                if(thesegenotypes[[L]][1] != Missing(object)){
                  Ploidies(object)[s,fixloci(L, warn = FALSE)] <- length(thesegenotypes[[L]])
                }

                # remove zeros on the right
                thesegenotypes[[L]]<-thesegenotypes[[L]][thesegenotypes[[L]] != 0]
                # get unique alleles
                thesegenotypes[[L]]<-unique(thesegenotypes[[L]])
            }
            # add genotypes to object
            Genotypes(object, samples=s)<-thesegenotypes
        }

    # get alleles by strsplit
    } else {
        for(s in samples){
            # get alleles by splitting the strings
            thesegenotypes<-sapply(gentable[s,loci],strsplit,
                                   split=allelesep,fixed=TRUE)
            names(thesegenotypes) <- loci

            for(L in loci){
                # convert to integer
                thesegenotypes[[L]]<-as.integer(thesegenotypes[[L]])
                # if genotype only has zeros, write missing data symbol
                if(length(unique(thesegenotypes[[L]]))==1 && thesegenotypes[[L]][1]==0){
                    thesegenotypes[[L]]<-Missing(object)
                }
                # otherwise remove zeros on left
                while(thesegenotypes[[L]][1]==0){
                    thesegenotypes[[L]]<-thesegenotypes[[L]][-1]
                }

                # get ploidy
                if(thesegenotypes[[L]][1] != Missing(object)){
                  Ploidies(object)[s,fixloci(L, warn = FALSE)] <- length(thesegenotypes[[L]])
                }

                # remove zeros on the right
                thesegenotypes[[L]]<-
                  thesegenotypes[[L]][thesegenotypes[[L]] != 0]
                # get unique alleles
                thesegenotypes[[L]]<-unique(thesegenotypes[[L]])
            }
            # add genotypes to list
            Genotypes(object, samples=s)<-thesegenotypes
        }
    }
    object <- reformatPloidies(object, output="collapse", na.rm=TRUE)
    validObject(object)

    # return the genotypes, popinfo, ploidies, and spatial coordinates
    if(!returnspatcoord){
        return(object)
    } else {
        spatcoord <- gentable[,!names(gentable) %in% loci]
        if(catpres == 1){
            spatcoord <- spatcoord[,-1]
        }
        return(list(SpatCoord=spatcoord,
                    Dataset=object))
    }
}

read.Tetrasat <- function(infile){
    #read the file into a character vector, containing all the lines
    rawdata<-readLines(infile)
    #find which lines delimit populations
    popindex<-grep("pop",rawdata,ignore.case=TRUE,value=FALSE)
    popindex<-popindex[popindex != 1]
    #get a character vector of loci, whether they were stored in one or
    #several lines
    loci<-rawdata[2:(popindex[1]-1)]
    if(length(loci) == 1){loci<-strsplit(loci,",")[[1]]}
    #find which lines contain genotype data
    samindex<-c(popindex[1]:length(rawdata))
    samindex<-samindex[!samindex %in% popindex]
    # set up the genambig object
    object<-new("genambig", samples=1:length(samindex) ,loci=loci)
    object <- reformatPloidies(object, output="one", erase=TRUE)
    Ploidies(object) <- 4
    Description(object) <- rawdata[1]

    #Extract the data out of the lines
    for(i in 1:length(samindex)){
        #Find the largest value in popindex that is smaller than samindex[i]
        #Use the position of that value in popindex for the pop id
        PopInfo(object)[i]<-match(max(popindex[popindex < samindex[i]]),
                                  popindex)
        #extract sample names and genotypes
        samname<-gsub(" ","",substring(rawdata[samindex[i]],1,20),fixed=TRUE)
        Samples(object)[i] <- samname
        thesegenotypes<-substring(rawdata[samindex[i]],
                                  seq(length=length(loci),from=21,by=9),
                                  seq(length=length(loci),from=28,by=9))
        for(j in 1:length(loci)){
            thesealleles<-gsub(" ","",thesegenotypes[j],fixed=TRUE)
            if(nchar(thesealleles) != 0){
              thesealleles<-substring(thesealleles,
                                      seq(length=nchar(thesealleles)/2,from=1,by=2),
                                      seq(length=nchar(thesealleles)/2,from=2,by=2))
              thesealleles<-unique(as.integer(thesealleles))
            } else {thesealleles<-Missing(object)}
            Genotype(object, i,j)<-thesealleles
        }
    }
    validObject(object)
    #return genotype and pop data
    return(object)
}

read.ATetra<-function(infile){
    #read the file into a list of vectors, one for each line
    rawdata<-strsplit(readLines(infile),",")
    #make an index of which lines contain locus and sample info
    #also make an index of all loci, and vector of population names
    locindex<-c()
    samindex<-c()
    popnames<-character(0)
    for(i in 1:length(rawdata)){
        if(rawdata[[i]][1] == "TIT"){
            description <- rawdata[[i]][2]
        }
        if(rawdata[[i]][1] == "LOC"){
            locindex<-c(locindex,i)
            names(locindex)[length(locindex)]<-rawdata[[i]][3]
        }
        if(rawdata[[i]][1] == "IND"){
            samindex<-c(samindex,i)
        }
        if(rawdata[[i]][1] == "POP" && length(locindex) == 1){
            popnames <- c(popnames, rawdata[[i]][4])
        }
    }
    #make a vector of all loci
    loci<-names(locindex)
    #make a vector of all samples, and a vector of which samples go in which pops
    samples<-c()
    popdata<-c()
    samindex1loc<-samindex[samindex < locindex[2]]
    for(j in 1:length(samindex1loc)){
        samples[j]<-rawdata[[samindex1loc[j]]][5]
        popdata[j]<-as.integer(rawdata[[samindex1loc[j]]][3])
    }
    #set up the object to contain genotypes
    object <- new("genambig", samples=samples, loci=loci)
    object <- reformatPloidies(object, output="one", erase=TRUE)
    PopInfo(object) <- popdata
    PopNames(object) <- popnames
    Ploidies(object) <- 4
    Description(object) <- description

    #fill the array of genotypes
    for(m in samindex){
        thesealleles<-rawdata[[m]][6:9]
        thesealleles<-as.integer(thesealleles[thesealleles !=""])
        thesealleles<-thesealleles[!is.na(thesealleles)]
        Genotype(object,as.integer(rawdata[[m]][4]),
                 as.integer(rawdata[[m]][2]))<-thesealleles
    }
    # check validity
    validObject(object)
    #return population data and genotypes
    return(object)
}

read.POPDIST <- function(infiles){
    # read the files into a character vector
    rawdata <- readLines(infiles[1])
    if(length(infiles) > 1){
        popindex<-(1:length(rawdata))[substr(rawdata,1,3) %in%
                                    c("Pop", "pop", "POP")]
        popindex<-popindex[popindex != 1]

        firstpop <- popindex[1]
        for(i in 2:length(infiles)){
            rawdata2 <- readLines(infiles[i])
            if(!identical(rawdata[2:firstpop],rawdata2[2:firstpop]))
               stop(paste("Loci not identical between",infiles[1],
                          "and",infiles[i]))
            rawdata <- c(rawdata, rawdata2[firstpop:length(rawdata2)])
        }
    }

    #find which lines delimit populations
    popindex<-(1:length(rawdata))[substr(rawdata,1,3) %in%
                                    c("Pop", "pop", "POP")]
    popindex<-popindex[popindex != 1]
    #get a character vector of loci, whether they were stored in one or
    #several lines
    loci<-rawdata[2:(popindex[1]-1)]
    if(length(loci) == 1){loci<-strsplit(loci,",")[[1]]}
    #find which lines contain genotype data
    samindex<-c(popindex[1]:length(rawdata))
    samindex<-samindex[!samindex %in% popindex]
    # set up the genambig object
    object<-new("genambig", samples=1:length(samindex) ,loci=loci)
    Description(object) <- rawdata[1]
    # get population names
    for(p in 1:length(popindex)){
        PopNames(object)[p] <- strsplit(rawdata[popindex[p]+1],
                                        "\t,",fixed=TRUE)[[1]][1]
    }

    # Extract data for individuals
    for(i in 1:length(samindex)){
        # get PopInfo
        PopInfo(object)[i]<-match(max(popindex[popindex < samindex[i]]),
                                  popindex)
        # get genotypes
        genotypestring <- strsplit(rawdata[samindex[i]],"\t,")[[1]][2]
        thesegenotypes <- strsplit(genotypestring,split="[[:blank:]]")[[1]]
        thesegenotypes <- thesegenotypes[thesegenotypes != ""]
        Ploidies(object)[i,] <- nchar(thesegenotypes)/2

        for(j in 1:length(loci)){
            thesealleles <- substring(thesegenotypes[j],
                                      (1:Ploidies(object)[i])*2-1,
                                      (1:Ploidies(object)[i])*2)
            thesealleles <- unique(as.integer(thesealleles))
            thesealleles <- thesealleles[thesealleles != 0]
            if(length(thesealleles) != 0){
                Genotype(object, i, j) <- thesealleles
            }
        }
    }
    object <- reformatPloidies(object, output="collapse")

    validObject(object)
    return(object)
}

read.STRand <- function(file, sep="\t", popInSam=TRUE){
  # read the spreadsheet into a table
  mydata <- read.table(file, header=TRUE, sep=sep, colClasses="character")

  if(!"Pop" %in% names(mydata))
    stop("Need Pop column")
  if(!"Ind" %in% names(mydata))
    stop("Need Ind column")
  if(popInSam){
    samples <- paste(mydata$Pop, mydata$Ind, sep="")
  } else {
    samples <- mydata$Ind
  }
  loci <- names(mydata)[!names(mydata) %in% c("Pop", "Ind")]
  genobject <- new("genambig", samples=samples, loci=loci)
  PopNames(genobject) <- unique(mydata$Pop)
  PopInfo(genobject) <- PopNum(genobject, mydata$Pop)

  # fill in the genotypes
  for(L in loci){
    for(s in 1:length(samples)){
      thisgen <- strsplit(mydata[s,L], split="*", fixed=TRUE)[[1]]
      thisgen <- unique(as.integer(strsplit(thisgen, split="/")[[1]]))
      if(thisgen[1]==0){
        Genotype(genobject, s, fixloci(L, warn = FALSE)) <- Missing(genobject)
      } else {
        Genotype(genobject, s, fixloci(L, warn = FALSE)) <- thisgen
      }
    }
  }
  Loci(genobject) <- gsub(".", "-", loci, fixed=TRUE)

  validObject(genobject)
  # return the dataset
  return(genobject)
}
Any scripts or data that you put into this service are public.
polysat documentation built on Aug. 23, 2022, 5:07 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
polysat
Tools for Polyploid Microsatellite Analysis

R/dataimport.R
In polysat: Tools for Polyploid Microsatellite Analysis

Defines functions read.STRand read.POPDIST read.ATetra read.Tetrasat read.SPAGeDi read.Structure read.GenoDive read.GeneMapper

Documented in read.ATetra read.GeneMapper read.GenoDive read.POPDIST read.SPAGeDi read.STRand read.Structure read.Tetrasat

Try the polysat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polysat Tools for Polyploid Microsatellite Analysis

R/dataimport.R In polysat: Tools for Polyploid Microsatellite Analysis

Defines functions read.STRand read.POPDIST read.ATetra read.Tetrasat read.SPAGeDi read.Structure read.GenoDive read.GeneMapper

Documented in read.ATetra read.GeneMapper read.GenoDive read.POPDIST read.SPAGeDi read.STRand read.Structure read.Tetrasat

Try the polysat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polysat
Tools for Polyploid Microsatellite Analysis

R/dataimport.R
In polysat: Tools for Polyploid Microsatellite Analysis