R/dataEdit.R
In gdmp: Genomic Data Management

Documented in arrayAppend GetHCS getMAF is.hwEq is.identical snpRecode snpSelect toArray

#' @import methods
#' @import utils
#' @useDynLib gdmp


#' @export
toArray <- function(finalRep) {

finalRep[,1] <- as.character(finalRep[,1]); finalRep[,2] <- as.character(finalRep[,2]);
snpNo <- nlevels(as.factor(finalRep[,1]))
animalNo <- nlevels(as.factor(finalRep[,2]))
k <- ifelse(animalNo > 2, sample(2:animalNo, 1), 2)

if(animalNo > 1) {
  if(!all(finalRep[1:snpNo, 1] == finalRep[(k-1)*snpNo + (1:snpNo), 1]))
    stop("Sequence of SNPs is not unique across animals")
}

if(sum(diff(table(finalRep[,2])))) stop("Variable number of SNPs per animal")
snps <- finalRep[1:snpNo,1]; animals <- unique(finalRep[,2])

t( array(paste(finalRep[,3],finalRep[,4],sep=""), dim=c(snpNo, animalNo),
      dimnames = list(snps, animals)) )

} # End toArray

## Append two arrays -- add new data
## #################################
#' @export
arrayAppend = function(topA, bottomA, missingVal = 5) {
#
# Old Data = topA
# New Data = bottomA
# missingVal = value to use for missing SNPs in new data, default = 5
# For differing SNPs, go by the SNP list of the old data
#

  if( ncol(topA) == ncol(bottomA) ) {
    ##if(all.equal(colnames(topA), colnames(bottomA))==TRUE)
    if(setequal(colnames(topA), colnames(bottomA)))
      topA <- rbind(topA, bottomA)
    else {
      ## 1. Files that follow may contain 4275 SNPs without the 'ARS-' prefix
      ## The prefix exists in Illumina files such as allele report and final
      ## SNP list. Therefore, the prefix will be added.
      u <- which((!is.element(colnames(bottomA), colnames(topA))))
      colnames(bottomA)[u] <- paste("ARS-", colnames(bottomA)[u], sep="")
      if(sum(is.element(colnames(bottomA), colnames(topA))) != dim(topA)[2])
        stop("Something other than ARS-, chech and interfere manually!")

      ## 2. Order of SNPs is not the same
      topA = rbind(topA, bottomA[,colnames(topA), drop=FALSE])
    }
  }
  else {
    ## Exclude SNPs in bottomA (new data) that are not in topA (old data)
    u <- which((is.element(colnames(bottomA), colnames(topA))))
    bottomA <- bottomA[,u, drop=FALSE]

    ## if bottomA is for a single animal
    if(dim(as.matrix(bottomA))[2] == 1) bottomA <- t(as.matrix(bottomA))

    ## Add SNPs in topA (old data) that are not in bottomA (new data)
    u <- which((!is.element(colnames(topA), colnames(bottomA))))
    tA <- array(missingVal, c(nrow(bottomA), length(u)), dimnames=list(rownames(bottomA), colnames(topA)[u]))
    bottomA <- cbind( bottomA, tA )
    topA = rbind(topA, bottomA[,colnames(topA), drop=FALSE])
  }

  topA
}

## Get 2 unique bases present at a SNP
## ###################################
getBases <- function(snpG, all.char = FALSE) {
## Get 2 unique bases across all animals present at a SNP
## snpG is a column vector in genotype array 'ga' with "AA", "AG", "GA", "-A", "--", whaterver ...
## Option 'all.char = TRUE' prints out all characters not only elements of (A,G,C,T)
  x <- paste(unique(snpG), collapse ='')
  nx <- nchar(x)
  y <- character(nx)
  for(i in 1:nx) y[i] <- substr(x, i, i)
  if(!all.char) sort(unique(y[is.element(y, c("A","G","C","T"))]))
  else sort(unique(y))
} # End getBases

##
##
## WARNING:
## VERY SLOW VERSION
## Not in NAMESPACE
## Not documented
## Recode base genotypes as 0, 1, 2
## #################################
snpRecode.nodesig <- function(snpG) {
# nodesig: no designations needed
## Recode snp genotypes by counting number of allele A
## snpG is a column vector in genotype array 'ga' with "AA", "AG", "GA", "-A", "--", whaterver ...
## Unknown genotypes are those with non A/G/C/T bases, code those as 5
## Takes about 2.1 seconds per animal for the 50K Chip, about 1/2 hr is needed to recode 800 animals!

  b <- getBases(snpG)
  n <- length(snpG)
  x <- integer(n)

  if(length(b) == 0) { ## missing all snp genotypes
    x <- rep.int(5, n)
  }

  if(length(b) == 1) { ## X-linked, or non polymorphic snps, code as 2
    for(i in 1:n) {
      if(substr(snpG[i],1,1) == b && substr(snpG[i],2,2) == b) x[i] <- 2
      else x[i] <- 5  ## unknown genotype
    }
  }

  if(length(b) == 2) { ## regular autosomal snp
    for(i in 1:n) {
      alleleA <- substr(snpG[i],1,1)
      alleleB <- substr(snpG[i],2,2)
      if(!is.element(alleleA, b) || !is.element(alleleB, b)) x[i] <- 5
      else { if(alleleA == b[1]) x[i] <- 1; if(alleleB == b[1]) x[i] <- x[i]+1; }
    }
  }
  x
} # Slow snpRecode


## Recode SNP genotypes as 0, 1, 2
## #################################
#' @export
snpRecode <- function(snpG, designat) {

  b <- getBases(snpG)
  n <- length(snpG)
  x <- rep.int(5, n)

  if(length(b) == 1) { ## X-linked, or non polymorphic snps, code as 0 or 2
    if(b == designat[2]) { BB <- paste(b, b, sep=""); x[snpG == BB] <- 0 }
    if(b == designat[1]) { AA <- paste(b, b, sep=""); x[snpG == AA] <- 2 }
  }

  if(length(b) == 2) { ## regular autosomal snp
    if(b[1] != designat[1]) {
      tempb = b; b[1] = tempb[2]; b[2] = tempb[1]
      ## if Designated and actual bases mismatch use unkown '--'
      if(b[1] != designat[1]) b[1] <- b[2] <- '-'
    }
    AA <- paste(b[1], b[1], sep="")
    AB <- paste(b[1], b[2], sep="")
    BA <- paste(b[2], b[1], sep="")
    BB <- paste(b[2], b[2], sep="")

    x[snpG == AA] <- 2
    x[snpG == AB] <- 1
    x[snpG == BA] <- 1
    x[snpG == BB] <- 0
  }
  x
} # End snpRecode

#' @export
is.hwEq <- function(snpG, diff) {

  g <- table(snpG)
  g <- g[c("0","1","2")]
  g <- g[!is.na(g)] ## just in case of a missing 0, 1, or 2
  if(length(g) < 3) hwEq <- FALSE
  else {
    g <- g/sum(g)
    if(abs(sqrt(4*g[1]*g[3]) - g[2]) > diff) hwEq <- FALSE
    else hwEq <- TRUE
  }
  hwEq
} # End is.hwEq

#' @export
snpSelect <- function(ga.r, select.method=c("call.rate", "heterozygosity", "HW.Eq", "MAF"),
  call.rate.min = .85, hz.min = .01, hz.diff = .15, MAF.min = .005) {

  select.method <- match.arg(select.method)

  if(select.method == "call.rate") {
    poorThreshold = round((1 - call.rate.min)*(dim(ga.r)[2]))
    poor <- apply(ga.r, 1, function(x) table(x)["5"])
    ga.r[poor < poorThreshold,]
  }
  else if(select.method == "heterozygosity") {
    hzThreshold = round(hz.min*(dim(ga.r)[1]))
    hzNumber <- apply(ga.r, 2, function(x) table(x)["1"])
    hzNumber[is.na(hzNumber)] <- 0
    ga.r[,(hzNumber > hzThreshold)]
  }
  else if(select.method == "HW.Eq") {
    keep <- apply(ga.r, 2, function(x) is.hwEq(x, hz.diff))
    ga.r[,keep]
  }
  else {
    maf <-  apply(ga.r, 2, getMAF)
    ga.r[,(maf > MAF.min)]
  }
} # End snpSelect

#' @export
is.identical <- function(x, y, allow = .005) {

  n <- length(x)
  k <- round(allow*n)
  result <- 0
  if(.C("cIdentical", as.integer(x), as.integer(y), as.integer(k), as.integer(n),
  result = as.integer(result), PACKAGE="gdmp")$result) TRUE
  else FALSE
}

#' @export
GetHCS <- function(ga.r, Exclude=1:ncol(ga.r), allow = .005) {

  ga.r <- ga.r[,Exclude]
  n <- dim(ga.r)[2]
  l <- dim(ga.r)[1]
  k <- round(allow*l)
  iex <- .C("cGetHCS", as.integer(ga.r),iex=as.integer(rep.int(1,n)),as.integer(l),as.integer(n),as.integer(k), PACKAGE="gdmp")$iex
  Exclude[iex == 0]
}

#' @export
getMAF <- function(snpG) {
## Calculate Minor Allele Frequency for a single SNP
  maf <- 0
  g <- table(snpG)[c("0","1","2")]
  g <- g[!is.na(g)] ## just in case of a missing 0, 1, or 2
  if(length(g) == 3) maf <- (2*g[1] + g[2])
  else if(length(g) == 2) {
    if(!is.na(g["1"])) maf <- g["1"]
    if(!is.na(g["0"])) maf <- maf + (2*g["0"])
    else if(!is.na(g["2"])) maf <- maf + (2*g["2"])
  }

  maf <- maf/(2*sum(g))
  ifelse(maf <= .5, maf, 1-maf)
} # End getMAF



##
## The following functions are not documented and not included in NAMESPACE
##
toBeagle <- function(finalRep) {
## Function to turn final report file into an input BEAGLE file for the purpose of imputing
## Conditions for data in final report 'finalRep':
## 1. final report is a data frame with first 5 columns listed in the following order:
##     a. snp name, as factor, with equal number of snps per animal
##     b. animal id as factor
##     c. allele 1 (one character: A, C, G, or T)
##     d. allele 2 (one character: A, C, G, or T)
##     e. gencall
## 2. all snps of animal 1 are listed first followed by snps of animal 2, and so on.
## 3. snps are listed for each animal in the same order


snpNo <- nlevels(as.factor(finalRep[,1]))
animalNo <- nlevels(as.factor(finalRep[,2]))
k <- sample(2:(animalNo-1), 1)

## Check condition #3
if(animalNo > 1) {
  if(!all(finalRep[1:snpNo, 1] == finalRep[((k*snpNo+1):((k+1)*snpNo)), 1]))
    stop("Sequence of SNPs is not unique across animals")
}

## Check condition #1
if(sum(diff(table(finalRep[,2])))) stop("Variable number of SNPs per animal")
snps <- as.character(finalRep[1:snpNo,1]); animals <- as.character(unique(finalRep[,2]))

bf <- finalRep[1:snpNo, 3:4]
if(animalNo > 1) {
  for(i in 2:animalNo) bf <- cbind(bf, finalRep[((i-1)*snpNo+1):(i*snpNo), 3:4])
}

bf <- cbind(snps, bf)
bf <- cbind(rep("M", snpNo), bf)
colnames(bf) <- c("I", "id", rep(animals, each=2))
bf

} # End toBeagle
## x <- toBeagle(finalReport)
## write.table(x, file="./x.bgl", sep=" ", quote=F, row.names=F)


## Append two beagle-formatted arrays - add individuals 'horizontally'
## ###################################################################
biAppend = function(leftA, rightA) {

  ##if(all.equal(leftA[,2], rightA[,2])==TRUE)
  if(setequal(leftA[,2], rightA[,2]))
    leftA <- cbind(leftA, rightA[,-(1:2)])
  else {
    ## 1. Files that follow may contain 4275 SNPs without the 'ARS-' prefix
    ## The prefix exists in Illumina files such as allele report and final
    ## SNP list -- the prefix continues to version 2.
    u <- which((!is.element(rightA[,2], leftA[,2])))
    rightA[u,2] <- paste("ARS-", rightA[u,2], sep="")
    if(sum(is.element(rightA[,2], leftA[,2])) != dim(leftA)[1])
      stop("Something other than ARS-, chech and interfere manually!")

    ## 2. Order of SNPs is not the same
    leftA = cbind(leftA, rightA[leftA[,2], -(1:2)])
  }
  leftA
}

## Append two beagle-formatted arrays -- add markers 'vertically'
## ##############################################################
bmAppend = function(topA, bottomA) {

  ##if(all.equal(colnames(topA), colnames(bottomA))==TRUE)
  if(setequal(colnames(topA), colnames(bottomA)))
    topA <- rbind(topA, bottomA)
  else stop("To add markers, individuals in topA and bottomA must all be equal")
  topA
}

#
# Function to read phased/imputed beagle output file
# ##################################################
readBeagle <- function(bfile) {
  imputed <- read.table(file=bfile, header=T, check.names=F)
  imputed
}

#
# Function to transform beagle format to array format
# ###################################################
b2Array <- function(imputed) {
  row.names(imputed) <- imputed$id
  imputed$I <- imputed$id <- NULL

  Nsamples <- ncol(imputed)/2
  Nsnps <- nrow(imputed)

  sampleIDs <- colnames(imputed)[seq(1, 2*Nsamples, 2)]
  snpNames <- row.names(imputed)

  a <- matrix(0, nrow=Nsnps, ncol=Nsamples, dimnames=list(snpNames,sampleIDs))
  for(i in 1:Nsamples) a[,i] <- paste(imputed[,(2*i-1)], imputed[,(2*i)], sep="")

  t(a)
}

Any scripts or data that you put into this service are public.

gdmp documentation built on May 1, 2019, 8:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

gdmp
Genomic Data Management

R/dataEdit.R
In gdmp: Genomic Data Management

Defines functions toArray arrayAppend getBases snpRecode.nodesig snpRecode is.hwEq snpSelect is.identical GetHCS getMAF toBeagle biAppend bmAppend readBeagle b2Array

Documented in arrayAppend GetHCS getMAF is.hwEq is.identical snpRecode snpSelect toArray

Try the gdmp package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gdmp Genomic Data Management

R/dataEdit.R In gdmp: Genomic Data Management

Defines functions toArray arrayAppend getBases snpRecode.nodesig snpRecode is.hwEq snpSelect is.identical GetHCS getMAF toBeagle biAppend bmAppend readBeagle b2Array

Documented in arrayAppend GetHCS getMAF is.hwEq is.identical snpRecode snpSelect toArray

Try the gdmp package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gdmp
Genomic Data Management

R/dataEdit.R
In gdmp: Genomic Data Management