R/raw.data.R
In snpReady: Preparing Genotypic Datasets in Order to Run Genomic Analysis

raw.data <- function(data, frame = c("long","wide"), hapmap = NULL, base = TRUE, sweep.sample= 1,
                     call.rate=0.95, maf=0.05, imput=TRUE, imput.type = c("wright", "mean","knni"),
                     outfile=c("012","-101","structure"), plot = FALSE)
{

  if (call.rate < 0 | call.rate > 1 | maf < 0 | maf > 1)
    stop("Treshold for call rate and maf must be between 0 and 1")
  
 if (sweep.sample < 0 | sweep.sample > 1)
      stop("Treshold for sweep.sample must be between 0 and 1")										  
  if(missing(outfile))
    outfile = "012"

	if(!is.matrix(data))
    stop("Data must be in matrix class")
  
  match.arg(frame)
  match.arg(imput.type)
  
  if(base){
    if (frame == "long"){
      if(ncol(data)>4)
        stop("For format long, the object must have four columns")
    
    bs <- unique(na.omit(as.vector(data[, 3:4])))
    if(!any(all(bs %in% c("A","C", "G", "T")) | all(bs %in% c("A", "B"))))
      stop("SNPs must be coded as nitrogenous bases (ACGT) or as A and B")
    
    if(!all(bs %in% c("A","C", "G", "T")) & outfile == "structure")
      stop("For outfile 'structure', SNPs must be coded as nitrogenous bases (ACGT)")
    
    sample.id <- sort(unique(data[,1L]))
    snp.name <- sort(unique(data[,2L]))
    
    col2row <- function(x, data){
      curId <- data[,1L] %in% x
      curSnp <- ifelse(is.na(data[curId, 3L]) | is.na(data[curId, 4L]), NA, 
                       paste(data[curId, 3L], data[curId, 4L], sep = ""))
      curPos <- match(snp.name, data[curId, 2L])
      if(any(is.na(curPos))){
        vec <- rep(NA,length(snp.name))
        vec[which(!is.na(curPos))] <- curSnp[na.omit(curPos)]
        }
      else{
        vec <- curSnp[curPos]
        return(vec)}
    }
    
    mbase <- sapply(sample.id, function(x) col2row(x, data))
    colnames(mbase) <- sample.id
    rownames(mbase) <- snp.name
    data <- t(mbase)
  } else{
    bs <- unique(unlist(strsplit(unique(data[!is.na(data)]), "")))
    if(!any(all(bs %in% c("A","C", "G", "T")) | all(bs %in% c("A", "B"))))
      stop("SNPs must be coded as nitrogenous bases (ACGT) or as AB")
    
    if(!all(bs %in% c("A","C", "G", "T")) & outfile == "structure")
      stop("For outfile 'structure', SNPs must be coded as nitrogenous bases (ACGT)")
  }
  
	whichBase <- all(bs %in% c("A","C", "G", "T"))											  
   count_allele <- function(m, nitrBase = T){
    #' @importFrom stringr str_count
	if(nitrBase){		   
    A <- matrix(str_count(m, "A"), ncol = ncol(m), byrow = FALSE)
    C <- matrix(str_count(m, "C"), ncol = ncol(m), byrow = FALSE)
    G <- matrix(str_count(m, "G"), ncol = ncol(m), byrow = FALSE)
    C[, colSums(A, na.rm = TRUE)!=0] <- 0
    G[, colSums(A, na.rm = TRUE)!=0 | colSums(C, na.rm = TRUE)!=0] <- 0
    res <- A + C + G
	}else{
        A <- matrix(str_count(m, "A"), ncol = ncol(m), byrow = FALSE)
        res <- A
    }	
    if (any(colSums(res, na.rm=TRUE) == 0))
      res[,colSums(res, na.rm=TRUE) == 0] <- 2
    res[is.na(m)] <- NA
    rownames(res) <- rownames(m)
    colnames(res) <- colnames(m)
    return(res)
  }
    
  m <- count_allele(data, nitrBase = whichBase)
  }else{
    if(frame == "long")
      stop("format long only accepts nitrogenous bases. Check base argument")
    
    if(outfile == "structure")
      stop("output for 'structure' only accepts nitrogenous bases. Check base argument")
    
    m <- data
  }
  
  if(is.null(colnames(m)))
    stop("Marker names are missing")
	
  if(!is.null(hapmap)){
    if(ncol(m) != nrow(hapmap))
      stop("The number of markers differ in the hapmap and the data")
    
    hapmap <- hapmap[order(hapmap[,2L], hapmap[,3L], na.last = TRUE, decreasing = F), ]
    
    tmp <- match(hapmap[,1L], colnames(m)) #match between marker names
    
    if(any(is.na(tmp)))
      stop("Marker names in data does not match the marker names in hapmap")
											
    
    m <- m[,tmp]
    
    tmpd <- match(hapmap[,1L], colnames(data)) #match between marker names
    data <- data[,tmpd]
  }
  
  idName <- rownames(m)
  markerName <- colnames(m)
	
  missID <- rowMeans(is.na(m))
  posSS <- missID <= sweep.sample #selected by ID call rate
  
  #id.rmv <- rownames(m)[missID > sweep.sample]
  m <- m[posSS,]
  data <- data[posSS,]
  
  CR <- 1 - colMeans(is.na(m))
  poscr <- CR >= call.rate #selected by CR							  
  p <- colMeans(m, na.rm = T)/2
  minor <- apply(cbind(p, 1-p), 1, min)
  minor[is.nan(minor)] <- 0
  posmaf <- minor >= maf #selected by maf	
  #snp.rmv <- vector("list", 2)
  #snp.rmv[[1]] <- colnames(m)[CR < call.rate]
  #snp.rmv[[2]] <- colnames(m)[minor < maf]
  #position <- (CR >= call.rate) & (minor >= maf)
  if (sum(poscr & posmaf)==0L)
     stop("All markers were removed. Try again with another treshold for CR and MAF")
   
  m <- m[, poscr & posmaf]
  data <- data[, poscr & posmaf]
  
  if (imput){
    if(missing(imput.type) | is.null(imput.type))
      stop("A imput type must be chosen")
    
    
  if (any(CR[poscr & posmaf] == 0))
      stop("There are markers with all missing data. There is no way to
           impute. Try again using another call rate treshold")
  
   if(imput.type == "wright"){
      
      if (any(missID[posSS] == 1L))
        stop("There are samples with all missing data. There is no way to do
             imputation. Try again using another sweep.sample treshold")
      
							   
      f <- rowMeans(m != 1, na.rm = TRUE)
      f[is.nan(f)] <- 1
      m <- input.fun(m=m, p=p[poscr & posmaf], f=f)
    } 
    if(imput.type == "mean"){
      tmp <- which(is.na(m), arr.ind = TRUE)
      m[tmp] <- colMeans(m, na.rm = T)[tmp[,2]]
    }
    if(imput.type == "knni"){
      m <- t(impute.knn(t(m), 2)$data)
    }
  }
  
  switch(outfile,
         "-101" = {
           m[m == 0] <- -1
           m[m == 1] <- 0
           m[m == 2] <- 1
           },
         "structure" = {
           tmp <- lapply(as.data.frame(data), function(x){
             curCol <- strsplit(as.character(x), split = "")
             curCol[ which(is.na(curCol))] <- list(rep(NA, 2))
             res <- unlist(curCol)
             return(res)} )
           m <- as.matrix(do.call(cbind, tmp))
           colnames(m) <- colnames(data)
           rownames(m) <- rep(rownames(data), each=2)
           m <- chartr("ACGT", "1234", m)
           m[is.na(m)] <- -9
         },
         "012" = {
           m <- m
         },
         {
         stop("Output selected is not available")
           }
         )
  
  report <- list(maf = list(r = paste(sum(!posmaf), "Markers removed by MAF =", maf, sep = " "),
                            whichID = markerName[!posmaf]),
                 cr = list(r = paste(sum(!poscr), "Markers removed by Call Rate =", call.rate, sep=" "),
                           whichID = markerName[!poscr]),
                 sweep = list(r = paste(sum(!posSS), "Samples removed by sweep.sample =", sweep.sample, sep = " "),
                              whichID = idName[!posSS]),
                 imput = ifelse(imput, paste(sum(is.na(data)), "data points were inputed = ", round((sum(is.na(data))/length(data))*100, 2), "%"),
                                "No data point was imputed"))
  
  for(i in 1:3){
    if(length(report[[i]]$whichID) == 0)
      report[[i]]$whichID <- NULL
  }
  
  storage.mode(m) <- "numeric"
  if(is.null(hapmap)){
    return(list(M.clean = m, report = report))
  } else{
    
    if(plot){
      nimp <- aggregate(colSums(is.na(data)), by = list(hapmap[poscr & posmaf, 2]), FUN = sum)$x
      
      bychrom <- t(cbind("cr" = table(hapmap[!poscr | (!poscr & !posmaf) , 2])/table(hapmap[,2]), 
                         "maf" = table(hapmap[!posmaf & poscr, 2])/table(hapmap[,2]),
                         "n.imput" = nimp/(table(hapmap[, 2])*nrow(data))) )
      barplot.rd(bychrom)
    }
    
    hap <- hapmap[poscr & posmaf, ]															  
    colnames(hap) <- c("rs","chrom","pos")	  
    return(list(M.clean = m, Hapmap = hap, report = report))
  }
}

barplot.rd <- function(data, col = c("#7293CB", "#E1974C", "#D35E60"), plotName = "QCreport"){
  pdf = pdf(paste(plotName,".pdf", sep = ""), width = 10, height = 4)
  
  barplot(data, beside = TRUE, horiz = FALSE, ylab = "frequency", axes = F,
          xlab = "chromosome", ylim = c(0, max(data) ), 
          main = "Quality control", col = col)
  axis(side = 2, las = 2)
  par(new = T, mar = c(2,2,2,0))
  plot(1, type = "n", xlab = "", ylab = "", bty="n", axes = F)
  legend(x = "topright", fill = col, legend = c("Maf", "CR", "Imput"), 
         bty = "n")
  dev.off()
}

samplefp <- function(p, f){
  samp <- sample(c(0,1,2), 1,
                 prob=c(((1-p)^2+((1-p)*p*f)),
                        (2*p*(1-p)-(2*p*(1-p)*f)),
                        (p^2+((1-p)*p*f))))
  return(as.integer(samp))
}

input.fun <- function(m, p, f){
  indicesM <- which(x = is.na(m), arr.ind = TRUE)
  m[indicesM] <- mapply(samplefp, p[indicesM[,2]], f[indicesM[,1]])
  return(m)
}

Any scripts or data that you put into this service are public.

snpReady documentation built on May 2, 2019, 1:46 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

snpReady
Preparing Genotypic Datasets in Order to Run Genomic Analysis

R/raw.data.R
In snpReady: Preparing Genotypic Datasets in Order to Run Genomic Analysis

Try the snpReady package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

snpReady Preparing Genotypic Datasets in Order to Run Genomic Analysis

R/raw.data.R In snpReady: Preparing Genotypic Datasets in Order to Run Genomic Analysis

Try the snpReady package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

snpReady
Preparing Genotypic Datasets in Order to Run Genomic Analysis

R/raw.data.R
In snpReady: Preparing Genotypic Datasets in Order to Run Genomic Analysis