#' Convert Frequency Table into Raw Data
#'
#' Data with unique rows and a frequency column is converted into
#' data with duplicate rows.
#'
#' The ouput data frame can be used as input to \code{\link{bbl}}.
#'
#' @param data Data frame with factors in columns
#' @param freq Vector of frequency of each row in \code{data}
#' @return Data frame with one row per instances
#' @examples
#' Titanic
#' x <- as.data.frame(Titanic)
#' head(x)
#' titanic <- freq2raw(data=x[,1:3], freq=x$Freq)
#' head(titanic)
#' @export
freq2raw <- function(data,freq){
if(length(freq)!=NROW(data))
stop('Frequency length does not match data')
n <- nrow(data)
dat <- NULL
for(i in 1:n){
w <- data[rep(i,freq[i]),]
dat <- rbind(dat, w)
}
rownames(dat) <- seq_len(NROW(dat))
return(dat)
}
#' Read FASTA File
#'
#' Read nucleotide sequence files in FASTA format
#'
#' Sequence data in FASTA files are converted into data frame
#' suitable as input to \code{\link{bbl}}. If sequence lengths are different,
#' instances longer than those already read will be truncated. Empty sequences
#' are skipped.
#'
#' @param file File name of FASTA input.
#' @param rownames Use the sequence annotation line in file (starts with
#' \code{'>'}) as the row names. Will fail if there are duplicate items.
#' @return Data frame of each sequence in rows.
#' @examples
#' file <- tempfile('data')
#' write('>seq1', file)
#' write('atgcc', file, append=TRUE)
#' write('>seq2', file, append=TRUE)
#' write('gccaa', file, append=TRUE)
#' system(paste0('cat ',file))
#' x <- readFasta(file)
#' x
#' @export
readFasta <- function(file, rownames=FALSE){
if(!file.exists(file)) stop(paste0(file,' does not exist'))
fl <- readLines(file)
if(length(fl)==0) stop(paste0(file, ' is empty'))
label <- dat <- NULL
for(i in seq_along(fl)){
if(substr(fl[[i]],start=1,stop=1)=='>'){
x <- strsplit(fl[[i]],split='\t')[[1]]
x <- paste(x,collapse=' ')
label <- c(label, x)
}
else{
seqs <- strsplit(fl[[i]],split='')[[1]]
if(NROW(dat)>0){
if(length(seqs)<NCOL(dat)) next()
if(length(seqs)>NCOL(dat)) seqs <- seqs[seq_len(NCOL(dat))]
}
dat <- rbind(dat, as.data.frame(matrix(seqs,nrow=1)))
}
}
if(rownames){
if(length(label)!=NROW(dat))
stop('Annotation lines do not match sequences.')
rownames(dat) <- label
}
colnames(dat) <- seq_len(NCOL(dat))
return(dat)
}
#' Remove Non-varying Predictors
#'
#' Constant predictor is identified and removed
#'
#' Variables with only one factor level is removed from data. Intended
#' for use before calling \code{\link{bbl}}.
#' @param x Data frame containing discrete factor variables in each column
#' @return Data frame omitting non-varying variables from \code{x}.
#' @examples
#' set.seed(351)
#' nt <- c('a','c','g','t')
#' x <- data.frame(v1=sample(nt,size=50,replace=TRUE),
#' v2=rep('a',50),v3=sample(nt,size=50,replace=TRUE))
#' y <- sample(c('case','ctrl'),size=50,replace=TRUE)
#' dat <- cbind(data.frame(y=y), x)
#' summary(dat)
#' dat <- removeConst(dat)
#' summary(dat)
#' @export
removeConst <- function(x){
bad <- lapply(x, function(x){length(levels(factor(x)))==1})
bad <- unlist(bad)
return(x[,!bad])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.