Nothing
buildFeatureVector <- function(peaks, BSgenomeName=Drerio,
upstream=50L, downstream=40L,
wordSize=6L, alphabet=c("ACGT"),
sampleType=c("TP", "TN", "unknown"),
replaceNAdistance=40L,
method=c("NaiveBayes", "SVM"),
ZeroBasedIndex=1L, fetchSeq=FALSE)
{
if((!class(upstream) %in% c("integer", "numeric")) ||
(!class(downstream) %in% c("integer", "numeric")) ||
(!class(wordSize) %in% c("integer", "numeric")))
stop("upstream, downstream and wordSize must be objects of integer")
if(class(BSgenomeName)!="BSgenome")
stop("BSgenomeName must be an object BSgenome")
#### use peak end as the separator to get 50bp upstream and 40 bp downstream
#### for peak at negative strand, it is the reverse complement of the plus strand
if (fetchSeq == TRUE)
{
if (ZeroBasedIndex)
{
start(peaks) <- start(peaks) +1
end(peaks) <- end(peaks) + 1
}
seq <- getUpstreamSequence(peaks, upstream=upstream, genome=BSgenomeName)
upstream.seq <- seq$sequence
seq <- getDownstreamSequence(peaks , downstream=downstream,
genome=BSgenomeName)
downstream.seq <- seq$sequence
}
else
{
upstream.seq <- peaks$upstream.seq
end.up <- unlist(lapply(upstream.seq, nchar))
start.up <- end.up - upstream + 1
start.up <- unlist(lapply(start.up, function(i) {max(i,1)}))
upstream.seq <- substr(upstream.seq,start.up, end.up)
downstream.seq <- peaks$downstream.seq
end.down <- unlist(lapply(downstream.seq, function(i) {
min(nchar(i),downstream)
}))
downstream.seq <- substr(downstream.seq, 1, end.down)
}
if (method !="SVM")
{
newData <- do.call(rbind, lapply(1:length(upstream.seq),
function(i) {
t(count(s2c(as.character(upstream.seq[i])),
wordsize=wordSize,
alphabet=s2c(alphabet)))
}
))
newData[newData>0] <- 1
newData <- as.data.frame(newData)
newData[,1:ncol(newData)] <- lapply(newData[,1:ncol(newData)], factor)
}
else
{
newData <- cbind(as.character(upstream.seq),as.character(downstream.seq))
}
n.A <- unlist(lapply(1:length(downstream.seq), function(i) {
table(factor(s2c(as.character(downstream.seq[i])), levels=c("A")))
}))
n.C <- unlist(lapply(1:length(downstream.seq), function(i) {
table(factor(s2c(as.character(downstream.seq[i])), levels=c("C")))
}))
n.T <- unlist(lapply(1:length(downstream.seq), function(i) {
table(factor(s2c(as.character(downstream.seq[i])), levels=c("T")))
}))
n.G <- unlist(lapply(1:length(downstream.seq), function(i) {
table(factor(s2c(as.character(downstream.seq[i])), levels=c("G")))
}))
avg.distanceA2PeakEnd <- unlist(lapply(1:dim(newData)[1], function(i) {
temp <- mean(grep("A", s2c(as.character(downstream.seq[i]))))
if (is.na(temp)) {replaceNAdistance} else {temp}
}))
newData2 <- do.call(rbind, lapply(1:length(downstream.seq),
function(i) {
t(count(s2c(as.character(downstream.seq[i])),
wordsize=2,
alphabet=s2c(alphabet)))
}
))
newData2 <- as.data.frame(newData2)
if (sampleType == "TP")
{
newData1 <- data.frame(cbind(rep(1,dim(newData)[1]),
n.A,
n.C,
n.T,
n.G,
avg.distanceA2PeakEnd,
newData2,
newData))
colnames(newData1)[1] <- "y"
colnames(newData1)[2] <- "n.A.Downstream"
colnames(newData1)[3] <- "n.C.Downstream"
colnames(newData1)[4] <- "n.T.Downstream"
colnames(newData1)[5] <- "n.G.Downstream"
colnames(newData1)[6] <- "avg.distanceA2PeakEnd"
id <- 23:ncol(newData1)
}
else if (sampleType == "TN")
{
newData1 <- data.frame(cbind(rep(0,dim(newData)[1]),
n.A,
n.C,
n.T,
n.G,
avg.distanceA2PeakEnd,
newData2,
newData))
colnames(newData1)[1] <- "y"
colnames(newData1)[2] <- "n.A.Downstream"
colnames(newData1)[3] <- "n.C.Downstream"
colnames(newData1)[4] <- "n.T.Downstream"
colnames(newData1)[5] <- "n.G.Downstream"
colnames(newData1)[6] <- "avg.distanceA2PeakEnd"
id <- 23:ncol(newData1)
}
else
{
newData1 <- data.frame(cbind(n.A, n.C, n.T, n.G,
avg.distanceA2PeakEnd,
newData2,
newData))
colnames(newData1)[1] <- "n.A.Downstream"
colnames(newData1)[2] <- "n.C.Downstream"
colnames(newData1)[3] <- "n.T.Downstream"
colnames(newData1)[4] <- "n.G.Downstream"
colnames(newData1)[5] <- "avg.distanceA2PeakEnd"
id <- 22:ncol(newData1)
}
newData1$n.A.Downstream <- as.numeric(as.character(newData1$n.A.Downstream))
newData1$n.C.Downstream <- as.numeric(as.character(newData1$n.C.Downstream))
newData1$n.T.Downstream <- as.numeric(as.character(newData1$n.T.Downstream))
newData1$n.G.Downstream <- as.numeric(as.character(newData1$n.G.Downstream))
newData1$avg.distanceA2PeakEnd <- as.numeric(as.character(newData1$avg.distanceA2PeakEnd))
if (method != "SVM")
{
newData1 <- cbind(newData1, upstream.seq, downstream.seq)
}
else
{
nParam <- dim(newData1)[2]
u.index <- nParam-1
colnames(newData1)[u.index:nParam] <- c("upstream.seq", "downstream.seq")
}
if (fetchSeq)
{
rownames(newData1) <- as.character(names(seq))
}
else
{
rownames(newData1) <- as.character(names(peaks))
}
new("featureVector", data=newData1,
info=new("modelInfo", genome=BSgenomeName,
upstream=as.integer(upstream), downstream=as.integer(downstream),
wordSize=as.integer(wordSize), alphabet=alphabet))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.