In as4378/LabelledOpart: Optimal Partitioning

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(data.table)
library(directlabels)
library(ggplot2)
library(penaltyLearning)
library(fpop)
library(Segmentor3IsBack)

Labelled Optimal Partitioning

Loading the dataset:

selData <- as.data.table(read.csv(file="../data/signal_list.csv"))
labels <- as.data.table(read.csv(file="../data/labels.csv"))
selData <- selData[order(position)]
head(selData)
head(labels)
modifiedLabels <- data.frame("start" = c(), "end" = c(), "breaks" = c())
for(i in 1:nrow(labels)){
  min <- labels[[i,4]]
  max <- labels[[i,5]]
  positions <- selData[position >= min & position <= max][,2]
  modifiedLabels <- rbind(modifiedLabels, data.frame(start=min(positions),                                                      end=max(positions), breaks=1))
}

Plot the given data:

zoom.gg.lab <- ggplot()+
  geom_point(aes(position, logratio),
             data=selData,
             shape=1)+
  scale_x_continuous(name="position", limits=c(0, 51194612)) +
  scale_y_continuous(
    "logratio (approximate DNA copy number)")+
  penaltyLearning::geom_tallrect(aes(
    xmin=start, xmax=end, fill=breaks),
    alpha=0.4,
    color=NA,
    data=modifiedLabels)

#print(zoom.gg.lab)

Using Segmentor3IsBack we get:

sfit <- Segmentor(data=selData$logratio, model = 2, Kmax = 2*nrow(labels))
breaks <- sfit@breaks[nrow(labels),]
positions <- selData[breaks]

#zoom.gg.lab +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=1,
#    linetype="dashed")


breaks <- sfit@breaks[nrow(labels) + 5,]
positions <- selData[breaks]

#zoom.gg.lab +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=1,
#    linetype="dashed")


breaks <- sfit@breaks[nrow(labels) + 10,]
positions <- selData[breaks]

#zoom.gg.lab +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=1,
#    linetype="dashed")


breaks <- sfit@breaks[nrow(labels) + 20,]
positions <- selData[breaks]

#zoom.gg.lab +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=1,
#    linetype="dashed")


breaks <- sfit@breaks[2*nrow(labels),]
positions <- selData[breaks]

#zoom.gg.lab +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=1,
#    linetype="dashed")

changePointCount = data.frame("start" = c(), "count" = c())

for(i in 1:nrow(modifiedLabels)){
    begin = modifiedLabels[i,1]
    # end = labels[i,2]
    changePointCount <- rbind(changePointCount, data.frame(start = begin, 
                                                           count = 0))
}
falsePositives = 0
errors = 0
#find the misclassifications
for(i in 1:nrow(positions)){
  currentPosition = positions[i]$position
  for(i in 1:nrow(modifiedLabels)){
    begin = modifiedLabels[i,1]
    end = modifiedLabels[i,2]

    if(currentPosition >= begin && currentPosition <= end){
      if(isTRUE(changePointCount[changePointCount$start == begin,]$count > 0))
      {
        falsePositives = falsePositives + 1  
      }
      else{
        changePointCount[changePointCount$start == begin,]$count = 1
      }
    }
  }
}
falseNegatives = nrow(changePointCount[changePointCount$count==0,])

#falsePositives
#errors

limit = 2*nrow(labels)
results <- data.frame("K" = c(), "FalsePositives" = c(), "Errors" = c())
for(i in 1:limit){
  changePointCount = data.frame("start" = c(), "count" = c())
  for(l in 1:nrow(modifiedLabels)){
    begin = modifiedLabels[l,1]
    # end = labels[i,2]
    changePointCount <- rbind(changePointCount, data.frame(start = begin, 
                                                           count = 0))
  }
  breaks <- sfit@breaks[i,]
  positions <- selData[breaks]
  falsePositives = 0
  errors = 0
  #find the misclassifications
  for(j in 1:nrow(positions)){
    currentPosition = positions[j]$position
    for(k in 1:nrow(modifiedLabels)){
      begin = modifiedLabels[k,1]
      end = modifiedLabels[k,2]

      if(currentPosition >= begin && currentPosition <= end){
        if(isTRUE(changePointCount[changePointCount$start == begin,]$count > 0))
        {
          falsePositives = falsePositives + 1  
        }
        else{
          changePointCount[changePointCount$start == begin,]$count = 1
        }
      }
    }
  }
  errors = nrow(changePointCount[changePointCount$count==0,])

  results <- rbind(results, data.frame(K=i, FalsePositives = falsePositives,
                                       Errors = errors))
}

#results

Running the labelled optimal partitioning on the dataset we get:

labels <- data.frame("start" = c(), "end" = c(), "breaks" = c())
for(i in 1:nrow(modifiedLabels)){
  start <- modifiedLabels[i, 1]
  end <- modifiedLabels[i, 2]
  index1 <- which(selData[,position] == start)
  index2 <- which(selData[,position] == end)
  labels <- rbind(labels, data.frame(start = index1, end = index2, breaks = 1))
}
labels <- labels[order(labels$start),]

labelled_fit <- LabelledOpart::labelled_opart_gaussian(selData$logratio, labels, 1000000000)

positions <- selData[labelled_fit$end.vec]

#zoom.gg.lab +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=1,
#    linetype="dashed")

Loading the profile614chr2 dataset we get:

selData <- as.data.table(read.csv(file="../data/profile614chr2signal.csv"))
labels <- as.data.table(read.csv(file="../data/profile614chr2labels.csv"))
selData <- selData[order(position)]
modifiedLabels <- data.frame("start" = c(), "end" = c(), "breaks" = c())

for(i in 1:nrow(labels)){
  min <- labels[[i,2]]
  max <- labels[[i,3]]
  annotation <- labels[[i,4]]
  positions <- selData[position >= min & position <= max][,2]

  if(annotation == "0breakpoints"){
  modifiedLabels <- rbind(modifiedLabels, data.frame(start=min(positions), 
                                                     end=max(positions), breaks=0))
  }
  else{
  modifiedLabels <- rbind(modifiedLabels, data.frame(start=min(positions), 
                                                     end=max(positions), breaks=1))  
  }
}

head(modifiedLabels)

Plotting the given dataset:

zeroLabels <- modifiedLabels[modifiedLabels$breaks == 0,]
oneLabels <- modifiedLabels[modifiedLabels$breaks == 1,]

zoom.gg.lab <- ggplot()+
  geom_point(aes(position, logratio),
             data=selData,
             shape=1)+
  scale_x_continuous(name="position", limits=c(0, 51194612)) +
  scale_y_continuous(
    "logratio (approximate DNA copy number)")+
  penaltyLearning::geom_tallrect(aes(
    xmin=start, xmax=end, fill=breaks),
    fill="yellow",
    color="yellow",
    alpha = 0.5,
    data=zeroLabels)+
  penaltyLearning::geom_tallrect(aes(
    xmin=start, xmax=end, fill=breaks),
    fill="pink",
    color="pink",
    alpha = 0.5,
    data=oneLabels)

print(zoom.gg.lab)

Running labelled opart on this dataset we get:

labels <- data.frame("start" = c(), "end" = c(), "breaks" = c())
for(i in 1:nrow(modifiedLabels)){
  start <- modifiedLabels[i, 1]
  end <- modifiedLabels[i, 2]
  b <- modifiedLabels[i, 3]
  index1 <- which(selData[,position] == start)
  index2 <- which(selData[,position] == end)

  labels <- rbind(labels, data.frame(start = index1, end = index2, breaks = b))
}
labels <- labels[order(labels$start),]
labels


labelled_fit <- LabelledOpart::labelled_opart_gaussian(selData$logratio, labels, 2)

fpop_fit <- fpop::Fpop(selData$logratio, 2)
fpop_pos <- selData[fpop_fit$t.est]

positions <- selData[labelled_fit$end.vec]

core <- ggplot()+
  geom_point(aes(position, logratio),
             data=selData,
             shape=1)+
  scale_x_continuous(name="position", limits=c(0, 15194612)) +
  scale_y_continuous(
    "logratio (approximate DNA copy number)")+
  penaltyLearning::geom_tallrect(aes(
    xmin=start, xmax=end, fill=breaks),
    fill="yellow",
    color="yellow",
    alpha = 0.5,
    data=zeroLabels)+
  penaltyLearning::geom_tallrect(aes(
    xmin=start, xmax=end, fill=breaks),
    fill="pink",
    color="pink",
    alpha = 0.5,
    data=oneLabels)

#core +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=0.7,
#    linetype="dashed")

labelled_fit <- LabelledOpart::labelled_opart_gaussian(selData$logratio, labels, 1)
positions <- selData[labelled_fit$end.vec]

fpop_fit <- fpop::Fpop(selData$logratio, 1)
fpop_pos <- selData[fpop_fit$t.est]

#core +
#  geom_vline(aes(
#    xintercept=position),
#    data=positions,
#    color="green",
#    size=0.7,
#    linetype="dashed")

add.x.var <- function(df, x.var){
  data.frame(df, x.var=factor(x.var, c("lopart", "fpop")))
}

labels <- labels[1:3,]
labels

labelled_fit <- LabelledOpart::labelled_opart_gaussian(selData$logratio, labels, 1)
positions <- selData[labelled_fit$end.vec]

fpop_fit <- fpop::Fpop(selData$logratio, 1)
fpop_pos <- selData[fpop_fit$t.est]

panels <- ggplot() + 
          geom_point(aes(position, logratio),
             data=add.x.var(selData, "lopart"),
             shape=1)+
          geom_vline(aes(
            xintercept=position),
            data=add.x.var(positions, "lopart"),
            color="green",
            size=0.7,
            linetype="dashed")+

          geom_point(aes(position, logratio),
             data=add.x.var(selData, "fpop"),
             shape=1)+
          geom_vline(aes(
            xintercept=position),
            data=add.x.var(fpop_pos, "fpop"),
            color="green",
            size=0.7,
            linetype="dashed")+
          scale_x_continuous(name="position", limits=c(0, 40194612)) +
          scale_y_continuous("logratio (approximate DNA copy number)")+
          penaltyLearning::geom_tallrect(aes(
            xmin=start, xmax=end, fill=breaks),
            fill="yellow",
            color="yellow",
            alpha = 0.5,
            data=zeroLabels)+
          penaltyLearning::geom_tallrect(aes(
            xmin=start, xmax=end, fill=breaks),
            fill="pink",
            color="pink",
            alpha = 0.5,
            data=oneLabels)+
                   facet_grid(x.var ~ ., scales="free")
print(panels)

labelled_fit <- LabelledOpart::labelled_opart_gaussian(selData$logratio, labels, 100000)
positions <- selData[labelled_fit$end.vec]
positions <- positions[1,]
positions
fpop_fit <- fpop::Fpop(selData$logratio, 100000)
fpop_pos <- selData[fpop_fit$t.est]
fpop_pos

panels <- ggplot() + 
          geom_point(aes(position, logratio),
             data=add.x.var(selData, "lopart"),
             shape=1)+

          geom_point(aes(position, logratio),
             data=add.x.var(selData, "fpop"),
             shape=1)+
          scale_x_continuous(name="position", limits=c(0, 40194612)) +
          scale_y_continuous("logratio (approximate DNA copy number)")+
          penaltyLearning::geom_tallrect(aes(
            xmin=start, xmax=end, fill=breaks),
            fill="yellow",
            color="yellow",
            alpha = 0.5,
            data=zeroLabels)+
          penaltyLearning::geom_tallrect(aes(
            xmin=start, xmax=end, fill=breaks),
            fill="pink",
            color="pink",
            alpha = 0.5,
            data=oneLabels)+
          geom_vline(aes(
            xintercept=position),
            data=add.x.var(positions, "lopart"),
            color="green",
            size=0.7,
            linetype="dashed")+
                   facet_grid(x.var ~ ., scales="free")
print(panels)