R/treethresh.R
In treethresh: Methods for Tree-Based Local Adaptive Thresholding

Documented in estimate.sdev estimate.sdev.imwd estimate.sdev.wd extract.coefficients extract.coefficients.imwd extract.coefficients.wd get.t get.t.treethresh get.t.wtthresh get.w get.w.treethresh get.w.wtthresh insert.coefficients insert.coefficients.imwd insert.coefficients.wd prune prune.treethresh prune.wtthresh subtree subtree.treethresh subtree.wtthresh thresh thresh.treethresh thresh.wtthresh treethresh wavelet.treethresh wtthresh

######################### TIM

subtree <- function(object,C)
  UseMethod("subtree")

prune <- function(object,v=5,sd.mult=0.5,plot=TRUE)
  UseMethod("prune")

get.w <- function(object,C)
  UseMethod("get.w")

get.t <- function(object,C)
  UseMethod("get.t")

thresh <- function(object,data,C,postmed=TRUE)
  UseMethod("thresh")

treethresh <- function(data,beta,criterion="score",control=list(),rho=sys.frame(sys.parent())) {
  use.beta <- FALSE
  use.data <- NULL
  if (!missing(beta) && !is.null(beta)) {
    use.beta <- TRUE
    use.data <- as.array(beta)
  } else {
    if (!missing(data) && !is.null(data)) {
      use.data <- as.array(data)
    }
  }
  if (is.null(use.data))
    stop("You either need to specify the original data sequence or the betas.")
  default.control <- list(max.iter=30, max.depth=10, minimum.width=3, minimum.size=5^length(dim(use.data)), tolerance.grad=1e-8, tolerance=1e-6, absolute.improvement=-Inf, relative.improvement=-Inf, absolute.criterion=0, beta.max=1e5, a=0.5, crit.signif=0.90, lr.signif=0.5, first.step=0.2)
  if (!is.list(control))
    stop("The control argument must be a list.")
  for (name in names(default.control)) {
    if (!(name %in% names(control)))
      control[[name]] <- default.control[[name]]
  }
  list.diff <- setdiff(names(control),names(default.control))
  if (length(list.diff)>0) {
    warning.text <- "The following arguments in the control list were ignored:"
    for (elt in list.diff)
      warning.text <- paste(warning.text," \"",elt,"\"",sep="")
    warning(warning.text)
  }
  if (!is.function(criterion)) {
    possible.criteria <- c("score","likelihood","heuristic")
    criterion <- possible.criteria[pmatch(criterion,possible.criteria)]
  }
  data.dim <- dim(use.data)
  result<-.External(C_fit_tree,data=as.numeric(use.data),dims=as.integer(length(data.dim)),size=as.integer(data.dim),use.beta=as.logical(use.beta),criterion=criterion,control=control,rho=rho)
  names(result) <- c("membership","splits","beta")
  result$membership <- array(result$membership,dim=data.dim)
  result$beta <- array(result$beta,dim=data.dim)
  dimnames(result$splits)=list(NULL,c("id","parent.id","dim","pos","left.child.id","right.child.id","crit","w","t","loglikelihood","alpha","C"))
  if (!all(is.na(result$splits[,"alpha"])))
    result$splits[,"C"] <- result$splits[,"alpha"]/max(result$splits[,"alpha"],na.rm=TRUE)
  else
    result$splits[,"C"] <- NA
  result$splits[,"t"] <- tfromw(result$splits[,"w"])
  result$control <- control
  result$criterion <- criterion
  if (!missing(data))
    result$data <- data
  class(result) <- "treethresh"
  result
}

subtree.treethresh <- function(object,C) {
  if (missing(C) || is.null(C))
    return(object)
  if (all(is.na(object$splits[,"C"])))
    return(object)
  delete.me <- rep(FALSE,nrow(object$splits))
  for (i in nrow(object$splits):2) {
    if (object$splits[i,"C"]<C) {
      delete.me[i] <- TRUE
      parent.id <- abs(object$splits[i,"parent.id"])
      object$splits[parent.id==object$splits[,"id"],3:7] <- NA
      object$membership[object$membership==object$splits[i,"id"]] <- parent.id
    }
  }
  object$splits <- object$splits[!delete.me,,drop=FALSE]
  object
}

get.w.treethresh <- function(object,C) {
  if (!missing(C))
    if (!is.null(C))
      object <- subtree.treethresh(object,C)
  map <- numeric(max(object$splits[,"id"]))
  map[object$splits[,"id"]] <- 1:nrow(object$splits)
  array(object$splits[map[as.vector(object$membership)],"w"],dim=dim(object$membership))
}

get.t.treethresh <- function(object,C) {
  if (!missing(C))
    if (!is.null(C))
      object <- subtree.treethresh(object,C)
  map <- numeric(max(object$splits[,"id"]))
  map[object$splits[,"id"]] <- 1:nrow(object$splits)
  array(dim=dim(object$membership),object$splits[map[as.vector(object$membership)],"t"])
}




# Compute best C in cross-validation
prune.treethresh <- function(object,v=5,sd.mult=0.5,plot=TRUE) {
  sizes <- function(object) {
    C <- object$splits[-1,"C"]
    C.values <- sort(unique(C))
    c(apply(outer(C,C.values,function(a,b) as.numeric(a>=b)),2,sum)/2+1,1)
  }
  if (is.na(v))
    return(object)
  if (all(is.na(object$splits[,"C"])))
    return(object)
  # Split the dataset into v portions
  permut <- sample(length(object$data))
  sample.size <- length(object$data)/v
  C.values <- sort(as.numeric(na.omit(c(unique(object$splits[,"C"]),2))))
  results <- matrix(nrow=v,ncol=length(C.values))
  to <- 0
  # Fitting v cv models
  for (j in 1:v) {
    from <- to + 1
    to <- round(sample.size*j)
    test.index <- permut[from:to]
    a.blanked <- object$data[test.index]
    if (is.function(object$criterion) || (object$criterion=="heuristic")) {
      a.data <- object$data
      a.data[test.index] <- NA
      a.tree <- treethresh(data=a.data,control=object$control,criterion=object$criterion)
    } else {
      a.beta <- object$beta
      a.beta[test.index] <- NA
      a.tree <- treethresh(beta=a.beta,control=object$control,criterion=object$criterion)
    }
    for (k in 1:length(C.values)) {
      a.tree <- subtree.treethresh(a.tree,C=C.values[k])
      w <- get.w.treethresh(a.tree,C=NULL)[test.index]
      beta <- pmin(1e5,object$beta[test.index])
      results[j,k] <- sum(log(1+w*beta))
    }
  }
  results <- results - results[,ncol(results)] 
  results.mean <- apply(results,2,mean)
  results.sd <- apply(results,2,sd)
  lower.bound <- (results.mean-sd.mult*results.sd)[which.max(results.mean)]
  if (plot) {
    results.range <- c(min(results.mean-results.sd),max(results.mean+results.sd))
    ns <- 1:length(results.mean)
    plot(ns,results.mean,xaxt="n",ylab="Loglikelihood from CV (+const)",type="o",xlab="C",ylim=results.range)
    segments(ns, results.mean - results.sd, ns, results.mean + results.sd)
    axis(1, at = ns, labels = c(format(C.values[-length(C.values)],digits=3),">1"))
    mtext("number of regions", side = 3, line = 3)
    axis(3, at = ns, labels = as.character(sizes(object)))
    abline(h=lower.bound,lty=3)
  }
  subtree.treethresh(object,C.values[max(which(results.mean>=lower.bound))])
}

thresh.treethresh <- function(object,data,C,postmed=TRUE) {
  if (missing(data) && !is.null(object$data))
    data <- object$data
  data.dim <- dim(data)
  if (postmed) {
    w <- get.w.treethresh(object,C)
    if (length(data)!=length(w))
      stop("Data does not have the right length.")
    data <- postmed.laplace(data,w)
  } else {
    t <- get.t.treethresh(object,C)
    if (length(data)!=length(t))
      stop("Data does not have the right length.")
    data[abs(data)<t] <- 0
  }
  dim(data) <- data.dim
  data
}

######################### TIM_WAVE

# Input for data or betas is a list 
wtthresh <- function(data,beta,weights,control=list()) {
  use.beta <- FALSE
  src.data <- NULL
  if (!missing(beta) && !is.null(beta)) {
    use.beta <- TRUE
    src.data <- beta
    data <- NULL
  } else {
    if (!missing(data) && !is.null(data)) {
      src.data <- data
    }
  }
  if (missing(weights))
    weights <- rep(1,length(src.data))
  if (length(weights)!=length(src.data))
    stop("There must be a weight for every array in the data list.")
  if (is.null(src.data))
    stop("You either need to specify the original data sequence or the betas.")
  use.data <- list()
  data.dim <- length(dim(src.data[[1]]))
  for (i in 1:length(src.data)) {
    use.data[[i]] <- as.array(src.data[[i]])
    cur.dim <- dim(use.data[[i]])
    if (length(unique(cur.dim))!=1)
      stop("Input data needs to be a list of squared arrays.")
    if (data.dim!=length(cur.dim))
      stop("Input data must have identical dimensions.")
  }
  default.control <- list(max.iter=30, max.depth=10, minimum.width=4, min.minimum.width=1, minimum.size=8^data.dim, min.minimum.size=4^data.dim,tolerance.grad=1e-8, tolerance=1e-6, absolute.improvement=-Inf, relative.improvement=-Inf, absolute.criterion=0, beta.max=1e5, a=0.5, rescale.quantile=0.5, lr.signif=0.5, first.step=0.2, min.width.scale.factor=1, min.size.scale.factor=1)
  if (!is.list(control))
    stop("The control argument must be a list.")
  for (name in names(default.control)) {
    if (!(name %in% names(control)))
      control[[name]] <- default.control[[name]]
  }
  list.diff <- setdiff(names(control),names(default.control))
  if (length(list.diff)>0) {
    warning.text <- "The following arguments in the control list were ignored:"
    for (elt in list.diff)
      warning.text <- paste(warning.text," \"",elt,"\"",sep="")
    warning(warning.text)
  }
  result<-.External(C_fit_tree_wave,data=use.data,dims=data.dim,use.beta=as.logical(use.beta),weights=as.double(weights),control=control)
  names(result) <- c("splits","details","w","t","membership","beta")
  dimnames(result$splits)=list(NULL,c("id","parent.id","dim","pos","left.child.id","right.child.id","crit","loglikelihood","alpha","C"))
  if (!all(is.na(result$splits[,"alpha"]))) {
    result$splits[,"C"] <- result$splits[,"alpha"]/max(result$splits[,"alpha"],na.rm=TRUE)
  } else {
    result$splits[,"C"] <- NA
  }
  result$t <- matrix(tfromw(result$w),nrow=nrow(result$w))
  result$data <- data
  result$weights <- weights
  result$control <- control
  if (is.null(result$beta)) {
    result$beta <- beta
  } else {
    for (i in 1:length(result$beta))
      result$beta[[i]] <- array(result$beta[[i]],dim=dim(result$data[[i]]))
  }
  for (i in 1:length(result$membership))
    result$membership[[i]] <- array(result$membership[[i]],dim=dim(result$beta[[i]]))
  class(result) <- "wtthresh"
  result
}

subtree.wtthresh <- function(object,C=NULL) {
  if (missing(C) || is.null(C))
    return(object)
  if (all(is.na(object$splits[,"C"])))
    return(object)
  if (is.null(object$splits))
    stop("Not the right type of object")
  nrow <- nrow(object$splits)
  old.types <- as.integer(object$splits[,"id"])
  result <- .C(C_prune_tree,splits=as.double(object$splits),nrow=as.integer(nrow),kill=integer(nrow),leaf=as.integer(is.na(object$splits[,"dim"])),membership=as.integer(old.types),C=as.double(C),NAOK=TRUE)
  result$leaf <- as.logical(result$leaf)
  result$kill <- as.logical(result$kill)
  object$splits[result$leaf,3:7] <- NA
  object$details[result$leaf] <- NA
  object$splits <- object$splits[!result$kill,,drop=FALSE]
  object$details <- object$details[!result$kill,,drop=FALSE]
  object$w <- object$w[!result$kill,,drop=FALSE]
  object$t <- object$t[!result$kill,,drop=FALSE]
  for (i in 1:length(object$membership)) {
    len <- length(object$membership[[i]])
    object$membership[[i]] <- array(.C(C_update_membership,old.membership=as.integer(object$membership[[i]]),new.membership=integer(len),n=as.integer(len),old.types=as.integer(old.types),new.types=as.integer(result$membership),n.types=as.integer(length(old.types)),NAOK=TRUE)$new.membership,dim=dim(object$membership[[i]]))
  }
  object
}


get.w.wtthresh <- function(object,C) {
  if (!missing(C))
    if (!is.null(C))
      object <- subtree.wtthresh(object,C)
  map <- numeric(max(object$splits[,"id"]))
  map[object$splits[,"id"]] <- 1:nrow(object$splits)
  result <- list()
  for (k in 1:length(object$beta)) {
    result[[k]] <- array(object$w[map[as.vector(object$membership[[k]])],k],dim=dim(object$membership[[k]]))
  }
  result
}

get.t.wtthresh <- function(object,C) {
  if (!missing(C))
    if (!is.null(C))
      object <- subtree.wtthresh(object,C)
  map <- numeric(max(object$splits[,"id"]))
  map[object$splits[,"id"]] <- 1:nrow(object$splits)
  result <- list()
  for (k in 1:length(object$beta)) {
    result[[k]] <- array(object$t[map[as.vector(object$membership[[k]])],k],dim=dim(object$membership[[k]]))
  }
  result
}

# Compute best C in cross-validation
prune.wtthresh <- function(object,v=5,sd.mult=0.5,plot=TRUE) {
  reep <- function(max,cur,dim) {
    reep <- 1:max
    if (dim>1) {
      for (pos in 2:dim) 
        if (pos<=cur)
          reep <- rep(reep,each=max)
        else
          reep <- rep(reep,max)
    }
    reep
  }
  create.coords <- function(index,size,dim,factor) {
    coords <- matrix(nrow=length(index)*factor^dim,ncol=dim)
    index <- index-1
    multiple <- 1
    pos <- rep(1,length(index)*factor^dim)
    for (i in 1:dim) {
      cur.coords <- index%%size
      cur.coords <- factor*rep(cur.coords,each=factor^dim)+reep(factor,i,dim)
      coords[,dim+1-i] <- cur.coords
      index <- index%/%size
      pos <- pos+(cur.coords-1)*multiple
      multiple <- multiple*size*factor
    }
    list(coords=coords,pos=pos)
  }
  get.sizes <- function(object) {
    C <- object$splits[-1,"C"]
    C.values <- sort(unique(C))
    c(apply(outer(C,C.values,function(a,b) as.numeric(a>=b)),2,sum)/2+1,1)
  }
  if (is.na(v))
    return(object)
  if (all(is.na(object$splits[,"C"])))
    return(object)
  sizes <- sapply(object$beta,length)
  data.dim <- length(dim(object$beta[[1]]))
  # Split the dataset into v portions
  permut <- sample(min(sizes))
  sample.size <- min(sizes)/v
  C.values <- sort(as.numeric(na.omit(c(unique(object$splits[,"C"]),2))))
  results <- matrix(0,nrow=v,ncol=length(C.values))
  to <- 0
  for (j in 1:v) {
    from <- to + 1
    to <- round(sample.size*j)
    test.index <- permut[from:to]
    a.beta <- object$beta
    for (k in 1:length(a.beta)) {
      a.beta[[k]][create.coords(test.index,min(sizes)^(1/data.dim),data.dim,(sizes[k]/min(sizes))^(1/data.dim))$pos] <- NA
    }
    a.tree <- wtthresh(beta=a.beta,control=object$control,weights=object$weights)
    for (l in 1:length(C.values)) {
      a.tree <- subtree.wtthresh(a.tree,C=C.values[l])
      w <- get.w.wtthresh(a.tree)
      for (k in 1:length(a.beta)) {
        test.pos <- create.coords(test.index,min(sizes)^(1/data.dim),data.dim,(sizes[k]/min(sizes))^(1/data.dim))$pos
        beta <- pmin(1e5,object$beta[[k]][test.pos])
        results[j,l] <- results[j,l] + sum(log(1+w[[k]][test.pos]*beta))
      }
    }
  }
  results <- results - results[,ncol(results)] 
  results.mean <- apply(results,2,mean)
  results.sd <- apply(results,2,sd)
  lower.bound <- (results.mean-sd.mult*results.sd)[which.max(results.mean)]
  if (plot) {
    results.range <- c(min(results.mean-results.sd),max(results.mean+results.sd))
    ns <- 1:length(results.mean)
    plot(ns,results.mean,xaxt="n",ylab="Loglikelihood from CV (+const)",type="o",xlab="C",ylim=results.range)
    segments(ns, results.mean - results.sd, ns, results.mean + results.sd)
    axis(1, at = ns, labels = c(format(C.values[-length(C.values)],digits=3),">1"))
    mtext("number of regions", side = 3, line = 3)
    axis(3, at = ns, labels = as.character(get.sizes(object)))
    abline(h=lower.bound,lty=3)
  }
  subtree.wtthresh(object,C.values[max(which(results.mean>=lower.bound))])
}

thresh.wtthresh <- function(object,data,C,postmed=TRUE) {
  if (missing(data) && !is.null(object$data))
    data <- object$data 
  if (postmed) {
    w <- get.w.wtthresh(object,C)
    if (length(data)!=length(w))
      stop("Data does not have the right length.")
    for (i in 1:length(data)) 
      data[[i]] <- postmed.laplace(data[[i]],w[[i]])
  } else {
    t <- get.t.wtthresh(object,C)
    if (length(data)!=length(t))
      stop("Data does not have the right length.")
    for (i in 1:length(data)) 
      data[[i]][abs(data[[i]])<t[[i]]] <- 0
  }
  data
}


########################### UTILITY FUNCTIONS

extract.coefficients.imwd <- function(object,start.level=5) {
  if (start.level>=object$nlevels)
    return(list())
  types <- c("CD","DD","DC")
  result <- list()
  for (level in start.level:object$nlevels) 
    for (type in types) {
      nm <- lt.to.name(level=level-1,type)
      result[[nm]] <- array(object[[nm]],dim=rep(2^(level-1),2))
    }
  result
}

estimate.sdev.imwd <- function(object,dev=mad) {
  types <- c("CD","DD","DC")
  mads <- c()
  for (type in types) {
    nm <- lt.to.name(level=object$nlevels-1,type)
    elt <- object[[nm]]
    our.mads <- dev(elt)
    mads <- c(mads,as.vector(our.mads))
  }
  median(mads)
}
    
insert.coefficients.imwd <- function(object,update) {
  for (name in names(update))
    object[[name]] <- update[[name]]
  object
}

extract.coefficients.wd <- function(object,start.level=5) {
  if (start.level>=object$nlevels)
    return(list())
  result <- list()
  for (level in start.level:object$nlevels) {
    foo <- accessD(object,level=level-1)
    nm <- as.character(level)
    result[[nm]] <- array(foo,dim=length(foo))
  }
  result
}

estimate.sdev.wd <- function(object,dev=mad) {
  elt <- accessD(object,level=object$nlevels-1)
  dev(elt)
}
    
insert.coefficients.wd <- function(object,update) {
  for (name in names(update))
    object <- putD(object,level=as.numeric(name)-1,update[[name]])
  object
}

extract.coefficients <- function(object,start.level=5)
  UseMethod("extract.coefficients")

estimate.sdev <- function(object,dev=mad)
UseMethod("estimate.sdev")

estimate.sdev.numeric <- function(object, block=8, dev=mad) {
  if (is.na(block))
    return(dev(object))
  block <- min(c(block,length(object)))
  n.blocks <- floor(length(object)/block)
  our.mads <- numeric(n.blocks)
  for (i in 1:n.blocks) {
    dta <- object[block*(i-1)+1:block]
    our.mads[i] <- dev(dta)
  }
  median(our.mads)
}

insert.coefficients <- function(object, update)
  UseMethod("insert.coefficients")

wavelet.treethresh <- function(object,sdev=NA,dev=mad,start.level=5,levelwise=FALSE,v=5,sd.mult=0.5,postmed=TRUE,...) {
  # If no sdev is provided estimate it
  if (is.na(sdev)) {
    sdev <- estimate.sdev(object,dev=dev)   
  }
  # Extract the coefficients
  coefs <- extract.coefficients(object,start.level=start.level)
  # Rescale the coefficients
  for (nm in names(coefs))
    coefs[[nm]] <- coefs[[nm]] / sdev
  if (!levelwise) {
    # JOINT THRESHOLDING
    # Compute the thresholding tree
    coefs.tree <- wtthresh(coefs,...)
    # Prune the tree
    coefs.pruned.tree <- prune.wtthresh(coefs.tree,v=v,sd.mult=sd.mult,plot=FALSE)
    # Threshold according to the pruned tree
    coefs.threshed <- thresh(coefs.pruned.tree,postmed=postmed)
  } else {
    # LEVELWISE THRESHOLDING
    coefs.threshed <- list()
    for (nm in names(coefs)) {
      coefs.tree <- treethresh(coefs[[nm]],...)
      coefs.pruned.tree <- prune(coefs.tree,v=v,sd.mult=sd.mult,plot=FALSE)
      coefs.threshed[[nm]] <- thresh(coefs.pruned.tree,postmed=postmed)
    }
  }     
  # Undo the resclaing
  for (nm in names(coefs))
    coefs.threshed[[nm]] <- coefs.threshed[[nm]] * sdev
  # Update coefficients
  insert.coefficients(object,coefs.threshed)
}