R/newfuncs.R

Defines functions tableRanks tableMA2 tableSpikeInSD tableLS tableOverallSNR affycomp.figure1b affycomp.figure5e affycomp.compfig5e affycomp.figure5d affycomp.compfig5d affycomp.figure5c affycomp.compfig5c affycomp.compfig5cde affycomp.figure2b affycomp.compfig2b affycomp.figure4c affycomp.compfig4c assessLS assessSpikeInSD assessSpikeIn2 remove.hgu133a.xhyb

Documented in affycomp.compfig2b affycomp.compfig4c affycomp.compfig5c affycomp.compfig5cde affycomp.compfig5d affycomp.compfig5e affycomp.figure1b affycomp.figure2b affycomp.figure4c affycomp.figure5c affycomp.figure5d affycomp.figure5e assessLS assessSpikeIn2 assessSpikeInSD remove.hgu133a.xhyb tableLS tableMA2 tableOverallSNR tableRanks tableSpikeInSD

######
##ASSESSMENTS
#######
remove.hgu133a.xhyb <- function(s,bp=c("200","150","100")){
  bp <- match.arg(bp)
  bp <- paste("bp",bp,sep="")
  hgu133a.spikein.xhyb <- getData("hgu133a.spikein.xhyb")
  out <- match(hgu133a.spikein.xhyb[[bp]],featureNames(s))

  if(any(is.na(out))) warning("No probesets removed. Make sure you are using the hgu133a spikein ExpressionSet or that you didnt already remove the xbybs.")
  else s <- s[-out,]
  return(s)
}

assessSpikeIn2 <- function(s,method.name=NULL,verbose=TRUE){
  if(ncol(exprs(s))==59){
    s <- s[,c(1:13,17,21:33,37)]
    cat("Using only a subset of the spike in data\n")
  }
  if(verbose) cat("Performing 3 assessments that will take a few seconds")
  tmp1 <- assessSpikeInSD(s,method.name=method.name)
  if(verbose) cat(".")
  tmp2 <- assessLS(s,method.name=method.name)
  if(verbose) cat(".")
  tmp3 <- assessMA2(s,method.name=method.name)
  if(verbose) cat(".\n")
  return(list(MA2=tmp3,SpikeInSD=tmp1,LS=tmp2,what="SpikeIn2",
              method.name=method.name))
}

assessSpikeInSD <- function(exprset,method.name=NULL,span=1/3){
  require("splines",quietly = TRUE)
  genenames <- colnames(pData(exprset))
  spikein <-match(genenames,featureNames(exprset))
  y <- esApply(exprset[-spikein,],1,sd)
  x <- esApply(exprset[-spikein,],1,mean)
  smooth1 <- loess(y~x,span=span,family="gaussian",degree=1)
  x2 <- sort(x)[seq(1,length(x),length=100)]
  y2 <- smooth1$fitted[order(x)][seq(1,length(x),length=100)]

  list(x=x,y=y,xsmooth=x2,ysmooth=y2,loess=smooth1,method.name=method.name,what="SpikeInSD")
}

assessLS <-  function(exprset,method.name=NULL){
  e <- exprs(exprset)
  pdata <- pData(exprset)
  genenames <- colnames(pdata)
  y <- as.vector(t(e[match(genenames,featureNames(exprset)),]))
  names(y) <- rep(colnames(pdata),rep(nrow(pdata),ncol(pdata)))
  x <- log2(as.vector(as.matrix(pdata)))
  names(x) <- names(y)
  fit1 <- lm(y~x,subset=x>-Inf)
  tmp <- tapply(y,x,mean) ##this is the line we plot

  nc <- unique(x)
  localslope <- rep(0,length(nc)-1)
  localr2 <- rep(0,length(nc)-1)
  for(i in 1:(length(nc)-1)){
    Index1 <- which(x==nc[i])
    Index2 <- which(x==nc[i+1])
    localslope[i] <- mean(y[Index2])-mean(y[Index1])
    localr2[i]<-cor(c(rep(nc[i+1],length(Index2)),
                      rep(nc[i],length(Index1))),
                    c(y[Index2],y[Index1]))^2
  }

  tmplist <- list()
  tmplist[[1]]   <- which(x <= 1)
  tmplist[[2]]  <- which(x >= 2 & x <= 5)
  tmplist[[3]]   <- which(x>=6)

  threeslopes <- matrix(0,length(tmplist),2) #2 columns: slope and R^2
  for(i in 1:length(tmplist)){
    xx <- x[tmplist[[i]]]
    yy <- y[tmplist[[i]]]
    tmpfit <-lm(yy~xx,subset=xx>-Inf)
    threeslopes[i,] <- c(tmpfit$coef[2],summary(tmpfit)$r.squared)
  }

  list(slope=fit1$coef[2],R2=summary(fit1)$r.squared,
       plotx=x,ploty=y,linex=names(tmp),liney=as.numeric(tmp),
       localslopes=localslope,localr2s=localr2,method.name=method.name,
       low.slope=threeslopes[1,1],
       med.slope=threeslopes[2,1],
       high.slope=threeslopes[3,1],
       low.R2=threeslopes[1,2],
       med.R2=threeslopes[2,2],
       high.R2=threeslopes[3,2],
       what="LS")
}

assessMA2 <- function (exprset, method.name = NULL) 
{
    mat <- exprs(exprset)
    if (ncol(mat) == 28) {
        WHICHSPIKEIN <- "HGU95A"
        NCOMP <- 91 * 2
        I <- c(1:13, 15:27)
    }
    else {
        if (ncol(mat) == 42) {
            WHICHSPIKEIN <- "HGU133A"
            NCOMP <- 91 * 3
            I <- c(1:13, 15:27, 29:41)
        }
        else stop("Not the right number of columns in expression matrix\n")
    }
    pdata <- pData(exprset)
    genenames <- colnames(pdata)
    spikein <- match(genenames, featureNames(exprset))
    quants <- matrix(0, nrow(mat) - length(spikein), NCOMP)
    m <- matrix(0, nrow(mat), NCOMP)
    a <- matrix(0, nrow(mat), NCOMP)
    intended <- matrix(0, length(spikein), NCOMP)
    denom <- matrix(0, length(spikein), NCOMP)
    num <- matrix(0, length(spikein), NCOMP)
    count <- 0
    for (i in I) {
        for (j in (i + 1):(14 * ceiling(i/14))) {
            count <- count + 1
            fc <- mat[, j] - mat[, i]
            quants[, count] <- sort(fc[-spikein])
            a[, count] <- (mat[, i] + mat[, j])/2
            m[, count] <- fc
            num[, count] <- as.numeric(pdata[j, ])
            denom[, count] <- as.numeric(pdata[i, ])
            intended[, count] <- log2(as.numeric(pdata[j, ])/as.numeric(pdata[i, ]))
        }
    }
    set.seed(12345)
    flip = which(sample(c(FALSE, TRUE), NCOMP, replace = T))
    m[, flip] = -m[, flip]
    intended[, flip] = -intended[, flip]
    N2D = num[, flip]
    num[, flip] = denom[, flip]
    denom[, flip] = N2D
    Index <- spikein

    lowIndex <- which(pmax(as.vector(num),as.vector(denom)) <= 2^2  &
                         abs(log2(as.vector(num)/as.vector(denom)))<=1)
    
    medIndex <- which(pmin(as.vector(num),as.vector(denom)) >= 2^2 &
                         pmax(as.vector(num),as.vector(denom)) <= 2^6 &
                         abs(log2(as.vector(num)/as.vector(denom)))<=1)
    
    highIndex <- which(pmin(as.vector(num),as.vector(denom))>=2^6&
                       abs(log2(as.vector(num)/as.vector(denom)))<=1 )

        
    spikes <- as.vector(m[Index, ])
    nulls <- apply(quants, 1, median)
    tp.low <- sort(abs(spikes[lowIndex]), decreasing = TRUE)
    fp.low <- sapply(tp.low, function(k) sum(abs(nulls) >= k))
    tp.low <- seq(along = tp.low)/length(tp.low)
    tp.med <- sort(abs(spikes[medIndex]), decreasing = TRUE)
    fp.med <- sapply(tp.med, function(k) sum(abs(nulls) >= k))
    tp.med <- seq(along = tp.med)/length(tp.med)
    tp.high <- sort(abs(spikes[highIndex]), decreasing = TRUE)
    fp.high <- sapply(tp.high, function(k) sum(abs(nulls) >= 
        k))
    tp.high <- seq(along = tp.high)/length(tp.high)
    rownames(intended) <- genenames
    SUB <- sample(1:length(m[-Index, ]), length(nulls))
    m <- c(m[Index, ], m[-Index, ][SUB])
    a <- c(a[Index, ], a[-Index, ][SUB])
    names(m)<- c(rep(rownames(mat)[Index], NCOMP), rep(NA,length(m)-length(Index)*NCOMP))
    names(a)<-names(m)
    return(list(qs = nulls, m = m, a = a, spikein = spikein, 
        intended = intended, num = num, denom = denom, fp.low = fp.low, 
        tp.low = tp.low, fp.med = fp.med, tp.med = tp.med, fp.high = fp.high, 
        tp.high = tp.high, method.name = method.name, what = "MA2"))
}


######
## FIGURES
#######
affycomp.compfig4c <- function(l,method.names=as.character(1:length(l)),
                               add.legend=TRUE,rotate=TRUE, main="Figure 4c"){
  xs <- c()
  ys <- c()

  for(i in seq(along=l)){
    if(rotate){
      x <- as.numeric(l[[i]]$linex)[-c(1,2)]
      y <- l[[i]]$localslopes[-1]-1 ##take out the ones related to -Inf
      ylab="Bias"
    }
    else{
      x <- as.numeric(l[[i]]$linex)
      y <- l[[i]]$liney
      In<- which(abs(x)<Inf)
      x<- x[In];y<-y[In]
     ylab<-"Observed log expression"
    }
    xs <- rbind(xs,x)
    ys <- rbind(ys,y)
  }
  if(length(l)==1){ xs <- matrix(xs,nrow=1);ys <- matrix(ys,nrow=1); }
  plot(0,0,type="n",ylim=range(ys),xlim=range(xs),ylab=ylab,
       xlab="Log nominal concentration",main=main,las=1)

  for(i in seq(along=l))
    lines(xs[i,],ys[i,],col=i+1,lty=i,lwd=3)

  if(add.legend){
    if(rotate){
      legend(max(xs)*.7,max(ys)*.95,method.names,lwd=2,col=seq(along=l)+1,lty=seq(along=l))
      abline(h=0)
    }
    else
      legend(min(xs),max(ys)*.95,method.names,lwd=2,col=seq(along=l)+1,lty=seq(along=l))
  }
}
affycomp.figure4c <- function(l,rotate=TRUE, main="Figure 4c")
  affycomp.compfig4c(list(l),method.names=l$method.name,add.legend=FALSE,
                     main=main,rotate=rotate)

affycomp.compfig2b <- function(l,method.names=as.character(1:length(l)),
                              add.legend=TRUE,main="Figure 2b"){
  xs <- c()
  ys <- c()
  for(i in seq(along=l)){
    x <- l[[i]]$xsmooth
    y <- l[[i]]$ysmooth
    xs <- rbind(xs,x)
    ys <- rbind(ys,y)
  }
  plot(0,0,type="n",ylim=range(ys),xlim=range(xs),xlab="Average log expression",ylab="Log expression SD", main=main,las=1)

  for(i in seq(along=l))
    lines(xs[i,],ys[i,],col=i+1,lty=i,lwd=3)

  if(add.legend)
    legend(max(xs)*.7,max(ys)*.95,method.names,lwd=2,col=seq(along=l)+1,lty=seq(along=l))
}
affycomp.figure2b <- function(l,main="Figure 2b")
  affycomp.compfig2b(list(l),method.name=l$method.name,add.legend=FALSE,
                     main=main)

affycomp.compfig5cde <- function(l,method.names=as.character(1:length(l)),
                                 add.legend=TRUE,main="Figure 5c",maxfp=100,
type=c("low","med","high")){
  type <- match.arg(type)
  tp.type <- paste("tp.",type,sep="")
  fp.type <- paste("fp.",type,sep="")
  N <- length(l)
  FP <- vector(mode="list",length=N)
  TP <- vector(mode="list",length=N)
  MIN <- 10
  MAX <- 0
  for(i in 1:N){
    x <- l[[i]][[fp.type]]
    y <- l[[i]][[tp.type]]
    Index <- x<=maxfp
    if(any(Index)){
      MIN <- min(MIN,min(y[Index]))
      MAX <- max(MAX,max(y[Index]))
      TP[[i]] <- c(0,y[Index])
      FP[[i]] <- c(0,x[Index])
    }
    else{ ##there are no true positives!
      TP[[i]] <- c(0,0)
      FP[[i]] <- c(0,maxfp)
      MIN <- 0
    }
  }
  XLIM <- c(0,maxfp)
  YLIM <- c(MIN,MAX)
  plot(1,1,xlab="False Positives",ylab="True Positives",type="n",xlim=XLIM,ylim=YLIM,main=main,las=1)
  for(i in 1:N)
    lines(FP[[i]],TP[[i]],col=i+1,lty=i,lwd=3)
  if(add.legend)
    legend(XLIM[1]+.6*(XLIM[2]-XLIM[1]),
           YLIM[1]+.3*(YLIM[2]-YLIM[1]),
           method.names,col=2:(N+1),lty=1:N,lwd=1)

}

affycomp.compfig5c <- function(l,method.names=as.character(1:length(l)),
                              add.legend=TRUE,main="Figure 5c",maxfp=100)
  affycomp.compfig5cde(l,method.names=method.names,add.legend=add.legend,
                       main=main,maxfp=100,type="low")
affycomp.figure5c <- function(l,main="Figure 5c",maxfp=100)
  affycomp.compfig5c(list(l),method.names=l$method.name,main=main,maxfp=maxfp,
                     add.legend=FALSE)

affycomp.compfig5d <- function(l,method.names=as.character(1:length(l)),
                               add.legend=TRUE,main="Figure 5d",maxfp=100)
  affycomp.compfig5cde(l,method.names=method.names,add.legend=add.legend,
                       main=main,maxfp=100,type="med")
affycomp.figure5d <- function(l,main="Figure 5d",maxfp=100)
  affycomp.compfig5d(list(l),method.names=l$method.name,main=main,maxfp=maxfp,
                     add.legend=FALSE)

affycomp.compfig5e <- function(l,method.names=as.character(1:length(l)),
                              add.legend=TRUE,main="Figure 5e",maxfp=100)
  affycomp.compfig5cde(l,method.names=method.names,add.legend=add.legend,
                        main=main,maxfp=100,type="high")
affycomp.figure5e <- function(l,main="Figure 5e",maxfp=100)
  affycomp.compfig5e(list(l),method.names=l$method.name,main=main,maxfp=maxfp,
                     add.legend=FALSE)


###MvA plot
affycomp.figure1b <- function(l,main="Figure 1b",xlim=NULL,ylim=NULL,cex=.85,
                              all=FALSE){
  x <- l$a
  y <- l$m
  Index <- which(!is.na(names(x)))
  xx <- as.vector(x[Index])
  yy <- as.vector(y[Index])
  x <- as.vector(x[-Index])
  y <- as.vector(y[-Index])

  if(all){
    fc <- as.character(l$intended)
    colors <- abs(l$intended)
    colors[colors>11] <- 11
    Colors <- rev((c(topo.colors(8)[1:6],rev(heat.colors(8))[4:8])))

    if(is.null(ylim))
      ylim <- range(c(y,yy),na.rm=TRUE,finite=TRUE)
    if(is.null(xlim))
      xlim <- range(c(x,xx),na.rm=TRUE,finite=TRUE)

    plot(x,y,pch=".",xlim=xlim,ylim=ylim,main=main,xlab="A",ylab="M",
       las=1)
    o <- abs(y)>1
    points(x[o],y[o],pch=".",col="red")
    o1 <- fc=="Inf"
    o2 <- fc=="-Inf"
    text(xx[!o1 & !o2],yy[!o1&!o2],fc[!o1 & !o2],col=Colors[colors[!o1 & !o2]])
    # HJ
    if(any(o1))
	text(xx[o1],yy[o1],expression(infinity),col="black")
    if(any(o2))
    	text(xx[o2],yy[o2],expression(-infinity),col="black")

  }
  else{
    num <- as.vector(l$num); denom <- as.vector(l$denom)
    tmplist <- list()

    tmplist[[1]] <- which(pmax(as.vector(num),as.vector(denom)) <= 2^1)
        

    tmplist[[2]] <- which(pmin(as.vector(num),as.vector(denom)) >= 2^2 &
                         pmax(as.vector(num),as.vector(denom)) <= 2^5 &
                         abs(log2(as.vector(num)/as.vector(denom)))<=2)
    
    tmplist[[3]] <- which(pmin(as.vector(num),as.vector(denom))>=2^6&
                       abs(log2(as.vector(num)/as.vector(denom)))<=2 )

    
    if(is.null(ylim))
      ylim <- range(c(y,yy[unlist(tmplist)]),na.rm=TRUE,finite=TRUE)
    if(is.null(xlim))
      xlim <- range(c(x,xx[unlist(tmplist)]),na.rm=TRUE,finite=TRUE)


    plot(x,y,pch=".",xlim=xlim,ylim=ylim,main=main,xlab="A",ylab="M",
         las=1)

    colors <- c("blue","darkgreen","red")
    FC <- as.character(l$intended)
    for(i in 1:length(tmplist)){
      fc <- FC[ tmplist[[i]] ]
      xxx <- xx[ tmplist[[i]] ]
      yyy <- yy[ tmplist[[i]] ]
      o1 <- fc=="Inf"
      o2 <- fc=="-Inf"
      text(xxx[!o1 & !o2],yyy[!o1 & !o2],fc[!o1 & !o2],col=colors[i],cex=cex)
      # HJ
      if(any(o1))
      	text(xxx[o1],yyy[o1],expression(infinity),col=colors[i],cex=cex)
      if(any(o2))
      	text(xxx[o2],yyy[o2],expression(-infinity),col=colors[i],cex=cex)
    }
    o <- abs(y)>1
    points((x)[o],(y)[o],pch=".",col="red")
  }

}


#########
## TABLES
#########

tableOverallSNR <- function(...,assessment.list=NULL,method.names=NULL,
                            ngenes=12626){
  if(is.null(assessment.list)) l<-list(...) else l <- assessment.list
  N <- length(l)

  if(is.null(method.names)){
    method.names <- vector(mode="character",len=N)
    for(i in 1:N){
      tmp <- l[[i]]$method.name
      if(is.null(tmp)) method.names[i] <- i else method.names[i] <- tmp
    }
  }

  results <- matrix(0,length(l),6)
  colnames(results) <- c("slope","R2","medianSD","Null FC IQR","Null FC 99.9%","Rank")
  rownames(results) <- method.names
  for(i in 1:N){
    results[i,1] <- l[[i]]$LS$slope
    results[i,2] <- l[[i]]$LS$R2
    results[i,3] <- median(l[[i]]$SpikeInSD$y)
    results[i,4] <-  IQR(l[[i]]$MA$qs)
    results[i,5] <- quantile(abs(l[[i]]$MA$qs),prob=c(0.999))
    qs <- l[[i]]$MA2$qs
    tmp <- abs(qs-results[i,1])
    tmp <- which(tmp==min(tmp))/length(qs)
    results[i,6] <- round(ngenes-tmp*ngenes)
  }
  return(results)
}

tableLS <- function(l,method.names=NULL){
  N <- length(l)
  if(is.null(method.names)) method.names <- 1:N
  results <- matrix(0,length(l[[1]]$localslopes)+6,N)
  colnames(results) <- method.names
  tmp <-  as.character((2^as.numeric(l[[1]]$linex)))
  rownames(results) <- c("low.slope","med.slope","high.slope",
                         "low.R2","med.R2","high.R2",
                         paste(tmp[-1],tmp[-length(tmp)],sep=":"))

  for(i in 1:N){
    results[1:6,i] <- unlist(l[[i]][10:15])
    results[-(1:6),i] <- l[[i]]$localslopes
  }
  return(results)
}

tableSpikeInSD <- function(l,method.names=NULL){
  N <- length(l)
  if(is.null(method.names)) method.names <- 1:N
  results <- matrix(0,4,N)
  colnames(results) <- method.names
  rownames(results) <- c("25% SD",
                         "Median SD",
                         "75% SD",
                         "99% SD")
  for(i in 1:N)
    results[,i] <- quantile(l[[i]]$y,prob=c(0.25,0.5,0.75,0.99))
  return(results)
}

tableMA2 <- function(l,method.names=NULL){
  N <- length(l)
  if(is.null(method.names)) method.names <- 1:N
  results <- matrix(0,7,N)
  colnames(results) <- method.names
  rownames(results) <- c("null log-fc IQR",
                         "null log-fc 99%",
                         "null log-fc 99.9%",
                         "low AUC",
                         "med AUC",
                         "high AUC",
                         "weighted avg AUC")

  myauc <- function(mat){
    y <- tapply(c(0,mat[,2]),c(0,mat[,1]),max)
    x <- as.numeric(names(y))
    y <- y[x<100]
    x <- x[x<100]
    return(mean(approx(x,y,0:100,method="constant",rule=2)$y))
  }
  for(i in 1:N){
    results[1,i] <- IQR(l[[i]]$qs)
    results[2:3,i] <- quantile(abs(l[[i]]$qs),prob=c(0.99,0.999))
    results[4,i] <- myauc(cbind(l[[i]]$fp.low,l[[i]]$tp.low))
    results[5,i] <- myauc(cbind(l[[i]]$fp.med,l[[i]]$tp.med))
    results[6,i] <- myauc(cbind(l[[i]]$fp.high,l[[i]]$tp.high))
    results[7,i] <- sum(c(0.75,0.24,0.01)*results[4:6,i])
  }
  return(results)
}
tableRanks <- function(...,assessment.list=NULL,method.names=NULL,
                            ngenes=12626,rank=TRUE){
  if(is.null(assessment.list)) l<-list(...) else l <- assessment.list
  N <- length(l)

  if(is.null(method.names)){
    method.names <- vector(mode="character",len=N)
    for(i in 1:N){
      tmp <- l[[i]]$method.name
      if(is.null(tmp)) method.names[i] <- i else method.names[i] <- tmp
    }
  }

  results <- matrix(0,length(l[[1]]$LS$localslopes),N)
  colnames(results) <- method.names
  tmp <-  as.character((2^as.numeric(l[[1]]$LS$linex)))
  rownames(results) <- paste(tmp[-1],tmp[-length(tmp)],sep=":")

  for(i in 1:N)
    results[,i] <- l[[i]]$LS$localslopes

  if(!rank) return(results)
  else{
    for(i in 1:N){
       qs <- l[[i]]$MA2$qs
       results[,i] <- sapply(results[,i],function(x){
         tmp <- abs(qs-x)
         tmp <- which(tmp==min(tmp))/length(qs)
         round(ngenes-tmp*ngenes)+1
       })
     }
    return(results)
  }
}

Try the affycomp package in your browser

Any scripts or data that you put into this service are public.

affycomp documentation built on Nov. 8, 2020, 5:55 p.m.