R/scan.R
In miraculix: Algebraic and Statistical Functions for Genetics

Documented in plot.scanning plot.scan.statistics print.scanning print.scan.statistics scanning scan.statistics summary.scanning summary.scan.statistics

## Authors 
## Martin Schlather, schlather@math.uni-mannheim.de
##
##
## Copyright (C) 2017 -- 2019 Martin Schlather
##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License
## as published by the Free Software Foundation; either version 3
## of the License, or (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.  

summary.scanning <- function(object, ...) {
  if (length(object$significant.areas) == 0 ||
       length(object$significant.areas) == 0 ) return(NULL)
  if (is.list(object$significant.areas)) {
    if (length(object$significant.areas) == 1) {
      return (object$significant.areas[[1]])
    } else {
      warning("not programmed yet") ## tabelle mit allen werten nebeneinander, sodass man sieht welche intervalle aufgesplittet oder verkleinert wurden 
      return(object$significant.areas)
    }
  } else {
    return (object$significant.areas)
  }
}

print.scanning <- function(x, ...) {
  str(x)#
}

plot.scanning <- function(x, ..., lwd=3, col=1:8) {
  graphics::plot(x$pos, x$freq, pch=".")
  if (length(x$pos) > 50000) {
    w <- windower(data=data.frame("-", x$pos, x$pos + 1, x$freq), length=5000)
    points(0.5*(w[,1] + w[,2]), w[,3], col="lightblue", pch=20, cex=0.5)
  }
  if (length(x$significant.areas) > 0) {
    if (is.list(x$significant.areas)) {
      for (i in 1:length(x$significant.areas)) {
        if (ncol(x$significant.areas[[i]]) > 0) {
          colour <- col[1 + (i) %% length(col)]
          for (j in 1:nrow(x$significant.areas[[i]]))
            lines(x$significant.areas[[i]][j, 1:2], c(0,0), col=colour, lwd=lwd)
        } else {
          message("no significant values for ",
                  if (x$was.null.threshold) "alpha" else "threshold",
                  "[",  i, "] = ",
                  if (x$was.null.threshold) x$alpha[i] else x$threshold[i])
        }
      }
    } else {      
      colour <- col[1 + (x$significant.areas[, 3]-1) %% length(col)]
      for (i in 1:nrow(x$significant.areas)) {
        lines(x$significant.areas[i, 1:2],
              c(0,0) + (x$significant.areas[i, 3] - 1) / 20,
              col=colour[i], lwd=lwd)
      }
    }
  } else {
    message("No significant area found at all.")
  }
  #points(x$pos, x$freq, pch=".")
}


scanning <- function(pos, freq, file, tuningUnits, alpha = 0.1,
                     coarsening = 1,
                     minscans = 0,
                     maxscans = 0,
                     sumscan = FALSE,  perSNP = TRUE,                     
                     colname, n,                 
                     threshold = NULL, collect=!old.def,
                     old.def = FALSE,
                     max.intervals = length(alpha) * 100000,
                     max.basepair.distance = 50000,
                     exclude.negative.at.boundary = TRUE,
                     maximum = TRUE,
                     mean.freq, sd.freq, mean.n) { 
  stopifnot(all(alpha > 0), all(alpha < 1)) ## Gleichheit nicht zulassen
  stopifnot(xor(missing(pos) && missing(freq), missing(file)))

  verbose <- RFoptions(GETOPTIONS="basic")$verbose
    
  if (missing(file)) {
    if (missing(colname)) colname <- "HeterAB"
  } else {
    ext <- strsplit(file, "\\.")[[1]]
    ext <- ext[length(ext)]
    if (ext == "rda") {
      load (file)
    } else {
      if (missing(colname)) {
        colname <- if (ext=="bed") c(pos=3, freq=4, n=5)
        else "HeterAB"
      }    
      if (is.character(colname)) {
        z <- read.table(file)[c("Pos", colname)]
        freq <- z[[colname]]
        pos <- z$Pos
      } else {
        stopifnot(is.numeric(colname))
        stopifnot(missing(n))
        z <- as.matrix(read.table(file)[colname])     
        pos <- z[, which(colname["pos"] == colname)]
        if (ext=="bed") {
          freq <- -z[, which(colname["freq"] == colname)]
        } else freq <- z[, which(colname["freq"] == colname)]
        n <- z[, which(colname["n"] == colname)]
       #
        if (FALSE) {
          idx <- pos > 3.5e7 & pos < 5e7
          freq <- freq[idx]
          pos <- pos[idx]
          n <- n[idx]
          save (file="data/genetics.rda", freq, pos, n)
        }
      }
      rm("z")
    }
    if (verbose) message(length(freq), " data read from file '", file, "'.")
  }

  coarsening <- as.integer(coarsening)
  if (coarsening > 1) {
    w <- windower(data=data.frame("-", pos, pos + 1, freq), length=coarsening)
    freq <- w[ , 3]
    idx <- !is.na(freq)
    if (!missing(n)) {
      w <- windower(data=data.frame("-", pos, pos + 1, n), length=coarsening)
      n <- w[idx, 3]
    }
    freq <- freq[idx]
    ##  pos <- w[idx, 1]
    pos <- 0.5 * (w[idx, 1] + w[idx, 2])
 }

  N <- length(pos)
  Nalpha <- if (is.null.threshold <- is.null(threshold))
              length(alpha) else length(threshold)
  
  stopifnot(all(abs(pos - round(pos)) < 1e-8))
  freq <- as.double(freq)
  pos <- as.integer(round(pos))
  perSNP <- as.integer(perSNP)
  minscans <- as.integer(minscans)
  maxscans <- as.integer(maxscans)

 
  if (missing(n)) n <- round(1 / min(freq[freq != 0])) ##
  if (length(n) == 1 && !all(abs(n * freq - round(n * freq)) < 1e-8)) {
    msg <- paste("Frequencies are not multiples of 1/n.",
                 if (length(n)==1) paste("Here, n=", n, ".", sep=""))
    stop(msg) # else warning(msg)
  }

  if (missing(mean.freq)) {
    mean.freq <- mean(freq)
    if (verbose) cat("mean freq:", mean.freq, "\n")
  }
  if (missing(sd.freq)) {
    sd.freq <- sd(freq)
    if (verbose) cat("sd freq:", sd.freq, "\n")
  }
  if (missing(mean.n)) {
    mean.n <- mean(n)
    if (verbose) cat("mean n:", mean.n, "\n")
  }
  freq <- (freq - mean.freq) / sd.freq
  freq <- freq - tuningUnits * mean.n / n
  mn <- mean(freq)
  eps <- 1e-16
  if (mn > -eps) {
    msg <- "The mean of the shifted frequencies should "
    deltatU <- sum(freq) / sum(1/n)
    deltatU <- deltatU +  10^{-log(deltatU) / log(10) - 2}
    if (mn > eps) stop(msg, "not be positive (but it is ",
                       formatC(mn, digits=4),
                       "; so 'tuningUnits' should be at least ",
                       format(deltatU, digits=4), " larger)")
    warning(msg, "be negative.")
  }

  if (perSNP) {
    v <- var(freq)
    nn <- N
  } else {
    stop("not programmed yet")
    v <- var(freq)
    nn <- diff(range(pos))
  }

 
  if (tuningUnits == 0.0) 
    KabluchkoSpodarev <- sqrt(nn * v) * qnorm(alpha, lower.tail=FALSE)
  else {
    KabluchkoSpodarev <- -999999 # not programmed yet
    
    if (FALSE) {
      u <- unique(freq)    
      f <- numeric(length(u))
      for (i in 1:length(u)) f[i] <- sum(freq == u[i])
      stopifnot(sum(f) == length(freq))
      logmoment <- function(theta) log(sum(f  * exp(theta * u)))
      theta.star <- uniroot(logmoment, interval=c(1/max(u), min(-log(f) / u)))
                                        # remark 4.1, (4.11)
      H.star <- log(H) - log(a.star * theta.star)
      KabluchkoSpodarev <- (log(nn) + H.star - log(-log(1-alpha))) / theta.star
    }
  }
  
  if (is.null(threshold))
    threshold <- as.double(KabluchkoSpodarev)
  else {
    threshold <- as.double(threshold)
   # if (FALSE && !is.null(alpha))
   #   cat("threshold by Kabluchko & Spodarev:", KabluchkoSpodarev, "\n")
  }

  above.threshold <- integer(if (sumscan) Nalpha * N else Nalpha)
  
  maximum <- double(1)
  if (old.def) {
    if (sumscan) 
      .Call(C_sumscan,
            pos, N, freq, minscans, maxscans, threshold, Nalpha, perSNP,
            above.threshold, maximum)
    else
      .Call(C_scan,
            pos, N, freq, minscans, maxscans, threshold, Nalpha, perSNP,
            above.threshold, maximum)
  } else {
    signif.areas <-
      .Call(C_collect_scan2,  pos, N, freq, minscans, maxscans,
            threshold, Nalpha,
            perSNP, as.integer(max.intervals),
            as.integer(max.basepair.distance),
            as.integer(exclude.negative.at.boundary),
            above.threshold, maximum)
  }
  
  if (any(above.threshold < 0))
    above.threshold <- rep(Inf, length(above.threshold))
  any.above.threshold <- any(above.threshold > 0)
   
  if (sumscan) {
    dim(above.threshold) <- c(N, Nalpha)
  }

  l <- list(if (!missing(file)) file=file,
            pos=pos, freq=freq,
            tuningUnits = tuningUnits,
            alpha=alpha,
            n = n,
            minscans=minscans,
            maxscans=maxscans,
            sumscan = sumscan,
            perSNP=perSNP,
            was.null.threshold = is.null.threshold,
            threshold=threshold,
            any.above.threshold =  any.above.threshold,
            max.basepair.distance = max.basepair.distance,
            exclude.negative.at.boundary = exclude.negative.at.boundary,
            # output
            above.threshold=above.threshold,
            maximum=maximum)
  if (!collect || sumscan) {
    if (!old.def) l$significant.areas <- t(signif.areas)
      class(l) <- "scanning"
    return(invisible(l))
  }  
  if (verbose) cat(if (old.def) "First scan done.\n" else "Scanning done.\n")
  
  significant.areas <- areas <- values <- NULL
  if (any.above.threshold) {
    ## Mengen vereinigen
    if (old.def) {
      n.areas <- max(above.threshold)
      if (n.areas > 1e8) stop(n.areas, " areas are above the threshold. This is too much.")
      areas <- integer(n.areas * 3) # start, end, thresholdlevel
      ## signif.areas <- integer(n.areas * 3) # start, end, thresholdlevel
      values <- double(n.areas)      # value
      
      signif.areas <-
        .Call(C_collect_scan,  pos, N, freq,
              minscans, maxscans, threshold, Nalpha,
              perSNP, areas, values)
      dim(areas) <- c(3, n.areas)
      signif.areas <- signif.areas[, signif.areas[3, ] != 0, drop=FALSE]   
      if (verbose) cat("Second scan done.\n")
    }
    
    significant.areas <- list()
    totallength <- numeric(Nalpha)
    START <- 1
    END <- 2
    for (i in 1:Nalpha) {
      s <- signif.areas[, signif.areas[3, ] >= i, drop=FALSE]
      j <- 1
      while (j <= ncol(s)) {
        k <- j + 1
        while (k <= ncol(s)) {
          if ((s[START, j] <= s[START, k] && s[START, k] <= s[END, j]) ||
              (s[START, k] <= s[START, j] && s[START, j] <= s[END, k])) {
            if (s[START, k] < s[START, j]) s[START, j] <- s[START, k]
            if (s[END, k] > s[END, j]) s[END, j] <- s[END, k]
            s <- s[, -k, drop=FALSE]
          } else k <- k + 1
        }
        j <- j + 1
      }
      s[3, ] <- i
      significant.areas[[i]] <- t(s)
      totallength[i] <- diff(colSums(significant.areas[[i]])[1:2])
    }
    Message <- paste("Null hypothesis is rejected at ",
                     if (is.null.threshold) paste("alpha =", alpha[1])
                     else  paste("threshold =",formatC(threshold[1], digits=4)),
                     " that the data are homogeneous", sep="")
  } else {
    totallength <- rep(0, Nalpha)
    p <- if (is.null(threshold))
      formatC(pnorm(maximum / sqrt(nn * v), lower.tail=FALSE), digits=4)
    else p <- "'unknown'"
     Message <- paste("Null hypothesis is accepted at ",
                     if (is.null.threshold) paste("alpha =", alpha[1])
                     else paste("threshold =", formatC(threshold[1], digits=4)),
                      " that the data are homogeneous (p=",p, ")", sep="")
  }

  stopifnot(length(totallength) == Nalpha)
  l <- c(l, list(areas = areas, values=values,
                 significant.areas = significant.areas,
                 totallength = totallength,
                 Message = Message))
  class(l) <- "scanning"
  return(invisible(l))
}


print.scan.statistics <- function(x, ...) {
  str(x) #
}

summary.scan.statistics <- function(object, ...) {
  cat(object$Message, "\n")  
}


plot.scan.statistics <- function(x, ..., pch="|", col=2:8) {
  txt <- ""
  alpha <- x$alpha
  Nalpha <- length(alpha)
  ylim <- if (hasArg("ylim")) list(...)$ylim else range(alpha, x$orig.freq)
  colname <- x$colname
  if (is.numeric(colname)) colname <- paste("column", colname["freq"])
  graphics::plot(x$pos, x$orig.freq, pch=pch, cex=0.75, ylim=ylim, main=x$file,
       xlab="positions", ylab=colname, col="black")
  if (x$sumscan) {
    above <- t(x$above.threshold)
    maxi <- apply(above, 1, max)
    txt <- paste("mx.cnts=", paste(maxi[1], collapse=","), "  ", sep="")
    above <- t(above / maxi)
    graphics::matplot(x$pos, above, type="l", add=TRUE, col=col);
    for (i in 1:Nalpha) {
      cur <- col[(i-1) %% length(col) + 1]
      points(x$pos, alpha[i] * (above[, i] > 0), col=cur, pch=20, cex=0.65) 
    }      
  } else { ### not sumscan
    for (i in 1:Nalpha) {
      xx <- x$significant.areas[[i]]
      if (length(xx) == 0) next;
      cur <- col[(i-1) %% length(col) + 1]
      for (j in 1:nrow(xx))
        if (xx[j, 1] == xx[j, 2]) points(xx[j, 1], alpha[i], col=cur, pch=20)
        else lines(c(xx[j, 1], xx[j, 2]), rep(alpha[i], 2), col=cur, lwd=3)
    }
  }
  
  if (x$minscans != 0) txt <- paste(txt, "minscans=", x$maxscans, "  ", sep="")
  if (x$maxscans != 0) txt <- paste(txt, "maxscans=", x$maxscans, sep="")

  if (txt!="") text(x=0, y=0.4, labels=txt, adj=c(0,0), col=6)
  title(sub=x$Message, col.sub="darkred")
  legend(x="topright",
         legend=alpha, col=col, text.col=col, lwd=2, adj=c(0, 0),
         title=expression(alpha))
}



scan.statistics <-
  function(file, tuningUnits, alpha=c(0.05, 0.01), repet=1000,
           coarsening = 1,
           minscans=0, maxscans=0,
           sumscan = FALSE,
           perSNP=TRUE, # otherwise per "basepair"
           colname, n, return.simu = FALSE,
           debug = FALSE,
           formula = FALSE, 
           old.def=FALSE,
           max.intervals = length(alpha) * 100000,
           max.basepair.distance = 50000,
           exclude.negative.at.boundary = TRUE,
           pos, freq) {
  if (!perSNP) stop("not programmed yet") ## simu muss auf pktprozessen beruhen
  if (formula) stop("Kabluchko-Spodarev formulae not programmed yet")
  if (1/min(alpha) > repet)
    stop("value of 'repet' too small to identify the 'alpha' level(s)")

  if (hasArg("file")) {
    if (hasArg("pos") || hasArg("freq") || hasArg("colname") || hasArg("n"))
      stop("either the file name or 'pos', 'freq', 'colname', 'n' must be given.")
    if (is.character(file)) {
      ext <- strsplit(file, "\\.")[[1]]
      ext <- ext[length(ext)]
      if (ext == "rda") {
        load (file)
      }
      else {
        if (missing(colname)) {
          colname <- if (ext=="bed")  c(pos=3, freq=4, n=5) else "HeterAB"
        }
        if (is.character(colname)) {
          z <- read.table(file)[c("Pos", colname)]
          freq <- z[[colname]]
          pos <- z$Pos
        } else {
          stopifnot(is.numeric(colname))
          stopifnot(missing(n))
          z <- as.matrix(read.table(file)[colname])
          pos <- z[, which(colname["pos"] == colname)]
          if (ext=="bed") {
            freq <- 1 - z[, which(colname["freq"] == colname)]
          } else freq <- z[, which(colname["freq"] == colname)]
          n <- z[, which(colname["n"] == colname)]  
        }
        rm("z")
      }
    } else {
      for (i in c('pos', 'freq', 'colname', 'n' ))
        assign(i, file[[i]])
    }
  }

  coarsening <- as.integer(coarsening)
  if (coarsening > 1) {
    w <- windower(data=data.frame("-", pos, pos + 1, freq), length=coarsening)
    freq <- w[ , 3]
    idx <- !is.na(freq)
    if (!missing(n)) {
      w <- windower(data=data.frame("-", pos, pos + 1, n), length=coarsening)
      n <- w[idx, 3]
    }
    freq <- freq[idx]
    ##  pos <- w[idx, 1]
    pos <- 0.5 * (w[idx, 1] + w[idx, 2])
  }

 
  N <- length(pos)
  Nalpha <- length(alpha)
  
  if (return.simu) ret.simu <- matrix(NA, nrow=N, ncol=repet)
  if (missing(n)) n <- round(1 / min(freq[freq != 0])) ##
  if (length(n) == 1) stopifnot(all(abs(n * freq - round(n * freq)) < 1e-8))
  
  ylim <- range(freq, alpha)
 
  maxima <- probab <- NULL
  if (debug && hasArg("file") &&
      file.exists(filename <- paste(file, ".max.rda", sep="")))
    load (filename)
  else if (is.null(formula) || !formula) {
    probab <- sum(freq) / N ## sum(n * freq) / (N * n) // counts
    maxima <- double(repet)
    verbose <- RFoptions(GETOPTIONS="basic")$verbose
     for (i in 1:repet) {
      if (verbose) cat(i, "")
      ## simufreq <- rbinom(N, n, probab) / n
      simufreq <- sample(freq, replace=TRUE)
      if (return.simu) {
        ret.simu[, i] <- simufreq
        stopifnot(!any(is.na(ret.simu[,i])))
      }
      maxima[i] <-
        scanning(pos, simufreq, tuningUnits=tuningUnits,
                 minscans=minscans, maxscans=maxscans,
                 sumscan=FALSE, perSNP=perSNP,
                 alpha=0.999, colname=colname, n=n,
                 old.def = old.def,
                 max.intervals = max.intervals,
                 max.basepair.distance = max.basepair.distance,
                 exclude.negative.at.boundary = exclude.negative.at.boundary
                 )$maximum      
      if (debug == 2) {
        graphics::plot(pos, simufreq, pch=".", ylim=ylim,
                       main= if (hasArg("file")) file,
                       sub=i)
        readline(paste(sum(simufreq), "press return"))
      }
    }
    maxima <- sort(maxima)
    threshold <- as.double(maxima[ceiling((1-alpha) * repet)])
    if (debug && hasArg("file")) {
      save (file=filename, maxima, threshold); return()
    }
  }

  res <-
    scanning(pos, freq, tuningUnits=tuningUnits,
             minscans=minscans, maxscans=maxscans,
             sumscan=sumscan, perSNP=perSNP,
             alpha=alpha,
             colname=colname, 
             threshold = if (is.null(formula) || !formula) threshold else NULL,#
             n=n,
             collect=TRUE,
             old.def = old.def,
             max.intervals = max.intervals,
             max.basepair.distance = max.basepair.distance,
             exclude.negative.at.boundary = exclude.negative.at.boundary
             )

  if ( (is.null(formula) || !formula) && !res$any.above.threshold)  {
     p <- mean(res$maximum >= maxima)
     res$Message <- paste("Null hypothesis is accepted at alpha=", alpha[1],
                          " that the data are homogeneous (p =",
                          formatC(p, digits=4), ")", sep="")
  }

  res <- c(res, list(file = if (hasArg("file")) file else NULL,
                     orig.freq = freq,
                     maxima = maxima, repet = repet,
                     colname = if (missing(colname)) NULL else colname ,
                     if (return.simu) ret.simu = ret.simu))
  class(res) <- "scan.statistics"

  return(res)
}
Any scripts or data that you put into this service are public.
miraculix documentation built on Sept. 22, 2021, 5:07 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
miraculix
Algebraic and Statistical Functions for Genetics

R/scan.R
In miraculix: Algebraic and Statistical Functions for Genetics

Defines functions scan.statistics plot.scan.statistics summary.scan.statistics print.scan.statistics scanning plot.scanning print.scanning summary.scanning

Documented in plot.scanning plot.scan.statistics print.scanning print.scan.statistics scanning scan.statistics summary.scanning summary.scan.statistics

Try the miraculix package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

miraculix Algebraic and Statistical Functions for Genetics

R/scan.R In miraculix: Algebraic and Statistical Functions for Genetics

Defines functions scan.statistics plot.scan.statistics summary.scan.statistics print.scan.statistics scanning plot.scanning print.scanning summary.scanning

Documented in plot.scanning plot.scan.statistics print.scanning print.scan.statistics scanning scan.statistics summary.scanning summary.scan.statistics

Try the miraculix package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

miraculix
Algebraic and Statistical Functions for Genetics

R/scan.R
In miraculix: Algebraic and Statistical Functions for Genetics