RegParallel: Standard regression functions in R enabled for parallel processing over large data-frames

Documented in glmParallel

glmParallel <- function(
  data,
  formula.list,
  FUN,
  variables,
  terms,
  startIndex,
  blocksize,
  blocks,
  APPLYFUN,
  conflevel,
  excludeTerms,
  excludeIntercept)
{

  ExpBeta <- l <- NULL

  # loop through and process each block of variants
  # with foreach, this loop is parallelised
  foreach(l = seq_len(blocks),
    .combine = rbind,
    .multicombine = TRUE,
    .inorder = FALSE,
    .packages = c('data.table', 'doParallel')) %dopar% {

    # first block - will be executed just once
    if (l==1) {
      message('Processing ', blocksize,
        ' formulae, batch ', l, ' of ', blocks)
      message('-- index1: 1; ', 'index2: ', (blocksize*l))

      # subset the data to only include variants that will be
      # processsed in this block
      df <- data[,c(
        which(colnames(data) %in% terms),
        startIndex + (seq_len(blocksize*l)))]

      # run the models
      models <- APPLYFUN(formula.list[seq_len(blocksize*l)],
        function(f) summary(FUN(formula = f, data = df))$coefficients)
      names(models) <- variables[seq_len(blocksize*l)]

      # detect failed models (return 'try' error)
      nullindices <- c(
        which(mapply(inherits, models, 'try-error'))
      )
      if (length(nullindices) > 1) {
        print(models[[nullindices[1]]][1])
        stop('\nOne or more models have failed. ',
          'Please recheck your data and model formula. ',
          'In particular, check that your variable names ',
          'are valid, i.e., no spaces, do not begin with ',
          'a number, no hyphens or other punctuation, et ',
          'cetera.')
      }

      # convert to data frames
      wObjects <- APPLYFUN(names(models),
        function(x) data.frame(
          rep(x, length(rownames(models[[x]]))),
          rownames(models[[x]]),
          models[[x]],
          row.names=rownames(models[[x]])))

      # remove intercept and / or specified terms from the output
      if (!is.null(excludeTerms) && excludeIntercept == TRUE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            paste(c('\\(Intercept\\)', excludeTerms), collapse='|'),
            rownames(x),
            invert=TRUE),])
      } else if (is.null(excludeTerms) && excludeIntercept == TRUE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            '\\(Intercept\\)',
            rownames(x),
            invert=TRUE),])
      } else if (!is.null(excludeTerms) && excludeIntercept == FALSE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            paste(c(excludeTerms), collapse='|'),
            rownames(x),
            invert=TRUE),])
      }

      #Convert the list into a data table
      wObjects <- data.table(rbindlist(wObjects), stringsAsFactors=FALSE)

      # set colnames
      # detect if the y variable in a glm is a continous variable
      # (model will contain t stat; otherwise, Z)
      if (any(grepl('^t\\.value$', colnames(wObjects))) == TRUE) {
        colnames(wObjects) <- c('Variable', 'Term', 'Beta',
          'StandardError', 't', 'P')
      } else {
        colnames(wObjects) <- c('Variable', 'Term', 'Beta',
          'StandardError', 'Z', 'P')
      }

      # sort out final data to return
      wObjects$Variable <- as.character(wObjects$Variable)
      wObjects$Beta <- as.numeric(as.character(wObjects$Beta))
      wObjects$StandardError <- as.numeric(
        as.character(wObjects$StandardError))
      if (any(grepl('^t$', colnames(wObjects))) == TRUE) {
        wObjects$t <- as.numeric(as.character(wObjects$t))
      } else {
        wObjects$Z <- as.numeric(as.character(wObjects$Z))
      }
      wObjects$P <- as.numeric(as.character(wObjects$P))

      # calculate OR and confidence intervals
      wObjects$OR <- exp(wObjects[,'Beta'])
      wObjects$ORlower <- wObjects$OR * exp(
        qnorm(((1 - (conflevel / 100)) / 2)) * wObjects$StandardError)
      wObjects$ORupper <- wObjects$OR * exp(
        qnorm(1 - ((1 - (conflevel / 100)) / 2)) * wObjects$StandardError)

      # change Inf and NaN to NA
      if (any(grepl('^t$', colnames(wObjects)) == TRUE)) {
        wObjects$t[is.infinite(wObjects$t)] <- NA
        wObjects$t[wObjects$t == 'NaN'] <- NA
      } else {
        wObjects$Z[is.infinite(wObjects$Z)] <- NA
        wObjects$Z[wObjects$Z == 'NaN'] <- NA
      }
      wObjects$P[is.infinite(wObjects$P)] <- NA
      wObjects$P[wObjects$P == 'NaN'] <- NA
      wObjects$OR[is.infinite(wObjects$OR)] <- NA
      wObjects$OR[wObjects$OR == 'NaN'] <- NA
      wObjects$ORlower[is.infinite(wObjects$ORlower)] <- NA
      wObjects$ORlower[wObjects$ORlower == 'NaN'] <- NA
      wObjects$ORupper[is.infinite(wObjects$ORupper)] <- NA
      wObjects$ORupper[wObjects$ORupper == 'NaN'] <- NA

      return(wObjects)
    }

    # final block - will be executed just once
    if (l==blocks) {
      message('Processing final batch ', l, ' of ',
        blocks)
      message('-- index1: ', (1+(blocksize*(l-1))), '; ',
        'index2: ', length(formula.list))

      # subset the data to only include variants that will be
      # processsed in this block
      df <- data[,c(
        which(colnames(data) %in% terms),
        startIndex + (((1+(blocksize*(l-1)))):(length(formula.list))))]

      # run the models
      models <- APPLYFUN(
        formula.list[(1+(blocksize*(l-1))):length(formula.list)],
        function(f) summary(FUN(formula = f, data = df))$coefficients)
      names(models) <- variables[(1+(blocksize*(l-1))):length(formula.list)]

      # detect failed models (return 'try' error)
      nullindices <- c(
        which(mapply(inherits, models, 'try-error'))
      )
      if (length(nullindices) > 1) {
        print(models[[nullindices[1]]][1])
        stop('\nOne or more models have failed. ',
          'Please recheck your data and model formula. ',
          'In particular, check that your variable names ',
          'are valid, i.e., no spaces, do not begin with ',
          'a number, no hyphens or other punctuation, et ',
          'cetera.')
      }

      # convert to data frames
      wObjects <- APPLYFUN(names(models),
        function(x) data.frame(
          rep(x, length(rownames(models[[x]]))),
          rownames(models[[x]]),
          models[[x]],
          row.names=rownames(models[[x]])))

      # remove intercept and / or specified terms from the output
      if (!is.null(excludeTerms) && excludeIntercept == TRUE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            paste(c('\\(Intercept\\)', excludeTerms), collapse='|'),
            rownames(x),
            invert=TRUE),])
      } else if (is.null(excludeTerms) && excludeIntercept == TRUE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            '\\(Intercept\\)',
            rownames(x),
            invert=TRUE),])
      } else if (!is.null(excludeTerms) && excludeIntercept == FALSE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            paste(c(excludeTerms), collapse='|'),
            rownames(x),
            invert=TRUE),])
      }

      #Convert the list into a data table
      wObjects <- data.table(rbindlist(wObjects), stringsAsFactors=FALSE)

      # set colnames
      # detect if the y variable in a glm is a continous variable
      # (model will contain t stat; otherwise, Z)
      if (any(grepl('^t\\.value$', colnames(wObjects))) == TRUE) {
        colnames(wObjects) <- c('Variable', 'Term', 'Beta',
          'StandardError', 't', 'P')
      } else {
        colnames(wObjects) <- c('Variable', 'Term', 'Beta',
          'StandardError', 'Z', 'P')
      }

      # sort out final data to return
      wObjects$Variable <- as.character(wObjects$Variable)
      wObjects$Beta <- as.numeric(as.character(wObjects$Beta))
      wObjects$StandardError <- as.numeric(
        as.character(wObjects$StandardError))
      if (any(grepl('^t$', colnames(wObjects))) == TRUE) {
        wObjects$t <- as.numeric(as.character(wObjects$t))
      } else {
        wObjects$Z <- as.numeric(as.character(wObjects$Z))
      }
      wObjects$P <- as.numeric(as.character(wObjects$P))

      # calculate OR and confidence intervals
      wObjects$OR <- exp(wObjects[,'Beta'])
      wObjects$ORlower <- wObjects$OR * exp(
        qnorm(((1 - (conflevel / 100)) / 2)) * wObjects$StandardError)
      wObjects$ORupper <- wObjects$OR * exp(
        qnorm(1 - ((1 - (conflevel / 100)) / 2)) * wObjects$StandardError)

      # change Inf and NaN to NA
      if (any(grepl('^t$', colnames(wObjects)) == TRUE)) {
        wObjects$t[is.infinite(wObjects$t)] <- NA
        wObjects$t[wObjects$t == 'NaN'] <- NA
      } else {
        wObjects$Z[is.infinite(wObjects$Z)] <- NA
        wObjects$Z[wObjects$Z == 'NaN'] <- NA
      }
      wObjects$P[is.infinite(wObjects$P)] <- NA
      wObjects$P[wObjects$P == 'NaN'] <- NA
      wObjects$OR[is.infinite(wObjects$OR)] <- NA
      wObjects$OR[wObjects$OR == 'NaN'] <- NA
      wObjects$ORlower[is.infinite(wObjects$ORlower)] <- NA
      wObjects$ORlower[wObjects$ORlower == 'NaN'] <- NA
      wObjects$ORupper[is.infinite(wObjects$ORupper)] <- NA
      wObjects$ORupper[wObjects$ORupper == 'NaN'] <- NA

      return(wObjects)
    }

    # any other blocks - executed any number of times between first and final block
    if (l>1 && l<blocks) {
      message('Processing ', blocksize, ' formulae, batch ',
        l, ' of ', blocks)
      message('-- index1: ', (1+(blocksize*(l-1))), '; ',
        'index2: ', (blocksize*l))

      # subset the data to only include variants that will be
      # processsed in this block
      df <- data[,c(
        which(colnames(data) %in% terms),
        startIndex + ((1+(blocksize*(l-1))):(blocksize*l)))]

      # run the models
      models <- APPLYFUN(
        formula.list[(1+(blocksize*(l-1))):(blocksize*l)],
        function(f) summary(FUN(formula = f, data = df))$coefficients)
      names(models) <- variables[(1+(blocksize*(l-1))):(blocksize*l)]

      # detect failed models (return 'try' error)
      nullindices <- c(
        which(mapply(inherits, models, 'try-error'))
      )
      if (length(nullindices) > 1) {
        print(models[[nullindices[1]]][1])
        stop('\nOne or more models have failed. ',
          'Please recheck your data and model formula. ',
          'In particular, check that your variable names ',
          'are valid, i.e., no spaces, do not begin with ',
          'a number, no hyphens or other punctuation, et ',
          'cetera.')
      }

      # convert to data frames
      wObjects <- APPLYFUN(names(models),
        function(x) data.frame(
          rep(x, length(rownames(models[[x]]))),
          rownames(models[[x]]),
          models[[x]],
          row.names=rownames(models[[x]])))

      # remove intercept and / or specified terms from the output
      if (!is.null(excludeTerms) && excludeIntercept == TRUE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            paste(c('\\(Intercept\\)', excludeTerms), collapse='|'),
            rownames(x),
            invert=TRUE),])
      } else if (is.null(excludeTerms) && excludeIntercept == TRUE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            '\\(Intercept\\)',
            rownames(x),
            invert=TRUE),])
      } else if (!is.null(excludeTerms) && excludeIntercept == FALSE) {
        wObjects <- APPLYFUN(wObjects,
          function(x) x[grep(
            paste(c(excludeTerms), collapse='|'),
            rownames(x),
            invert=TRUE),])
      }

      #Convert the list into a data table
      wObjects <- data.table(rbindlist(wObjects), stringsAsFactors=FALSE)

      # set colnames
      # detect if the y variable in a glm is a continous variable
      # (model will contain t stat; otherwise, Z)
      if (any(grepl('^t\\.value$', colnames(wObjects))) == TRUE) {
        colnames(wObjects) <- c('Variable', 'Term', 'Beta',
          'StandardError', 't', 'P')
      } else {
        colnames(wObjects) <- c('Variable', 'Term', 'Beta',
          'StandardError', 'Z', 'P')
      }

      # sort out final data to return
      wObjects$Variable <- as.character(wObjects$Variable)
      wObjects$Beta <- as.numeric(as.character(wObjects$Beta))
      wObjects$StandardError <- as.numeric(
        as.character(wObjects$StandardError))
      if (any(grepl('^t$', colnames(wObjects))) == TRUE) {
        wObjects$t <- as.numeric(as.character(wObjects$t))
      } else {
        wObjects$Z <- as.numeric(as.character(wObjects$Z))
      }
      wObjects$P <- as.numeric(as.character(wObjects$P))

      # calculate OR and confidence intervals
      wObjects$OR <- exp(wObjects[,'Beta'])
      wObjects$ORlower <- wObjects$OR * exp(
        qnorm(((1 - (conflevel / 100)) / 2)) * wObjects$StandardError)
      wObjects$ORupper <- wObjects$OR * exp(
        qnorm(1 - ((1 - (conflevel / 100)) / 2)) * wObjects$StandardError)

      # change Inf and NaN to NA
      if (any(grepl('^t$', colnames(wObjects)) == TRUE)) {
        wObjects$t[is.infinite(wObjects$t)] <- NA
        wObjects$t[wObjects$t == 'NaN'] <- NA
      } else {
        wObjects$Z[is.infinite(wObjects$Z)] <- NA
        wObjects$Z[wObjects$Z == 'NaN'] <- NA
      }
      wObjects$P[is.infinite(wObjects$P)] <- NA
      wObjects$P[wObjects$P == 'NaN'] <- NA
      wObjects$OR[is.infinite(wObjects$OR)] <- NA
      wObjects$OR[wObjects$OR == 'NaN'] <- NA
      wObjects$ORlower[is.infinite(wObjects$ORlower)] <- NA
      wObjects$ORlower[wObjects$ORlower == 'NaN'] <- NA
      wObjects$ORupper[is.infinite(wObjects$ORupper)] <- NA
      wObjects$ORupper[wObjects$ORupper == 'NaN'] <- NA

      return(wObjects)
    }
  }
}
kevinblighe/RegParallel documentation built on Oct. 2, 2023, 2:55 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
kevinblighe/RegParallel
Standard regression functions in R enabled for parallel processing over large data-frames

R/glmParallel.R
In kevinblighe/RegParallel: Standard regression functions in R enabled for parallel processing over large data-frames

Defines functions glmParallel

Documented in glmParallel

R Package Documentation

Browse R Packages

We want your feedback!

kevinblighe/RegParallel Standard regression functions in R enabled for parallel processing over large data-frames

R/glmParallel.R In kevinblighe/RegParallel: Standard regression functions in R enabled for parallel processing over large data-frames

Defines functions glmParallel

Documented in glmParallel

R Package Documentation

Browse R Packages

We want your feedback!

kevinblighe/RegParallel
Standard regression functions in R enabled for parallel processing over large data-frames

R/glmParallel.R
In kevinblighe/RegParallel: Standard regression functions in R enabled for parallel processing over large data-frames