R/aggregate.R
In caroline: A Collection of Database, Data Structure, Visualization, and Utility Functions for R

Documented in addFactLevs bestBy groupBy leghead nerge regroup sstable

 
 .pasteUnique <- function(x, ...){
    paste(unique(x), ...)
 }
 
 #=c('sum','mean','var','sd','max','min','length','concat','none')

.aggregateDFrow <- function(x, funcs, clmns, distinct=FALSE, collapse=',', ...){

  #funcs <- match.arg(funcs, several.ok=TRUE)
   
  if(length(clmns)!= length(funcs))
    stop('clmns length does not equal funcs length')
  dfl <- list()
  for(i in seq(along=clmns)){
    args <- list(x[,clmns[i]])
    if(funcs[i]=='paste'){
      args <-  append(list(collapse=collapse), args)
      if(distinct)
        funcs[i] <- '.pasteUnique'
    }
    if(!funcs[i] %in% c('length','paste'))
      args <- append(args, list(...))
    dfl[[paste(clmns[i],funcs[i],sep='_')]] <- do.call(funcs[i], args)
  }
  df <- as.data.frame(dfl, stringsAsFactors=FALSE);

  return(df)  
}


.rowlist2df <- function(x)
 do.call(rbind.data.frame, x)


bestBy <- function(df, by, best, clmns=names(df), inverse=FALSE, sql=FALSE){
  if(!is.character(best))
    stop('best column name must be of class caracter')
   
  if(!best %in% clmns){
    warning("'best' not included in 'clmns', adding it for you")
    clmns <- c(clmns, best)
  }
  byinclmns <- by %in% clmns
  if(!byinclmns)
      clmns <- c(clmns, by)
  
  if(!sql){    
    gb <- groupBy(df, by, clmns=clmns, aggregation=NULL)
    out <- .rowlist2df(lapply(gb, function(g)   g[order(g[,best], decreasing=inverse)[1],]))
    out <- out[order(out[,best], decreasing=inverse),clmns]			
  }else{
    requireNamespace("RSQLite")
    
    dots.in.names <- any(grepl('\\.', names(df)))
    if(dots.in.names){
      ## protect SQL from R's allowance of '.'s in variable names
      by <- gsub('\\.','_', by)  
      names(df) <- gsub('\\.','_', names(df))  
      dotpos.clmns <- .gedots(clmns)     #remember
      clmns <- gsub('\\.','_', clmns)
    }
    
    tmpfile <- tempfile() 
    con <- RSQLite::dbConnect(RSQLite::dbDriver("SQLite"), dbname = tmpfile)
    RSQLite::dbWriteTable(con, 'tab', df)
    sql <- paste("SELECT * FROM tab AS tab1 WHERE tab1.oid IN 
                                   (SELECT tab2.oid FROM tab AS tab2
         				WHERE tab1.",by," = tab2.",by,"
         				ORDER BY tab2.", best," ", c('ASC','DESC')[inverse+1],"
     				LIMIT 1)", sep='')
    out <- RSQLite::dbGetQuery(con, sql)
    out <- out[order(out[,best], decreasing = inverse), clmns]
    
    ## convert underscores back to '.'s 
    if(dots.in.names)
      names(out) <- .redots(names(out), dotpos.clmns) 
    
    rownames(out)<- out[,by]
    if(!byinclmns)
	out[,by] <- NULL
  }
  return(out)
}


## the next two functions compensate for sql turning '.'s in names to '_' 
## it finds the positions, let's R -> SQL do it's conversion ...
.gedots <- function(v)
  lapply(v,function(x) gregexpr('\\.',x)[[1]])
  
# ... and then remembers where the original '.'s were and puts them back
.redots <- function(v, pos.list){
  for(i in seq(along=v)){
    if(any(pos.list[[i]] < 0)){
      next
    }else{
     for(pos in pos.list[[i]]){
       begin <- substr(v[i],1,pos-1)
       end <- substr(v[i],pos+1, nchar(v[i]))
       v[i] <-  paste(begin,end, sep='.')
     }
    } 
   }  
  return(v)
}

groupBy <- function(df, by, aggregation, clmns=names(df), collapse=',', distinct=FALSE, sql=FALSE, full.names=FALSE, ...){

  #aggregation <- match.arg(aggregation, several.ok=TRUE)
  
  #if(any(aggregation!='none'))
  if(!any(is.null(aggregation)))
    if(length(aggregation) != length(clmns))
      if(length(aggregation) == 1){
        warning(paste("automatically extending your only specified aggregation to all ",length(clmns)," clmns"))
        aggregation <- rep(aggregation, length(clmns))
      }else{
        stop("length of 'aggregation' does not equal length of 'clmns'")
      }
  
  if(is.character(by)){
    if(!by %in% names(df)){
      stop("character argument 'by' not in names of dataframe 'df'")
    }else{
       if(!sql)
         by <- df[,by]
    }
  }else{  #not character
    if(sql)
      stop('sql version requires character field name for grouping factor')
  }
  
  if(any(sapply(as.character(by), nchar)> 255))
    stop("levels of by must have no more than 256 characters")

  if(!sql){

    if(any(is.null(aggregation))){
      return(by(df, by, function(x) x[,clmns]))
    }else{
      #aggregation <- sub('count','length', aggregation)
      out <- .rowlist2df(
               by(df, by, function(x) .aggregateDFrow(x, funcs=aggregation, clmns=clmns, collapse=collapse, ...)))
    }
  }else{
    if(any(is.null(aggregation)))
      stop('cannot return group by with out agregation for sql version')
    if(any(c('sd','var') %in% aggregation))
      stop('SQLite does not yet support STDDEV or VARIANCE SQL calls')
   
    supported.sql.funcs <- c('sum','mean','max','min','length','paste')
    if(!all(aggregation %in% supported.sql.funcs))
      stop("In SQL mode, please stick to the supported functions: ", paste(supported.sql.funcs, collapse=','))
    
    dots.in.names <- any(grepl('\\.', names(df)))
    if(dots.in.names){
        ## protect SQL from R's allowance of '.'s in variable names
	by <- gsub('\\.','_', by)  
	names(df) <- gsub('\\.','_', names(df))  
        dotpos.clmns <- .gedots(clmns) #remember positions for reinitroduction
        clmns <- gsub('\\.','_', clmns)
    }
    
    requireNamespace("RSQLite")
    tmpfile <- tempfile() 
    con <- RSQLite::dbConnect(RSQLite::dbDriver("SQLite"), dbname = tmpfile)
    ## sort by 'by' column: important for ordering match to 'unique' row renaming below
    df <- df[order(df[,by]),]
    RSQLite::dbWriteTable(con, 'tab', df)
    aggregation <- sub('mean','avg', aggregation)
    aggregation <- sub('length','count', aggregation)
    #aggregation <- sub('sd','stddev', aggregation)  #not yet supported by SQLite
    #aggregation <- sub('var','variance', aggregation) #not yet supported by SQLite
    sql.concat <- 'group_concat('
    sql.collapse <- paste(", '",collapse,"'",sep='')

    if(distinct){
      sql.concat <- paste(sql.concat, 'DISTINCT ', sep='')
      if(collapse == ','){
        sql.collapse <- ''
      }else{
        stop("collapse must be ',' when distinct, sql are both TRUE")
      }
    }
	
    select <- paste(paste(aggregation,"(",clmns,") AS ", clmns,'_',aggregation, sep=''), collapse=', ')  #
    select <- gsub('paste\\(([^\\)]+)',paste(sql.concat, "\\1", sql.collapse ,sep=''), select) #array_to_string(array_agg(field), '; ') #postgresql
    
    ## add the 'by' variable
    select <- paste(by,"AS sortbyid,", select)
    
    sql <- paste("SELECT", select, "FROM tab GROUP BY", by)
    out <- RSQLite::dbGetQuery(con, sql)
    
    rownames(out) <- out$sortbyid
    out$sortbyid <- NULL
    
    ## convert underscores back to '.'s 
    if(dots.in.names){
      clmns      <- .redots(clmns     , dotpos.clmns)  
      names(out) <- .redots(names(out), dotpos.clmns) 
    }
  }

  if(!full.names){
    if(length(unique(clmns)) != ncol(out))
      warning('Selected columns are not unique, cannot use original names. Using full names instead')
    else
      names(out) <- clmns
  }
  
  
  return(out)
}


#.dbGroupBy <- function(df, by, clmns, aggregation){

#}


regroup <- function(df, old, new, clmns=names(df), funcs=rep('sum',length(clmns)), combine=TRUE){
  
  if(length(old) != length(new))
    stop('old and new must be the same length')

  if(combine)
    groupings <- row.names(df)
  else
    groupings <- rep(NA, nrow(df))
  matches <- match(old, row.names(df))
  groupings[matches[!is.na(matches)]] <- new[!is.na(matches)]

  regroup <- groupBy(df, by=as.factor(groupings), funcs, clmns=clmns)
  return(regroup)
}

  
addFactLevs <- function(x, new.levs=NA){

  if(!is.data.frame(x))
    stop('x must be a data.frame')

  isfacts <- sapply(x, is.factor)
  for(clmn in names(x)[isfacts])
    x[,clmn] <- factor(x[,clmn], levels=c(levels(x[,clmn]), new.levs))
  x
}



sstable <- function(x, idx.clmns, ct.clmns=NULL, na.label='NA'){#,exclude=exclude, ...){

  if(!is.data.frame(x))
    stop('x must be a data.frame')

  if(any(sapply(x[,idx.clmns], function(z) !is.factor(z)))){# class)!= 'factor')){
    warning('some or all of your index columns are not factors. coercing them...')
    for(idx in idx.clmns)
      x[,idx] <- as.factor(x[,idx])
  }
    
  
  # should I make this run more like the other 'table' functions?
  if(is.null(ct.clmns)){
    ## plain old table method
    tab <- table(x[,idx.clmns], exclude='ifany')
    rownames(tab)[is.na(rownames(tab))] <- na.label
    
  }else{
    ## this whole section need reworking into a much simpler agg2tab function perhaps    
    ncc <- length(ct.clmns)
    nic <- length(idx.clmns)
  
    if(nic > 2)
      stop("can't have more than 2 index columns")
    if(ncc > 1 & nic > 1)
      stop('cannot have both more than one index and count columns')
       
    y <- tab2df(x[,idx.clmns])
    y <- addFactLevs(y, 'NA')
    y[is.na(y)] <- na.label
      
    if(nic == 2 & ncc == 1){
      tmp <- as.table( by(x[,ct.clmns], as.list(y), sum))
    }else{
      tmp <- as.table(sapply(ct.clmns, function(cc)
                             by(x[,cc], as.list(y), sum)
                             )
                      )
    }

    tab <- tab2df(tmp)
    tab [is.na(tab)] <- 0

    if(dim(tmp)[1]==2 & NROW(tab)==1 & ncc==2){
      colnames(tab) <- ct.clmns
      rownames(tab) <- levels(y) # was unique(y)
    }
    
  }
  if(NCOL(tab) == 1 & NROW(tab) == 1){
    ret <- data.frame(tab[1,1], row.names=rownames(tab))
    colnames(ret) <- colnames(tab)
    return(ret)
  }
  
  if(NCOL(tab) > 1){
    tab <- tab2df(tab)    
    tab$sum <- apply(tab, 1, sum)

    tab <- subset(tab, sum > 0)
    
    return(tab[rev(order(tab$sum)),])
  }else{
    return(tab[rev(order(tab[,1])),])
  }
}


leghead <- function(x, n=7, tabulate=FALSE, colors=TRUE, na.name='NA',na.col='white', other.col='gray', na.last=TRUE){

  if(as.logical(tabulate))
    x <- sstable(x, tabulate)
  
  if(n > 3 & n < nrow(x)){
    
    o.rn <- rownames(x)    
    is.n <- sapply(x, is.numeric)
    n <- n + sum(!is.n)  # so that we can utilize (an) extra color(s)
    others.summed <- apply(x[(n+1):nrow(x), is.n],2,sum)    
    x <- rbind(head(x,n), c(others.summed, 'NA'[!is.n]))
    rownames(x) <- c(o.rn[1:n],'other')
    
  }

  if('color' %in% colnames(x)){
    warn.col.clmn <- 'dataframe already has a color column:'
    isna <- rownames(x)== 'NA'
    if(sum(isna) > 0){
      warn.col.clmn <- paste(warn.col.clmn , 'changing NA levels to', na.col, '.')
      x$color[isna] <- na.col
    }else{
      warn.col.clmn <- paste(warn.col.clmn , 'leaving it as is.')
    }
    warning(warn.col.clmn)
    colors <- FALSE
  }
  
  if(all(colors != FALSE)){
    
    if(length(colors)>1){

      matches <- match(names(colors),rownames(x))
      if(all(is.na(matches)))
        stop('color vector names do not match the top n table row names')
      x$color <- colors[matches]
      
    }else{
      isna <- rownames(x)=='NA'
      x$color[!isna] <- rainbow(nrow(x) - sum(isna))
      x$color[ isna] <- na.col
    }
  }
  if('color' %in% colnames(x))
    x$color[rownames(x)=='other'] <- other.col  

  if(na.name != 'NA')
    rownames(x)[match('NA', rownames(x))] <- na.name
    
  if(na.last){
    isna <- rownames(x)==na.name
    if(sum(isna) > 0)
      x <- x[c(rownames(x)[!isna], na.name),]
  }
  x
}




.vle2df <- function(vl,i){
  ## vector list element to dataframe (preserves list element names)

  if(is.data.frame(vl[[i]])){
    df <- vl[[i]]
    names(df) <- paste(names(df),'.',i,sep='')    
  }else{
    df <- as.data.frame(vl[[i]])
    colnames(df) <- i
    rownames(df) <- names(vl[[i]])
  }
  df$rownames <- rownames(df)  #necessary because the built in b='row.names' merge is really slow (if not completely broken)
  df
}


nerge <- function(l, ...){
  ## named data.frame or vector merge

  if(!all(sapply(l, function(k) is.data.frame(k) | is.vector(k) | is.factor(k))))
     stop('list elements must be either of class data.frame or of type vector (or factor)')
  if(length(l) < 2)
    stop('list l must have at least 2 elements')
  if(is.null(names(l))){
    warning("each merge element in the list 'l' should have a name. making some up.")
    names(l) <- letters[1:length(l)]
  }
  if(!all(sapply(l, function(j) !is.null(rownames(j)) | !is.null(names(j)))))
    stop('all list elements must have named components (dataframes must have row names)')
    
  df <- .vle2df(l,names(l)[1])
  for(e in 2:length(l)){
    df <- merge(df, .vle2df(l, names(l)[e]), by='rownames', ...)
    rownames(df) <- df$rownames
  }
  df$rownames <- NULL


  ## removing appended colnames if unnecessary
  orig.names <- sub(paste('\\.(',paste(names(l),collapse='|'),')$',sep=''),'', names(df))
  if(length(orig.names)== length(unique(orig.names)))
    names(df) <- orig.names
  
  df
}

Any scripts or data that you put into this service are public.

caroline documentation built on Oct. 21, 2024, 9:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

caroline
A Collection of Database, Data Structure, Visualization, and Utility Functions for R

R/aggregate.R
In caroline: A Collection of Database, Data Structure, Visualization, and Utility Functions for R

Defines functions nerge .vle2df leghead sstable addFactLevs regroup groupBy .redots .gedots bestBy .rowlist2df .aggregateDFrow .pasteUnique

Documented in addFactLevs bestBy groupBy leghead nerge regroup sstable

Try the caroline package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

caroline A Collection of Database, Data Structure, Visualization, and Utility Functions for R

R/aggregate.R In caroline: A Collection of Database, Data Structure, Visualization, and Utility Functions for R

Defines functions nerge .vle2df leghead sstable addFactLevs regroup groupBy .redots .gedots bestBy .rowlist2df .aggregateDFrow .pasteUnique

Documented in addFactLevs bestBy groupBy leghead nerge regroup sstable

Try the caroline package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

caroline
A Collection of Database, Data Structure, Visualization, and Utility Functions for R

R/aggregate.R
In caroline: A Collection of Database, Data Structure, Visualization, and Utility Functions for R