
##=======================================================================================
## Data conversion functions in R
##
## LGPL-3.0 License, see https://www.gnu.org/licenses/lgpl-3.0.txt
##
## Copyright (c) 2015, Anton Antonov
## All rights reserved.
##
##=======================================================================================
## Start date: April 2014
##
## This file has R functions that help with ingesting data in R.
## Some of the functions address problematic reading of TSV / CSV files.
## Some are for field partitioning.
## Some are for conversion to sparse matrices.
##
##=======================================================================================
##
## This file is included in the ERTMon-R R directory in order to simplify ERTMon-R's package
## installation and the loading of function dependencies.
##
## The most recent version of this file can be found in:
##     https://github.com/antononcube/MathematicaForPrediction/blob/master/R/DataConversionFunctions.R
##
## It might be better for this file's functions to be in a separate package,
## but the trade-off is not clear.
##
##=======================================================================================

# Load libraries
library(purrr)
library(stringr)
library(reshape2)
library(Matrix)
library(lubridate)
library(dplyr)

#' Split column of tags.
#' @description Splits a column of combined tag strings into separate, trimmed tags.
#' @param dataColumn a character vector of tag strings
#' @param numberOfSplits number of strings into which each field is split
#' @param sep separator of the tags within a field
#' @param emptyStringReplacement with what value empty strings are replaced
#' @export
SplitColumnOfTags <- function( dataColumn, numberOfSplits, sep = ",", emptyStringReplacement = "null"  ) {
  spCol <- str_split_fixed( dataColumn, sep, numberOfSplits )
  spCol[ spCol == "" ] <- emptyStringReplacement
  #spCol[ spCol == "null"] <- NA
  ##spCol <- apply( spCol, c(1,2), str_trim)
  spCol <- apply( spCol, c(2), str_trim)
  spCol
}
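
## A hypothetical usage example (not part of the original file), wrapped in
## `if( FALSE )` so that sourcing the file does not execute it.
if( FALSE ) {
  tagsCol <- c( "drama, comedy", "action", "thriller, crime, drama" )
  SplitColumnOfTags( tagsCol, numberOfSplits = 3 )
  ## A 3 x 3 character matrix of trimmed tags; missing entries become "null".
}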

#' Tag baskets matrix into item tag matrix.
#' @description Converts a tag baskets table (an R matrix) into a binary incidence sparse matrix.
#' @param itemTagMat an item-tag baskets table (matrix)
#' @details Relies on \code{SMRCreateItemTagMatrix}, which is not defined in this file.
#' @export
TagBasketsMatrixIntoItemTagMatrix <- function( itemTagMat ) {
  # Convert to long form
  itRows <- reshape2::melt( itemTagMat, id.vars = colnames(itemTagMat)[[1]] )
  itRows$value[ itRows$value == "" ] <- "null"
  # Convert long form to sparse matrix
  SMRCreateItemTagMatrix( itRows, "Var1", "Var2" )
}

#' File columns ingestion.
#' @description Given a file name, reads the data into a column data frame.
#' @param fname file name (assumed to be a TSV file)
#' @param sep separator
#' @param expectedColumns expected number of columns per line; lines with a different
#' number of fields are dropped
#' @param header a logical value indicating whether the file contains the names of
#' the variables as its first line
#' @param apply.iconv should \code{iconv} be applied to the lines read
#' @export
FileColumnsIngest <- function( fname, sep="\t", expectedColumns=3, header=TRUE, apply.iconv = TRUE ) {
  con <- file(fname, "rb")
  if ( apply.iconv ) {
    rawContent <- iconv( readLines(con) )
  } else {
    rawContent <- readLines(con)
  }
  close(con)  # close the connection to the file, to keep things tidy

  # for each line in rawContent
  # count the number of delims and compare that number to expectedColumns
  indexToOffenders <-
    purrr::map_lgl(rawContent, function(x) {
      length(gregexpr(sep, x)[[1]]) != (expectedColumns-1)
    })

  # triplets <- read.csv2(rawContent[-indxToOffenders], header=TRUE, sep=cdelim)
  if ( sum(indexToOffenders) > 0 ) {
    rawContent <- rawContent[ !indexToOffenders ]
  }

  triplets <-
    purrr::map_dfr( rawContent, function(x) {
      if ( is.null(x) ) {
        NULL
      } else {
        res <- strsplit( x, sep )
        if ( is.null(res) || length(res) == 0 ) { NULL } else { as.data.frame( t(res[[1]]), stringsAsFactors = FALSE ) }
      }
    } )

  if ( header ) {
    names(triplets) <- triplets[1,]
    triplets <- triplets[-1,]
  }
  triplets
}
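
## A hypothetical usage example (not part of the original file): a small TSV file
## is written to a temporary location and read back as a three-column data frame.
if( FALSE ) {
  tmpFile <- tempfile( fileext = ".tsv" )
  writeLines( c( "id\titem\tvalue", "1\ta\t3.2", "2\tb\t1.5" ), tmpFile )
  FileColumnsIngest( tmpFile, sep = "\t", expectedColumns = 3, header = TRUE )
}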

#' File triplets ingestion.
#' @description Given a file name, reads the data into a three-column data frame,
#' replaces NA values in the third column with 0, and removes rows with empty or incomplete values.
#' @param fname file name
#' @param sep separator
#' @export
FileTripletsIngest <- function( fname, sep="\t" ) {
  lines <- FileColumnsIngest(fname, sep, 3)
  lines[,3] <- as.numeric( lines[,3] )
  lines[,2] <- as.character( lines[,2] )
  lines[,1] <- as.character( lines[,1] )
  lines[ is.na( lines[,3]), 3] <- 0
  lines <- lines[ nchar( lines[,2] ) > 0,  ]
  lines <- lines[ nchar( lines[,1] ) > 0,  ]
  lines[ complete.cases(lines), ]
}

#' File triplets.
#' @description Turns the triplet records of a file into a sparse matrix
#' @param fname file name
#' @param sep field separator
#' @param propertiesToStrings should all properties be turned into strings
#' @export
FileTriplets <-  function( fname, sep="\t", propertiesToStrings=TRUE ) {
  df <- FileTripletsIngest( fname, sep )
  TripletsToSparseMatrix( df )
}

#' Combine two triplets files.
#' @description Combining the triplets of two files into a sparse matrix
#' @param fname1 name of the first file
#' @param fname2 name of the second file
#' @param sep field separator
#' @param propertiesToStrings should the properties be converted into strings
#' @export
TwoFilesTriplets <- function( fname1, fname2, sep="\t", propertiesToStrings=TRUE ) {
  df1 <- FileTripletsIngest( fname1, sep )
  df2 <- FileTripletsIngest( fname2, sep )
  TripletsToSparseMatrix( rbind(df1, df2) )
}

#' Make a sparse array/matrix from triplets.
#' @description Turns a data frame of three columns (triplets) into a sparse matrix
#' @param triplets a data frame with three columns
#' @family Sparse matrix transformation functions
#' @export
TripletsToSparseMatrix <-  function( triplets ) {
  itemIDs <- unique( triplets[,1] )
  propertyIDs <- unique( triplets[,2] )
  itemIDToIndex <- 1:length(itemIDs)
  names(itemIDToIndex) <- itemIDs
  propertyIDToIndex <- 1:length(propertyIDs)
  names(propertyIDToIndex) <- propertyIDs
  smat <- sparseMatrix( i=itemIDToIndex[ triplets[,1] ],
                        j=propertyIDToIndex[ triplets[,2] ],
                        x=triplets[,3],
                        dims=c( length(itemIDs), length(propertyIDs) )  )
  rownames(smat) <- itemIDs
  colnames(smat) <- propertyIDs

  # I don't think we need the rules arrays. We can always re-create them if needed.
  #list( Matrix=smat, ItemIDToIndex=itemIDToIndex, PropertyIDToIndex=propertyIDToIndex )
  smat
}
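
## A hypothetical usage example (not part of the original file): a three-column
## data frame of (item, property, value) triplets turned into a sparse matrix.
if( FALSE ) {
  triplets <- data.frame( item = c( "a", "a", "b" ),
                          property = c( "x", "y", "x" ),
                          value = c( 1, 2, 3 ),
                          stringsAsFactors = FALSE )
  TripletsToSparseMatrix( triplets )
  ## A 2 x 2 sparse matrix with rows "a", "b" and columns "x", "y".
}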

#' Convert sparse matrix into triplets
#' @description Converts a sparse matrix into a data frame triplets.
#' @param smat A sparse matrix.
#' @return A data frame of triplets.
#' @family Sparse matrix transformation functions
#' @export
SparseMatrixToTriplets <- function( smat ) {
  
  triplets <- summary(smat)
  
  if( !is.null(rownames(smat)) ) {
    triplets$i <- rownames(smat)[ triplets$i ]
  }
  
  if( !is.null(colnames(smat)) ) {
    triplets$j <- colnames(smat)[ triplets$j ]
  }
  
  as.data.frame(triplets, stringsAsFactors=FALSE)
}
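
## A hypothetical usage example (not part of the original file): converting a
## small sparse matrix back into (row, column, value) triplets.
if( FALSE ) {
  smat <- Matrix::sparseMatrix( i = c(1, 2), j = c(1, 2), x = c(10, 20),
                                dimnames = list( c("a", "b"), c("x", "y") ) )
  SparseMatrixToTriplets( smat )
  ## A data frame with columns i, j, x holding ("a", "x", 10) and ("b", "y", 20).
}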

#' Impose row ID's to a sparse matrix.
#' @description Makes sure that the rows of a matrix are in 1-to-1 correspondence to an array of row ID's
#' @param rowIDs A character vector of row ID's.
#' @param smat A matrix with named rows.
#' @return A sparse matrix.
#' @export
ImposeRowIDs <- function( rowIDs, smat ) {

  missingRows <- setdiff( rowIDs, rownames(smat) )
  nMissingRows <- length( missingRows )

  if ( nMissingRows > 0 ) {
    # Rows are missing in the matrix
    complMat <- sparseMatrix(i=c(1), j=c(1), x=c(0), dims = c( nMissingRows, ncol(smat) ) )

    rownames(complMat) <- missingRows
    colnames(complMat) <- colnames(smat)

    smat <- rbind( smat, complMat )
  }
  # At this point each element of rowIDs should have a corresponding row in the matrix
  smat[rowIDs,,drop=FALSE]
}
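
## A hypothetical usage example (not part of the original file): a zero row is
## added for the missing row ID "c" and the rows are reordered to match the given IDs.
if( FALSE ) {
  smat <- Matrix::sparseMatrix( i = c(1, 2), j = c(1, 2), x = c(1, 1),
                                dimnames = list( c("a", "b"), c("x", "y") ) )
  ImposeRowIDs( c( "b", "a", "c" ), smat )
  ## A 3 x 2 sparse matrix with rows "b", "a", "c"; the row "c" is all zeros.
}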

#' Impose column ID's to a sparse matrix.
#' @description Makes sure that the columns of a matrix are in 1-to-1 correspondence to an array of column ID's
#' @param colIDs A character vector of column ID's.
#' @param smat A matrix with named columns.
#' @return A sparse matrix.
#' @export
ImposeColumnIDs <- function( colIDs, smat ) {

  t( ImposeRowIDs( colIDs, t(smat)) )
}

#' Sparse matrix row maximums.
#' @description Find the row maximums in a sparse matrix.
#' @param smat A sparse matrix.
#' @details The argument \code{smat} is expected to be of type "dgCMatrix".
#' The following condition should hold: \code{mean( names( RowMaxes(smat) ) == rownames(smat) ) == 1}.
#' @return A numeric vector with named elements.
#' @export
RowMaxes <- function( smat ) {
  
  rowMaxes <- setNames( ERTMon::SparseMatrixToTriplets( smat ), c("RowID", "ColID", "Value") )
  rowMaxes <- rowMaxes %>% dplyr::group_by( RowID ) %>% dplyr::summarise( Max = max(Value), .groups = "drop" ) 
  rowMaxes <- setNames( rowMaxes %>% dplyr::pull( Max ), rowMaxes %>% dplyr::pull( RowID ) ) 
  rowMaxes <- rowMaxes[rownames(smat) ]
  
  rowMaxes
}
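
## A hypothetical usage example (not part of the original file). Note that
## RowMaxes calls ERTMon::SparseMatrixToTriplets, so the ERTMon package is
## assumed to be installed.
if( FALSE ) {
  smat <- Matrix::sparseMatrix( i = c(1, 1, 2), j = c(1, 2, 2), x = c(3, 5, 7),
                                dimnames = list( c("a", "b"), c("x", "y") ) )
  RowMaxes( smat )
  ## A named numeric vector: c( a = 5, b = 7 ).
}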

#' Sparse matrix column maximums.
#' @description Find the column maximums in a sparse matrix.
#' @param smat A sparse matrix.
#' @details The argument \code{smat} is expected to be of type "dgCMatrix".
#' @export
ColMaxes <- function( smat ) {
  RowMaxes(t(smat))
}

#' Piecewise function constructor.
#' @description Makes a piecewise (step) function for a vector of break points.
#' The values of \code{tags} are used as the function's results over the corresponding ranges.
#' If \code{tags} is NULL the tags are automatically assigned to be ordinals starting from 0.
#' Similar behavior is provided by the base function \code{findInterval}.
#' @param points a numeric vector of break points
#' @param tags the values to be returned for the ranges defined by \code{points}
#' @details \code{length(tags) == length(points) + 1}
MakePiecewiseFunction <- function( points, tags=NULL ) {
  if ( length(points) == 0 || is.null(points) ) {
    warning("NULL or an empty list is given as an argument.", call.=TRUE )
    return( NULL )
  }
  if ( !is.numeric(points) ) {
    warning("The first argument, points, is expected to be a numeric list.", call. =TRUE )
    return( NULL )
  }
  if ( !is.null(tags) && !is.numeric(tags) ) {
    warning("The second argument, tags, is expected to be NULL or a numeric list.", call.=TRUE )
    return( NULL )
  }

  points <- sort(points)

  if ( is.null( tags ) ) {
    tags <- 0:length(points)
  }

  funcStr <- paste( "function(x){ ( x <=" , points[1], " ) *", tags[1] )

  for( i in seq_len( length(points)-1 ) ) {
    funcStr <- paste( funcStr, "+  (", points[i], "< x & x <=", points[i+1], ")*", tags[i+1] )
  }

  funcStr <- paste( funcStr, "+ (", points[length(points)], " < x )*", tags[length(points)+1], "}" )

  eval( parse( text=funcStr ) )
}
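
## A hypothetical usage example (not part of the original file): a step function
## over the break points 0 and 1 with the automatically assigned tags 0, 1, 2.
if( FALSE ) {
  qF <- MakePiecewiseFunction( c(0, 1) )
  qF( c(-1, 0.5, 2) )
  ## Gives c(0, 1, 2).
}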

#' Multi-valued column ingestion.
#' @description Splits a multi-valued (e.g. comma separated) column of a data frame into tag ID columns.
#' @param itemRows a data frame of flat content data
#' @param tagTypeColName column name in \code{itemRows} of the multi-valued column to be ingested
#' @param itemIDColName column name in \code{itemRows} of the item ID column
#' @param nTagsPerField number of tags per field of the column \code{tagTypeColName} in \code{itemRows}
#' @param split separator of the values within a field
#' @export
IngestMultiValuedDataColumn <- function( itemRows, tagTypeColName, itemIDColName = "ID", nTagsPerField = 12, split = "," ) {

  spdf <- str_split_fixed( itemRows[, tagTypeColName], pattern = split, n=nTagsPerField )
  spdf <- as.data.frame( spdf, stringsAsFactors = FALSE )
  for( i in 1:ncol(spdf) ) {
    spdf[[i]] <- gsub( pattern = "^\\W", replacement = "", spdf[[i]] )
  }
  names( spdf ) <- paste( "tag", 1:nTagsPerField, sep="_" )

  tags.itemRows <- cbind( "id"=itemRows[[itemIDColName]], spdf )

  tags.itemRows$'tag_1'[ tags.itemRows$'tag_1' == "N/A" ] <- NA
  tags.itemRows <- tags.itemRows[ !is.na( tags.itemRows$'tag_1' ), ]

  ## In order to fit the sparse matrix creation
  tags <- unique( do.call(c, spdf) )
  tags <- data.frame( 'id'=1:length(tags), 'name'=tags[order(tags)] )
  tagToIDRules <- tags$id; names(tagToIDRules) <- tags$name
  for( i in 2:ncol(tags.itemRows) ) {
    tags.itemRows[[i]] <- tagToIDRules[ tags.itemRows[[i]] ]
  }
  names(tags.itemRows) <- c( "id", paste( "tag_id", 1:nTagsPerField, sep="_" ) )

  # result
  list( tags = tags, tags.items = tags.itemRows )
}
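
## A hypothetical usage example (not part of the original file): a data frame
## with a comma separated "Genres" column split into per-tag ID columns.
if( FALSE ) {
  itemRows <- data.frame( ID = c( "m1", "m2" ),
                          Genres = c( "drama,comedy", "action" ),
                          stringsAsFactors = FALSE )
  IngestMultiValuedDataColumn( itemRows, tagTypeColName = "Genres",
                               itemIDColName = "ID", nTagsPerField = 3 )
  ## A list with a data frame of tags and a data frame of item-to-tag-ID rows.
}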

## For backward compatibility
IngestMovieDataColumn <- IngestMultiValuedDataColumn

#' Convert multi-column data frame into a sparse matrix.
#' @description Converts a multi-column data frame of item ID-tag relationships into a sparse incidence matrix.
#' @param multiColDF a multi-column data frame with item ID-tag relationships
#' @param itemColName the column name of the item ID
#' @param tagTypeColNames names of the tag type columns
#' @details This does not work if the \code{tagTypeColNames} have dashes in them,
#' presumably because of the string-to-formula conversion in \code{SMRCreateItemTagMatrix}.
#' Obviously, the dependence on \code{SMRCreateItemTagMatrix} can be removed.
#' @family Sparse matrix transformation functions
#' @export
ConvertMultiColumnDataFrameToSparseMatrix <- function( multiColDF, itemColName, tagTypeColNames ) {

  emptyColumns <- purrr::map_lgl( tagTypeColNames, function(tt) mean( is.na( multiColDF[,tt] ) ) == 1 )

  if ( sum( !emptyColumns ) < 1 ) {
    stop( "All tag columns are empty.", call. = TRUE )
  }
  tagTypeColNames <- tagTypeColNames[ !emptyColumns ]

  ## Find all the sub-matrices for the tag types
  gmats <- 
    purrr::map( tagTypeColNames, function( tt ) {
      SMRCreateItemTagMatrix( dataRows = multiColDF, itemColumnName = itemColName, tagType = tt )
    } )

  ## Find all tags
  allTags <- unique( unlist( purrr::map( gmats, colnames ) ) )
  allIDs <- unique( unlist( purrr::map( gmats, rownames ) ) )

  ## Impose the tags to all tags matrices
  gmats <- purrr::map( gmats, function(m) { ImposeRowIDs( allIDs, ImposeColumnIDs( allTags, m ) ) })

  ## Sum the tag matrices into one matrix
  gmat <- gmats[[1]]
  if ( length(gmats) > 1 ) {
    for( i in 2:length(gmats) ) { gmat <- gmat + gmats[[i]] }
  }

  gmat
}


#' Make a matrix by column partitioning.
#' @description Make categorical representation of the numerical values of a column in a data frame and 
#' produce a matrix with the derived categorical tags as columns and values of a specified data column as rows. 
#' @param data a data frame
#' @param colNameForRows a column name in data for the rows of the result matrix
#' @param colNameForColumns a column name in data for the columns of the result matrix
#' @param breaks the points over which the breaking of \code{data[colNameForColumns]} is done
#' @param leftOverlap vector of weights for the neighboring columns to the left
#' @param rightOverlap vector of weights for the neighboring columns to the right
#' @param colnamesPrefix prefix for the columns names
#' @export
MakeMatrixByColumnPartition <- function( data, colNameForRows, colNameForColumns, breaks = 10, leftOverlap = NULL, rightOverlap = NULL, colnamesPrefix = "" ) {

  if( is.numeric( breaks ) && length( breaks ) == 1 ) {
    d0 <- min(data[[colNameForColumns]]); d1 <- max(data[[colNameForColumns]])
    breaks <- seq( d0, d1, (d1-d0)/(breaks-1) )
  }

  smat <- data[ , c(colNameForRows, colNameForColumns) ]
  qF <- MakePiecewiseFunction( breaks )
  smat <- cbind( smat, parts = purrr::map_dbl( smat[[colNameForColumns]], qF ) )
  smat <- xtabs( as.formula( paste( "~", colNameForRows, "+ parts") ), smat, sparse = TRUE )
  colnames(smat) <- paste( colnamesPrefix, colnames(smat), sep="" )

  if ( !is.null( leftOverlap ) && !is.null( rightOverlap ) ) {
    genMat <- smat
  }

  if ( !is.null( leftOverlap ) ) {

    addMat <- smat
    zeroCol <- sparseMatrix(  i = c(1), j = c(1), x = 0, dims = c( nrow(smat), 1 ) )

    for( w in rev(leftOverlap) ) {
      addMat <- addMat[,2:ncol(addMat)]
      addMat <- cbind( addMat, zeroCol )
      smat <- smat + w * addMat
    }
  }

  if ( !is.null( rightOverlap ) ) {

    if ( is.null( leftOverlap ) ) { addMat <- smat } else { addMat <- genMat }

    zeroCol <- sparseMatrix(  i = c(1), j = c(1), x = 0, dims = c( nrow(smat), 1 ) )

    for( w in rightOverlap ) {
      addMat <- addMat[,1:(ncol(addMat)-1)]
      addMat <- cbind( zeroCol, addMat )
      smat <- smat + w * addMat
    }
  }

  smat
}
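
## A hypothetical usage example (not part of the original file): the numerical
## column "score" is broken into three intervals and a sparse contingency matrix
## of "id" versus interval is produced.
if( FALSE ) {
  data <- data.frame( id = c( "a", "a", "b", "b" ),
                      score = c( 0.1, 0.9, 0.4, 0.6 ),
                      stringsAsFactors = FALSE )
  MakeMatrixByColumnPartition( data, colNameForRows = "id", colNameForColumns = "score",
                               breaks = 3, colnamesPrefix = "score." )
}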

#' Convert to incidence matrix by column values.
#' @description Replaces each column of an integer matrix with a group of columns corresponding to the integer values.
#' The matrix \code{[[2,3],[1,2]]} is converted to \code{[[0,0,1,0,0,0,0,1],[0,1,0,0,0,0,1,0]]} .
#' @param mat an integer matrix to be converted to a column value incidence matrix
#' @param rowNames should the row names of the result matrix be taken from the row names of the argument matrix
#' @param colNames should the column names of the result matrix be derived from the column names of the argument matrix
#' @family Sparse matrix transformation functions
#' @export
ToColumnValueIncidenceMatrix <- function( mat, rowNames = TRUE, colNames = TRUE ) {

   tmat <- as( mat, "dgCMatrix")
   df <- summary(tmat)
   df <- data.frame(df)
   # minInt <- min(mat,na.rm = T); maxInt <- max(mat,na.rm = T)
   minInt <- min(tmat@x); maxInt <- max(tmat@x)
   # step <- maxInt - minInt + 1 ## this is incorrect for df$j computed as df$j <- ( df$j - 1 ) * step + df$x
   step <- maxInt + 1

   if( min(df$x) < 0 ) {
      warning( "The non-zero values of the matrix are expected to be non-negative integers.", call. = TRUE)
   }

   df$j <- ( df$j - 1 ) * step + df$x + 1
   ## In other words we are doing this:
   ## triplets <- ddply( .data = df, .variables = .(i,j),
   ##                   .fun = function(row) { c(row[[1]], (row[[2]]-1)*step + row[[3]] + 1, 1) })

   ## Convenient way to check the implementation:
   ## resMat <- sparseMatrix( i = df$i, j = df$j, x = df$x, dims = c( nrow(mat), ncol(mat)*step ) )
   resMat <- sparseMatrix( i = df$i, j = df$j, x = rep(1,length(df$x)), dims = c( nrow(mat), ncol(mat)*step ) )
   
   if ( rowNames ) { rownames(resMat) <- rownames(mat) }

   if ( colNames ) { 
     colnames(resMat) <- as.character(unlist(Map( function(x) { paste(x, 0:maxInt, sep = ".") }, colnames(mat))))
   }
   
   resMat
}
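
## A hypothetical usage example (not part of the original file) that reproduces
## the conversion described in the documentation above.
if( FALSE ) {
  mat <- matrix( c( 2, 1, 3, 2 ), nrow = 2 )  # rows: (2, 3) and (1, 2)
  as.matrix( ToColumnValueIncidenceMatrix( mat, rowNames = FALSE, colNames = FALSE ) )
  ## Row 1: c(0,0,1,0, 0,0,0,1); row 2: c(0,1,0,0, 0,0,1,0).
}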


##===========================================================
## Add date tags
##===========================================================

#' Adding date derived tags.
#' @description Adds date-derived tag columns (day, month, and year boundaries, month name, and weekday name) computed from a specified date column.
#' @param data a data frame
#' @param dateColumnName the date column (given as an unquoted name) from which the tags are derived
#' @details The \code{Month} and \code{Weekday} factor levels assume English month and weekday names.
#' @export
AddDateTags <- function( data, dateColumnName ) {

  dateCol <- enquo(dateColumnName)

  qRes <-
  data %>%
    dplyr::mutate( DayBoundary = ceiling_date( !!dateCol, "days" ) ) %>%
    dplyr::mutate( MonthBoundary = ceiling_date( !!dateCol, "months" ) ) %>%
    dplyr::mutate( YearBoundary = ceiling_date( !!dateCol, "years" ) ) %>%
    dplyr::mutate( Month = months( !!dateCol ), Weekday = weekdays( !!dateCol ) ) %>%
    dplyr::mutate( Weekday = factor( Weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday" ) ) ) %>%
    dplyr::mutate( Month = factor( Month, levels = c( "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December" )  ) )

  qRes
}
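
## A hypothetical usage example (not part of the original file). The date column
## is given as an unquoted name; month and weekday names assume an English locale.
if( FALSE ) {
  df <- data.frame( TimeStamp = as.POSIXct( c( "2018-03-03 10:00:00", "2018-07-15 12:30:00" ) ),
                    Value = c( 1, 2 ) )
  AddDateTags( df, TimeStamp )
  ## The data frame with added DayBoundary, MonthBoundary, YearBoundary, Month, and Weekday columns.
}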

##===========================================================
## Summary partitioning
##===========================================================

#' Summary by columns partitioning.
#' @description Prints the summary of a data frame in batches of a specified number of columns.
#' @param data a data frame
#' @param numberOfColumns number of columns for the partitioning
#' @param ... additional arguments passed to \code{summary}
#' @export
SummaryPartitioned <- function( data, numberOfColumns = 3, ...) {

  k <- 1

  while( k <= ncol(data)) {
    if( k > 1 ) { cat("\n") }
    k1 <- min( ncol(data), k + numberOfColumns-1 )
    print( summary( data[, k:k1, drop=F ], ... ) )
    k <- k + numberOfColumns
  }
}
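
## A hypothetical usage example (not part of the original file): the summary of
## the built-in mtcars data frame printed four columns at a time.
if( FALSE ) {
  SummaryPartitioned( mtcars, numberOfColumns = 4 )
}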