towide: Wide data set from long data set

View source: R/towide.R

towideR Documentation

Wide data set from long data set

Description

Creates a wide data frame from a long data frame identifying an 'id' variable and using a time variable whose values provide suffixes for time-varying variable names in wide form.

Usage

towide(
  data,
  idvar = "id",
  timevar = "time",
  sep = "_",
  add.invariants = TRUE,
  ...
)

Arguments

data

a data frame in 'long' form.

idvar

(default: 'id') the variable(s) identifying each group of rows that are transformed to a single row in the wide file.

timevar

(default: 'time') the variable containing the occasion names in the long file.

sep

(default: '') the character(s) that separate the name of a time-varying variable in the long form from the added suffix for the correponding names in wide form. Default: ''.

add.invariants

(default TRUE) additional variables that are invariant within clusters are kept in output even if not included in 'idvar' (default: names(data)[gicc(data, dataidvar)]). For all variables, except idvar to be treated as time-varying, use invar = NULL.

...

Other arguments are passed to reshape.

Details

In contrast with reshape in stats, this function identifies variables that are invariant with respect to 'idvar' and does not expand them to wide form.

Only 'time'-varying variables are expanded to wide form.

Value

a data frame in wide form in which each variable that varies within levels of 'idvar' is turned into as many variables as there are distinct values of 'timevar' using the values of 'timevar' as suffixes to name the variables in wide form.

See Also

tolong

Examples

## Not run: 
# Subjects A, B observed on varying occasions, measuring variables
#  x and y in different locations
dd <- data.frame( subject = c('A','A','B'),
                  time = c(1,2,1), 
                  y.left = 1:3, y.right = 1:3, 
                  x.left= 1:3, x.right = 11:13, x.middle = 21:23 
                  )
dd
tolong(dd, sep = '.') # uses 'time' as default name for occasions variable
# Specify new 'timevar' to avoid clobbering 'time':
dl <- tolong(dd, sep = '.', timevar = "location") 
dl
#
# Back to wide format: Use 'idvar' to specify combination of 
# of variable values that uniquely identifies rows in wide file:
#
towide(dl, idvar = c('subject','time'), timevar = 'location')

# Long file with additional constants

dl <- data.frame(name = rep(c('A','B','C'), c(3,3,2)),
                 site = c('head','neck','jaw','chest')[
                      c(1,2,3,1,2,3,1,4)],
                 sex = rep(c('male','female','male'), c(3,3,2)),
                 var1 = 1:8,
                 var2 = 11:18,
                 invar = rep(1:3, c(3,3,2)))
dl
towide(dl, c('name','sex'), 'site')

# Two indexing variable: e.g. hippocampal volume: 2 sides x 3 sites

dl <- data.frame(name = rep(LETTERS[1:3], each = 6),
                 side = rep(c('left','right'), 9),
                 site = rep(rep(c('head','body','tail'),each = 2),3),
                 volume = 1:18,
                 grade = LETTERS[1:18],
                 sex = rep(c('female','male','female'), each = 6),
                 age = rep(c(25, 43, 69), each = 6))
dl
(dl.site <- towide(dl, c('name','side'), 'site'))
(dl.site.side <- towide(dl.site, c('name'), 'side'))
dl.site.side[,sort(names(dl.site.side))]

#
# Switching long and wide variables
# Multiple variables in 'idvar'
#  
dd <- read.table(header=T,text="
country    variable   1990 1991 1992 1993
Canada     population   20   21   24   26
Canada     income       10   12   12   11
Mexico     population   50   52   53   54
Mexico     income       30   31   33   34
")
 dd                 
 names(dd) <- sub("^X","val__", names(dd)) # use '__' in case '_' is used elsewhere                
 dd
 dl <- tolong(dd, sep = '__', timevar = 'year')
 dl
 dw <- towide(dl, idvar = c('country','year'), 
        timevar = 'variable')
 dw
 dw[grep('^id_',names(dw))] <- NULL
 dw
 names(dw) <- sub("^val_","", names(dw))
 dw

#
#  A function to flip years and variables
#
flip <- function(data, rowvar = 'country', 
                 colfmt = '[0-9]{4}$', 
                 varname = 'variable', sep = '__') {
    names(data) <- sub(
         paste0("^.*(",colfmt,')'), 
         paste0("value",sep,"\\1"), 
         names(data))
    dl <- tolong(data, sep = "__", timevar = 'year', idvar = "XXXX")
    dw <- towide(dl, timevar = varname, idvar = c(rowvar, 'year'), sep = '__')
    dw <- dw[, - grep("^XXXX", names(dw))]
    names(dw) <- sub(paste0('value',sep), '', names(dw))
    dw
}     
flip(dd)

#     
# Mixture of time-varying and time-invariant variables
#

dl <- data.frame(subject = c('A','A','A','B','B','C','C'), 
                 time = c(1,2,3,1,2,1,3),
                 sex = c('male','male','male','female','female','male','male'),
                 y = c(10,10,10,11,11,12,12), # accidentally time-invariant
                 x = c(20,21,22,25,26,18,19)) # time-varying
towide(dl, idvar = 'subject', timevar = 'time')  
towide(dl, idvar = 'subject', timevar = 'time', add.invariants = FALSE)  

# multiple time variables: e.g. month, day

dl <- data.frame(subject = c('A','A','A','B','B','C','C'), 
                 month = c(1,1,3,2,2,1,3),
                 day =   c(10,15, 2, 3, 9, 20, 2),
                 sex = c('male','male','male','female','female','male','male'),
                 y = c(10,10,10,11,11,12,12), # accidentally time-invariant
                 x = c(20,21,22,25,26,18,19)) # time-varying
# need single time variable
dl
dl$date <- with(dl, as.Date(paste0(month,'-',day),'%m-%d'))  # uses the current year
dl
dw <- towide(dl, idvar = 'subject', timevar = c('date'))
dl2 <- tolong(dw, sep = '_')
sortdf(dl2, ~ subject/time)
#
# Variables in long form
#
# This illustrates what how towide works when some variables 
# invariant wrt the key and others vary. Here the key is
# c('country','year'). The key-variant variables are value
# and rownum. The invariant variable is country.code
# 
dd <- read.table(header=TRUE, text = "
country   year   variable    value country.code  rownum  
Canada     2001    atemp       20   CAN           1
Canada     2002    atemp       23   CAN           2
US         2001    atemp       23   USA           3
US         2002    atemp       23   USA           4
Canada     2001    wind       120   CAN           5
Canada     2002    wind       123   CAN           6
US         2001    wind       123   USA           7
US         2002    wind       123   USA           8
Canada     2001    rain       220   CAN           9
Canada     2002    rain       223   CAN           10
US         2001    rain       223   USA           11
US         2002    rain       223   USA           12
")
(dw <- towide(dd, idvar = c('country','year'), timevar = 'variable'))
#
# to keep only the variable name as a name
#
names(dw) <- sub('^value_','', names(dw))
dw
#
# to get rid of other time varying variable
#
dw <- dw[, - grep('_', names(dw))]
dw

## End(Not run)

gmonette/spida2 documentation built on July 14, 2024, 12:45 p.m.