The stp25tools package provides tools for data wrangling and statistical transformations.

Lifecycle: experimental CRAN status

knitr::opts_chunk$set(echo = TRUE, warnings=FALSE)
require(stp25tools)

Funktionen

Transform vectors

Factor (factor2)

sex <- factor2(c(1,0,0,0,1,1,0), 
        male = 1, female = 0)
 sex

reorder2

 x <-
c(
  rep(1, 21),rep(2, 120),rep(3, 28),rep(4, 4),rep(5, 56),
  rep(6, 2),rep(7, 92),rep(8, 42),rep(9, 74),rep(10, 20)
)

x <- factor(x, 1:10, letters[1:10])
table(x)
table(reorder2(x))
table(reorder2(x, threshold = 30))

Numeric

x <-
  c("3,6",  "> 15100",  "+1",  "-1",
    "$10 -> expensive", "cheap: $2.50", "free $0 !!",
    "699.91  ",  " 228.4031.9 ",
    "",  NA,  "hallo1",  "-77"
  )


cbind(
  stp = as_numeric(x,  na.string = c("-77")),
  fct = as_numeric(factor(x)),
  rdr = readr::parse_number(x,  na = c("-77"))
)

Get Data

Works internally with the following functions:

Direct import from text

dat <-
  get_data("
sex treatment control
m  2 3
f  3 4
", 
tabel_expand = TRUE,
id.vars = 1)

xtabs(~ sex + value, dat)
dat <- 
  get_data(
"sex treatment  neg  pos
f   KG          3   3
f   UG          4   5
m   KG          5   4
m   UG          4   2
",
  tabel_expand = TRUE,
  id.vars = 1:2,
  value = "befund"
)

ftable(xtabs(~ sex + treatment + befund, dat))

Data File Import

if( file.exists("R/dummy.csv")) {
# get_data("R/dummy.csv", dec = ",", na.strings = "-", skip=1, label=1)
# get_data("R/dummy.xlsx", na.strings = "-")
 get_data("R/dummy.xlsx")
 x <- get_data("R/dummy.sav")
 get_label(x)[1:4]
}

Get / Set a variable label

Managing labels: Label, delet_label, get_label, set_label alternativ function labelled::set_variable_labels

Codebook Generate a data dictionnary

Save data and reconstruct from codebook

 save_data(dat, "demo.xlsx", include.codebook=TRUE)
 DF2 <- use_codebook(file = "demo.xlsx")
codebook(DF2)

Die Funktion look_for macht zwar nicht dasselbe, kann aber nützlich sein, wenn eine Variable gesucht wird.

labelled::look_for(DF2, "KG")

Filter + Consort-Plot

Works internally with the following functions:

airquality2 <-
  airquality |> Label(Ozone = "Ozone in ppm", Temp = "Temperatur in °C")

str(subset2(airquality2, Temp > 80, select = c(Ozone, Temp)))
str(dplyr::filter(airquality2, Temp > 80))

dat <- filter2(airquality2, Temp > 80)
str(dat)
# simple_consort_plot(dat)
attr(dat, "filter")
#require(stp25stat2)
#require(stp25tools)

data(DFdummy, package = "stp25data")

DF1 <- DFdummy |> 
  filter2(study.agreement)

attr(DF1, "filter")

DF2 <- DF1 |> filter2(
  st.p.sars.cov2 == "nein",
  !is.na(spike.igg.3.impfung),
  !is.na(MPN)

)

DF3 <- DF2 |> filter2(
  study.agreement,
  sero.negativ.after.dose.2,
  !is.na(spike.igg.3.impfung),
  !is.na(spike.igg.4.impfung),
  spike.igg.3.impfung == "<7.1 BAU/ml"
)
require(consort)

dat <- prepare_consort(DF1, DF2, DF3)
out <- consort_plot(
  data = dat,
  orders = c(
    Trial.Nr   = "Population",
    Condition.1           = "Excluded",
    Trial.Nr     = "Allocated \nDeskriptive Analyse",
    Condition.2    =    "Fehlende Daten",
    Trial.Nr = "Regressionsanalyse",
    Condition.3    = "Not evaluable for the final analysis",
    Trial.Nr = "Final Analysis"
  ),
  side_box = c("Condition.1", "Condition.2", "Condition.3"),
  cex = 0.9
)



plot(out)

Transpose

This is an extension of tidyr::pivot_longer and tidyr::pivot_wider, with added functionality using formulas.

DF_sprk <- data.frame(
  Laborwert = gl(7,8,
    labels = c("Albumin","Amylase","Lipase","AST","ALT","Bilirubin","C.Peptid")),
  Treat = gl(2, 4, labels = c("Control", "Treat")),
  Time = gl(4, 1, labels = c("t0", "t1", "t2", "t4")),
  x = rnorm(7 * 8)
) |>
  stp25stat2::Summarise(x ~ Laborwert + Time + Treat,
            fun = mean,
            value = "x")
DF_sprk

DF<- Wide(DF_sprk[-4], Time ) |> Wide(Laborwert , t0, t1, t2, t4)
#' So einen DF bekomme ich meist
DF
#' Und das brauch ich zum Rechnen
DF |> Long(. ~ Treat)  |> tidyr::separate(variable, c("Time", "Laborwert"), sep="_")

Wide

Works internally with the function tidyr::pivot_wider, with added functionality using formulas.

dat <- data.frame(
  month = rep(1:3, 2),
  student = rep(c("Amy", "Bob"), each = 3),
  A = c(19, 27, 16, 28, 10, 29),
  B = c(6, 7, 8, 5, 6, 7)
)
dat
dat |> 
  Wide(student,  A)
dat |> 
  tidyr::pivot_wider(names_from = student, 
                           values_from = A)

#' dat |> tidyr::pivot_wider(names_from = student, A)
#' error Column `value` doesn't exist.

dat |> 
  tidyr::pivot_wider(names_from = student, 
                    values_from = A, 
                    values_fill = 0)
# mehere values
dat |> tidyr::pivot_wider(names_from = student, 
                           values_from = c(A, B))

dat |> Wide(month ~ student, A, B)

Formulas are evaluated in two ways

  1. Wide(A ~ student) on the left values_from on the right names_from
  2. Wide(month ~ student, A) now the output structure and is in the formula names_from will be handed over separately
dat |> Wide(A ~ student)

dat |> Wide(A + B ~ student)
dat |> Wide(A + B ~ student + month)
# dat |> Wide(A ~ student)
# dat |> Wide(A ~ student + month)

dat |> Wide(month ~ student, A)
dat |> Wide(month ~ student, A , B)

Long

Combining several variables into one long variable. Works internally with the function tidyr::pivot_longer.

df2 <-  
dat |> Wide(student,  A, B)

df2  |> Long(A_Amy, A_Bob, B_Amy ,B_Bob, by =  ~ month) |> 
  tidyr::separate(variable , c('First', 'Last'))

dat |>
  tidyr::gather(variable, value,-(month:student))

# relig_income |>
#   tidyr::pivot_longer(!religion, names_to = "income", values_to = "count")
# 
# dat |> Long(  A, B, by =  ~ month + student  )|>
#   tidyr::unite(temp, student, variable) |>
#   tidyr::spread(temp, value)

Combining several variables into several long variables. Works internally with the function base::rbind().

DF <- data.frame(
id = 1:16,
group = gl(2, 8, labels = c("Control", "Treat")),
age = rnorm(16),
a1 = rnorm(16),
b1 = rnorm(16),
c1 = rnorm(16),
a2 = rnorm(16),
b2 = rnorm(16),
c2 = rnorm(16),
a3 = rnorm(16),
b3 = rnorm(16),
c3 = rnorm(16)
)
DF <- tibble::as_tibble(DF)
head(DF)
#' das gleiche wie oben
#' DF |> Long( a1, a2, a3, by = ~ id + group + age)
#' Long( a1 + a2+ a3  ~ id + group + age, DF)

Long(
  DF,
  .list = list(
    t0 = c("a1", "b1", "c1"),
    t1 = c("a2", "b2", "c2"),
    t2 = c("a3", "b3", "c3")
  ),
  by =  ~ id + group + age,
  names = c("a", "b", "c"),
  key = "time"
)

Wenn die Variablen - Namen gut trennbar sind geht auch folgendes.

names(DF) <- c("id","group", "age","a.t1","b.t1","c.t1","a.t2","b.t2","c.t2","a.t3","b.t3","c.t3")

DF |> Long(.~ id + group + age)  |>
  tidyr::separate(variable, 
                  c( "Laborwert", "Time"), 
                  sep = "\\.") |>
  Wide(Laborwert)

Pivot-Transpose

dat <- data.frame(pos = c("A", "B" , "C"),
                  x = 1:3,
                  y = 3:5)
dat

transpose2(dat)

transpose2(
  dat,
  key = "Item",
  col.names = c("A-level", "C-level", "D-Level"),
  row.names = c("x-axis", "y-axis")
)

Rbind2()

Works internally with the function dplyr::bind_rows.

Rbind2( ...,
        .id = "which",
        .names = NULL,
        .use.label = TRUE,
        include.rownames = FALSE
df1 <- data.frame(CustomerId = c(1:6), Product = c(rep("Oven", 3), rep("Television", 3))) |> 
  Label( Product = "Produkt")
df2 <- data.frame(CustomerId = c(4:7), Product = c(rep("Television", 2), rep("Air conditioner", 2)))
df3 <- data.frame(
  CustomerId = c(4:7),
  Product = c(rep("Television", 2), rep("Air conditioner", 2)),
  State = c(rep("California", 2), rep("New Jersey", 2))
) |> 
  Label( Product = "Produkt-Kategorie", State = "Bundes-Staat")

dat1<- Rbind2(df1, df2, df3)
str(dat1)
dat2 <- dplyr::bind_rows(df1, df2, df3, .id ="which")
str(dat2)

Merge2

Operates internally with the function base::merge and base::Reduce.

n<-10
df1 <- 
  data.frame(
  origin = sample(c("A", "B", "C", "D", "E"), n, replace = T),
  N = sample(seq(9, 27, 0.5), n, replace = T),
  P = sample(seq(0.3, 4, 0.1), n, replace = T),
  C = sample(seq(400, 500, 1), n, replace = T))
df2 <-
  data.frame(
    origin = sample(c("A", "B", "C", "D", "E"), n, replace = T),
    foo1 = sample(c(T, F), n, replace = T),
    X = sample(seq(145600, 148300, 100), n, replace = T),
    Y = sample(seq(349800, 398600, 100), n, replace = T))
df3 <-
  data.frame(origin = sample(c("A", "B", "C", "D", "E"), n, replace = T))
df4 <-
  data.frame(origin = sample(c("A", "B", "C", "D", "E"), n, replace = T))

df1$id <- df2$id <- df3$id <- df4$id <- paste("P", sprintf("%02d", c(1:n)), sep = "")  

Merge2(df1, df2, df3, df4, by = "id")

cbind listenweise

#' cbind data.frame aber listenweise
m <- data.frame(
  Item = 1:3,
  a = (1:3),
  b = (1:3) * 2,
  c = (1:3) * 3
)
sd <- data.frame(
  Item = (1:3),
  a = (1:3) * 4,
  b = (1:3) * 5,
  c = (1:3) * 6
)
combine_data_frame(m, sd)
combine_data_frame(m, sd, by = NULL)

data.frame manipulation

This group includes functions for converting objects into data frames (fix_to_df, fix_to_tibble, list_to_df), adding elements to vectors or lists (add_to, add_row_df), adding and transforming missing data (na_approx, auto_trans), managing labels (Label, delete_label, get_label, set_label) and performing arithmetic operations (auc_trapezoid).

Imputation LOD

#'  x <-
#' c(.00049001,.0035648,.01,.0112,
#'   .023212548,.00541257,.004041257,.458,.500)
#' y <-
#'   c(43.01,49.156,678.00112458964,789.023212548,
#'     674.00049001,634.00541257,76.004041257,789.458,500
#'   )
#'   
#' data.frame(x = signif(x, 3), x.lod = imputation_LOD(x,  lod = .0035648))
#'
#' data.frame(y = signif(y, 3), y.lod = imputation_LOD(y,  lod = 49.156))

add_to

Add item to list.

add_to(list(a = 1:3, b = LETTERS[1:5]),
       c = 1, d = 2)

Add element to data frame.

df <-   data.frame(
Source = c("A", "B", "C", "F"),
x = 1:4,
y = 1:4,
stringsAsFactors = FALSE
)

 add_to(df, c("Erste Zeile" = 1, "Dritte" = 3))

add_to(df, "Erste Zeile")
add_to(df, c("Erste Zeile", "Zweite"))
add_to(df, c("Erste Zeile" = 1, "letzte" = 5))
add_to(df, list("G", 5), pos = -1)
add_to(df, data.frame(  Source = c("G", "H"),
                        x = 5:6
), pos = -1)

list_to_df

x <- list(
  M1 = data.frame(
    Source = c("Intercept", "A", "B" , "C", "Residual"),
    b = c(0, 1, 2, 3, 0),
    y = c((1:4) + 10, 0)
  ),

  M2 = data.frame(
    Source = c("Intercept", "A", "C", "B",  "D",  "E", "Residual"),
    x = c(0, 1, 3, 2, 4, 5, 0),
    y = c((1:6) + 21, 0)
  ),
  M3 = data.frame(
    Source = c("A", "B", "C", "D", "Residual"),
    x = c((1:4), 0),
    y = c((1:4) + 20, 0)
  ),

  M1 = data.frame(
    Source = c("A", "B",  "D", "Residual"),
    x = c(1, 2, 4, 0),
    y = c((1:3) + 30, 0)
  )

)

# Spezialfall
list_to_df(x, last = "Residual")


fix_to_df(x)
data(infert, package = "datasets")
infert$case  <- factor(infert$case ,1:0, c("case", "control") )

#infert$spontaneous <- factor(infert$spontaneous)
#infert$induced2    <- factor(infert$induced==0)
# tab_1<- xtabs(~  case, infert)
# tab_2x2<- xtabs(~ induced2 + case, infert)
tab_3x2<- xtabs(~ induced + case, infert)
# tab_3x3<- xtabs(~ induced + education, infert)
# tab_3x3x2<- xtabs(~ induced + education+case, infert)
# tab_3x2x3<- xtabs(~ induced +case+ education, infert)

tab_3x2
fix_to_df(tab_3x2)

auto_trans

Automatic transformation of numerical variables according to their distribution properties.

x <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 9, 20, 30)
x_trans<-auto_trans(x)
auto_trans(x_trans)
x <- 100 - x
auto_trans(x)
n <- 100
set.seed(n)
x1 = rnorm(100)+5
x2 = rlnorm(n, meanlog = 0, sdlog = 1)+1
x3 = rpois(n, lambda = 1)+1
x4 = rweibull(n, shape = .8, scale = 1)+10
x5 = runif(n, min = 0, max = 100)+1
x.neg <- rbeta(n, 5, 2)  # Negative Skew
x.pos <- rexp(n, 1) # Positive Skew
dat<-
  data.frame(
    morm= x1[order(x1)],
    beta=x.neg[order(x.neg)],
    exp = x.pos[order(x.pos)],
   # lnorm= x2[order(x2)],
    pois= x3[order(x3)],
   # weibull= x4[order(x4)],
  #  unif= x5[order(x5)],
    group= factor(c(rep(1, n*.5), rep(1:2, n*0.3/2), rep(2, n*0.2)))
  )
par(mfrow=c(2,2))

boxplot(beta~group, dat)
boxplot(auto_trans(beta, treshhold = .4)~group, dat)
boxplot(exp~group, dat)
boxplot(auto_trans(exp, treshhold = .4)~group, dat)
#boxplot(weibull~group, dat)
#boxplot(unif~group, dat)
for(i in 1:4){

  dat$x<- dat[[i]]
  fit0 <- lm(x~ group, dat)
  dat$x<- auto_trans(dat$x, treshhold = .4)
  fit1 <- lm(x~ group, dat)



  p0<-plot(effects::effect("group", fit0),
           main="orginal",
           ylab = names(dat)[i])
  p1<-plot(effects::effect("group", fit1),
           main="non-trans",
           ylab = "")
  p2<-plot(effects::effect("group", fit1,
                  transformation =
                    list(link = attr(dat$x, "link"),
                         inverse = attr(dat$x, "inverse"))),
           main=attr(dat$x, "name"),
           ylab = ""
           )
   gridExtra::grid.arrange(p0, p1, p2, ncol = 3)
}

separate & separate_rows

Multiple answers are often coded incorrectly. The following functions are used to prepare the data structure.

data_games <- tibble::tibble(
  country = c("Germany", "France", "Spain"),
  game = c("England - win", "Brazil - loss", "Portugal - tie")
)

#'   sep = "[^[:alnum:].]+"
data_games |> tidyr::separate(col = game, into = c("opponent", "result"))
data_opponents <- tibble::tibble(
  country = c("Germany", "France", "Spain"),
  opponent = c("England, Switzerland", "Brazil, Denmark", "Portugal, Argentina")
)

data_opponents
data_opponents |> tidyr::separate_rows(opponent)

separate multiple choice

Break down multiple answers.

 #require(stp25tools)
lbl <- c (
  '1 Heart Failure' = 1,
  '2 Rhythm Abnormality' = 2,
  '3 Valve Dysfunction' = 3,
  '4 Bleeding with OAC' = 4,
  '5 ACS' = 5,
  '6 Neurological Event' = 6,
  '7 Neoplastic Disease' = 7,
  '8 Others' = 8,
  '0 No Complications' = 0
)

x <- c(0,
       "1,3,6,8,4",
       2,
       "",
       "8,4",
       #  3.8,100,
       "4,6,8,3",
       "2,3,4,5")

#x <- gsub("\\.", ",", x)
rslt <-
  separate_multiple_choice(x ,
                           sep = ",",
                           as_logical = TRUE,
                           label = lbl)
stp25stat2::Tbll_desc(rslt)

Creation of dummy variables and reverse

 z <- gl(3, 2, 12, labels = c("apple", "salad", "orange"))
table(z)
levels(z) <- list("veg"   = "salad", "fruit" = c("apple", "orange"))
table(z)
z <- factor_to_dummy(z)
table(z)
z <- dummy_to_factor(z)
table(z)

string manipulation

  # NODE                     EXPLANATION
  # --------------------------------------------------------------------------------
  #   (?<=                     look behind to see if there is:
  #      ---------------------------------------------------------------------------
  #      [\s]                  any character of: whitespace (\n, \r, \t, \f, and " ")
  #    -----------------------------------------------------------------------------
  #   )                        end of look-behind
  # --------------------------------------------------------------------------------
  #   \s*                      whitespace (\n, \r, \t, \f, and " ") (0 or
  #                            more times (matching the most amount possible))
  # --------------------------------------------------------------------------------
  #   |                        OR
  # --------------------------------------------------------------------------------
  #   ^                        the beginning of the string
  # --------------------------------------------------------------------------------
  #   \s+                      whitespace (\n, \r, \t, \f, and " ") (1 or more times 
  #                            (matching the most amount possible))
  # --------------------------------------------------------------------------------
  #   $                        before an optional \n, and the end of the string

Wrap string: split_string(), wrap_string(), wrap_sentence(), wrap_string_at()

Clean up data frame and strings clean_names, cleansing_umlaute, cleansing_umlaute2

fl <- "Raw data/Mai_Markiert_LT_Cases_Jan2018-Dez2022.xlsx"
DF <- 
   readxl::read_excel(fl_new, 
                      range = "A1:z15", 
                      sheet = 1) |> 
   fix_names()
clean_names(tibble::tibble("Öli"=1:3, "p-k"=1:3, "95%-CI"=4:6) )
cleansing_umlaute( " Öäüö? hallo")

wrap_string (Umbrechen)

wrap_string(), wrap_factor(), wrap_data_label(), wrap_string_at(), split_string()

strg<- c("R is free   software and comes with ABSOLUTELY NO WARRANTY.",
         "You are welcome to redistribute it under certain conditions.")

wrap_string(strg, 5)
wrap_string(factor(strg))
wrap_factor(factor(strg))

#wrap_data_label(data)
wrap_string_at(strg, "and")
split_string(strg, "and")
animals <- c("cat", "dog", "mouse", "elephant")
stringr::str_flatten_comma(animals, last = " and ")

Sonstiges

Prepare internal data with formula prepare_data2



stp4/stp25tools documentation built on Feb. 27, 2025, 11:14 p.m.