The stp25tools package provides tools for data wrangling and statistical transformations.
knitr::opts_chunk$set(echo = TRUE, warnings=FALSE) require(stp25tools)
Pivot-Functions: Long(), Wide(), Dapply(), dapply2(), transpose2()
Merging data.frame
: Merge2, Rbind2, combine_data_frame
Convert object to data.frame
: fix_to_df, fix_to_tibble, list_to_df
Create vectors for accessing data frames
: Cs, XLS, paste_names
Wrap string: wrap_string
Add element to vector or lists: add_to, add_row_df
Internal function in stp25stat2 used to prepare data with formula: prepare_data2, print
Transform vectors: as_numeric, as_factor, factor2, as_cut, as_logical, rev.factor,as_rev, cat_bmi
Import data: get_data
Add and transform missing data: na_approx, auto_trans
Managing labels: Label, delet_label, get_label, set_label
Cleaning Data Frame and strings: clean_names, cleansing_umlaute, cleansing_umlaute2
Calculation operations: auc_trapezoid
sex <- factor2(c(1,0,0,0,1,1,0), male = 1, female = 0) sex
x <- c( rep(1, 21),rep(2, 120),rep(3, 28),rep(4, 4),rep(5, 56), rep(6, 2),rep(7, 92),rep(8, 42),rep(9, 74),rep(10, 20) ) x <- factor(x, 1:10, letters[1:10]) table(x) table(reorder2(x)) table(reorder2(x, threshold = 30))
x <- c("3,6", "> 15100", "+1", "-1", "$10 -> expensive", "cheap: $2.50", "free $0 !!", "699.91 ", " 228.4031.9 ", "", NA, "hallo1", "-77" ) cbind( stp = as_numeric(x, na.string = c("-77")), fct = as_numeric(factor(x)), rdr = readr::parse_number(x, na = c("-77")) )
Works internally with the following functions:
readxl::read_excel(file, sheet, skip, range)
read.table(file, header, sep, quote, dec, na.strings, skip, fill, comment.char)
haven::read_sav(file, encoding, user_na)
read.text2(file, dec)
dat <- get_data(" sex treatment control m 2 3 f 3 4 ", tabel_expand = TRUE, id.vars = 1) xtabs(~ sex + value, dat) dat <- get_data( "sex treatment neg pos f KG 3 3 f UG 4 5 m KG 5 4 m UG 4 2 ", tabel_expand = TRUE, id.vars = 1:2, value = "befund" ) ftable(xtabs(~ sex + treatment + befund, dat))
if( file.exists("R/dummy.csv")) { # get_data("R/dummy.csv", dec = ",", na.strings = "-", skip=1, label=1) # get_data("R/dummy.xlsx", na.strings = "-") get_data("R/dummy.xlsx") x <- get_data("R/dummy.sav") get_label(x)[1:4] }
Managing labels: Label
, delet_label
, get_label
, set_label
alternativ function labelled::set_variable_labels
Save data and reconstruct from codebook
save_data(dat, "demo.xlsx", include.codebook=TRUE) DF2 <- use_codebook(file = "demo.xlsx")
codebook(DF2)
Die Funktion look_for
macht zwar nicht dasselbe, kann aber nützlich sein, wenn eine Variable gesucht wird.
labelled::look_for(DF2, "KG")
Works internally with the following functions:
base::subset
+ label_data_framedplyr::filter
+ info what was filteredairquality2 <- airquality |> Label(Ozone = "Ozone in ppm", Temp = "Temperatur in °C") str(subset2(airquality2, Temp > 80, select = c(Ozone, Temp))) str(dplyr::filter(airquality2, Temp > 80)) dat <- filter2(airquality2, Temp > 80) str(dat) # simple_consort_plot(dat) attr(dat, "filter")
#require(stp25stat2) #require(stp25tools) data(DFdummy, package = "stp25data") DF1 <- DFdummy |> filter2(study.agreement) attr(DF1, "filter") DF2 <- DF1 |> filter2( st.p.sars.cov2 == "nein", !is.na(spike.igg.3.impfung), !is.na(MPN) ) DF3 <- DF2 |> filter2( study.agreement, sero.negativ.after.dose.2, !is.na(spike.igg.3.impfung), !is.na(spike.igg.4.impfung), spike.igg.3.impfung == "<7.1 BAU/ml" )
require(consort) dat <- prepare_consort(DF1, DF2, DF3) out <- consort_plot( data = dat, orders = c( Trial.Nr = "Population", Condition.1 = "Excluded", Trial.Nr = "Allocated \nDeskriptive Analyse", Condition.2 = "Fehlende Daten", Trial.Nr = "Regressionsanalyse", Condition.3 = "Not evaluable for the final analysis", Trial.Nr = "Final Analysis" ), side_box = c("Condition.1", "Condition.2", "Condition.3"), cex = 0.9 ) plot(out)
This is an extension of tidyr::pivot_longer
and tidyr::pivot_wider
, with added functionality using formulas.
DF_sprk <- data.frame( Laborwert = gl(7,8, labels = c("Albumin","Amylase","Lipase","AST","ALT","Bilirubin","C.Peptid")), Treat = gl(2, 4, labels = c("Control", "Treat")), Time = gl(4, 1, labels = c("t0", "t1", "t2", "t4")), x = rnorm(7 * 8) ) |> stp25stat2::Summarise(x ~ Laborwert + Time + Treat, fun = mean, value = "x")
DF_sprk DF<- Wide(DF_sprk[-4], Time ) |> Wide(Laborwert , t0, t1, t2, t4) #' So einen DF bekomme ich meist DF #' Und das brauch ich zum Rechnen DF |> Long(. ~ Treat) |> tidyr::separate(variable, c("Time", "Laborwert"), sep="_")
Works internally with the function tidyr::pivot_wider
, with added functionality using formulas.
dat <- data.frame( month = rep(1:3, 2), student = rep(c("Amy", "Bob"), each = 3), A = c(19, 27, 16, 28, 10, 29), B = c(6, 7, 8, 5, 6, 7) )
dat
dat |> Wide(student, A) dat |> tidyr::pivot_wider(names_from = student, values_from = A) #' dat |> tidyr::pivot_wider(names_from = student, A) #' error Column `value` doesn't exist. dat |> tidyr::pivot_wider(names_from = student, values_from = A, values_fill = 0)
# mehere values dat |> tidyr::pivot_wider(names_from = student, values_from = c(A, B)) dat |> Wide(month ~ student, A, B)
Formulas are evaluated in two ways
Wide(A ~ student)
on the left values_from
on the right names_from
Wide(month ~ student, A)
now the output structure and is in the formula names_from
will be handed over separatelydat |> Wide(A ~ student) dat |> Wide(A + B ~ student) dat |> Wide(A + B ~ student + month) # dat |> Wide(A ~ student) # dat |> Wide(A ~ student + month) dat |> Wide(month ~ student, A) dat |> Wide(month ~ student, A , B)
Combining several variables into one long variable. Works internally with the function
tidyr::pivot_longer
.
df2 <- dat |> Wide(student, A, B) df2 |> Long(A_Amy, A_Bob, B_Amy ,B_Bob, by = ~ month) |> tidyr::separate(variable , c('First', 'Last')) dat |> tidyr::gather(variable, value,-(month:student)) # relig_income |> # tidyr::pivot_longer(!religion, names_to = "income", values_to = "count") # # dat |> Long( A, B, by = ~ month + student )|> # tidyr::unite(temp, student, variable) |> # tidyr::spread(temp, value)
Combining several variables into several long variables. Works internally with the function base::rbind()
.
DF <- data.frame( id = 1:16, group = gl(2, 8, labels = c("Control", "Treat")), age = rnorm(16), a1 = rnorm(16), b1 = rnorm(16), c1 = rnorm(16), a2 = rnorm(16), b2 = rnorm(16), c2 = rnorm(16), a3 = rnorm(16), b3 = rnorm(16), c3 = rnorm(16) ) DF <- tibble::as_tibble(DF)
head(DF) #' das gleiche wie oben #' DF |> Long( a1, a2, a3, by = ~ id + group + age) #' Long( a1 + a2+ a3 ~ id + group + age, DF) Long( DF, .list = list( t0 = c("a1", "b1", "c1"), t1 = c("a2", "b2", "c2"), t2 = c("a3", "b3", "c3") ), by = ~ id + group + age, names = c("a", "b", "c"), key = "time" )
Wenn die Variablen - Namen gut trennbar sind geht auch folgendes.
names(DF) <- c("id","group", "age","a.t1","b.t1","c.t1","a.t2","b.t2","c.t2","a.t3","b.t3","c.t3") DF |> Long(.~ id + group + age) |> tidyr::separate(variable, c( "Laborwert", "Time"), sep = "\\.") |> Wide(Laborwert)
dat <- data.frame(pos = c("A", "B" , "C"), x = 1:3, y = 3:5)
dat transpose2(dat) transpose2( dat, key = "Item", col.names = c("A-level", "C-level", "D-Level"), row.names = c("x-axis", "y-axis") )
Works internally with the function dplyr::bind_rows
.
Rbind2( ..., .id = "which", .names = NULL, .use.label = TRUE, include.rownames = FALSE
df1 <- data.frame(CustomerId = c(1:6), Product = c(rep("Oven", 3), rep("Television", 3))) |> Label( Product = "Produkt") df2 <- data.frame(CustomerId = c(4:7), Product = c(rep("Television", 2), rep("Air conditioner", 2))) df3 <- data.frame( CustomerId = c(4:7), Product = c(rep("Television", 2), rep("Air conditioner", 2)), State = c(rep("California", 2), rep("New Jersey", 2)) ) |> Label( Product = "Produkt-Kategorie", State = "Bundes-Staat") dat1<- Rbind2(df1, df2, df3) str(dat1) dat2 <- dplyr::bind_rows(df1, df2, df3, .id ="which") str(dat2)
Operates internally with the function base::merge
and base::Reduce
.
n<-10 df1 <- data.frame( origin = sample(c("A", "B", "C", "D", "E"), n, replace = T), N = sample(seq(9, 27, 0.5), n, replace = T), P = sample(seq(0.3, 4, 0.1), n, replace = T), C = sample(seq(400, 500, 1), n, replace = T)) df2 <- data.frame( origin = sample(c("A", "B", "C", "D", "E"), n, replace = T), foo1 = sample(c(T, F), n, replace = T), X = sample(seq(145600, 148300, 100), n, replace = T), Y = sample(seq(349800, 398600, 100), n, replace = T)) df3 <- data.frame(origin = sample(c("A", "B", "C", "D", "E"), n, replace = T)) df4 <- data.frame(origin = sample(c("A", "B", "C", "D", "E"), n, replace = T)) df1$id <- df2$id <- df3$id <- df4$id <- paste("P", sprintf("%02d", c(1:n)), sep = "") Merge2(df1, df2, df3, df4, by = "id")
#' cbind data.frame aber listenweise m <- data.frame( Item = 1:3, a = (1:3), b = (1:3) * 2, c = (1:3) * 3 ) sd <- data.frame( Item = (1:3), a = (1:3) * 4, b = (1:3) * 5, c = (1:3) * 6 ) combine_data_frame(m, sd) combine_data_frame(m, sd, by = NULL)
This group includes functions for converting objects into data frames (fix_to_df
, fix_to_tibble
, list_to_df
), adding elements to vectors or lists (add_to
, add_row_df
), adding and transforming missing data (na_approx
, auto_trans
), managing labels (Label
, delete_label
, get_label
, set_label
) and performing arithmetic operations (auc_trapezoid
).
#' x <- #' c(.00049001,.0035648,.01,.0112, #' .023212548,.00541257,.004041257,.458,.500) #' y <- #' c(43.01,49.156,678.00112458964,789.023212548, #' 674.00049001,634.00541257,76.004041257,789.458,500 #' ) #' #' data.frame(x = signif(x, 3), x.lod = imputation_LOD(x, lod = .0035648)) #' #' data.frame(y = signif(y, 3), y.lod = imputation_LOD(y, lod = 49.156))
Add item to list.
add_to(list(a = 1:3, b = LETTERS[1:5]), c = 1, d = 2)
Add element to data frame.
df <- data.frame( Source = c("A", "B", "C", "F"), x = 1:4, y = 1:4, stringsAsFactors = FALSE ) add_to(df, c("Erste Zeile" = 1, "Dritte" = 3)) add_to(df, "Erste Zeile") add_to(df, c("Erste Zeile", "Zweite")) add_to(df, c("Erste Zeile" = 1, "letzte" = 5))
add_to(df, list("G", 5), pos = -1) add_to(df, data.frame( Source = c("G", "H"), x = 5:6 ), pos = -1)
x <- list( M1 = data.frame( Source = c("Intercept", "A", "B" , "C", "Residual"), b = c(0, 1, 2, 3, 0), y = c((1:4) + 10, 0) ), M2 = data.frame( Source = c("Intercept", "A", "C", "B", "D", "E", "Residual"), x = c(0, 1, 3, 2, 4, 5, 0), y = c((1:6) + 21, 0) ), M3 = data.frame( Source = c("A", "B", "C", "D", "Residual"), x = c((1:4), 0), y = c((1:4) + 20, 0) ), M1 = data.frame( Source = c("A", "B", "D", "Residual"), x = c(1, 2, 4, 0), y = c((1:3) + 30, 0) ) ) # Spezialfall list_to_df(x, last = "Residual") fix_to_df(x)
data(infert, package = "datasets") infert$case <- factor(infert$case ,1:0, c("case", "control") ) #infert$spontaneous <- factor(infert$spontaneous) #infert$induced2 <- factor(infert$induced==0) # tab_1<- xtabs(~ case, infert) # tab_2x2<- xtabs(~ induced2 + case, infert) tab_3x2<- xtabs(~ induced + case, infert) # tab_3x3<- xtabs(~ induced + education, infert) # tab_3x3x2<- xtabs(~ induced + education+case, infert) # tab_3x2x3<- xtabs(~ induced +case+ education, infert) tab_3x2 fix_to_df(tab_3x2)
Automatic transformation of numerical variables according to their distribution properties.
x <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 9, 20, 30) x_trans<-auto_trans(x) auto_trans(x_trans) x <- 100 - x auto_trans(x)
n <- 100 set.seed(n) x1 = rnorm(100)+5 x2 = rlnorm(n, meanlog = 0, sdlog = 1)+1 x3 = rpois(n, lambda = 1)+1 x4 = rweibull(n, shape = .8, scale = 1)+10 x5 = runif(n, min = 0, max = 100)+1 x.neg <- rbeta(n, 5, 2) # Negative Skew x.pos <- rexp(n, 1) # Positive Skew dat<- data.frame( morm= x1[order(x1)], beta=x.neg[order(x.neg)], exp = x.pos[order(x.pos)], # lnorm= x2[order(x2)], pois= x3[order(x3)], # weibull= x4[order(x4)], # unif= x5[order(x5)], group= factor(c(rep(1, n*.5), rep(1:2, n*0.3/2), rep(2, n*0.2))) )
par(mfrow=c(2,2)) boxplot(beta~group, dat) boxplot(auto_trans(beta, treshhold = .4)~group, dat) boxplot(exp~group, dat) boxplot(auto_trans(exp, treshhold = .4)~group, dat) #boxplot(weibull~group, dat) #boxplot(unif~group, dat)
for(i in 1:4){ dat$x<- dat[[i]] fit0 <- lm(x~ group, dat) dat$x<- auto_trans(dat$x, treshhold = .4) fit1 <- lm(x~ group, dat) p0<-plot(effects::effect("group", fit0), main="orginal", ylab = names(dat)[i]) p1<-plot(effects::effect("group", fit1), main="non-trans", ylab = "") p2<-plot(effects::effect("group", fit1, transformation = list(link = attr(dat$x, "link"), inverse = attr(dat$x, "inverse"))), main=attr(dat$x, "name"), ylab = "" ) gridExtra::grid.arrange(p0, p1, p2, ncol = 3) }
Multiple answers are often coded incorrectly. The following functions are used to prepare the data structure.
data_games <- tibble::tibble( country = c("Germany", "France", "Spain"), game = c("England - win", "Brazil - loss", "Portugal - tie") ) #' sep = "[^[:alnum:].]+" data_games |> tidyr::separate(col = game, into = c("opponent", "result"))
data_opponents <- tibble::tibble( country = c("Germany", "France", "Spain"), opponent = c("England, Switzerland", "Brazil, Denmark", "Portugal, Argentina") ) data_opponents data_opponents |> tidyr::separate_rows(opponent)
Break down multiple answers.
#require(stp25tools) lbl <- c ( '1 Heart Failure' = 1, '2 Rhythm Abnormality' = 2, '3 Valve Dysfunction' = 3, '4 Bleeding with OAC' = 4, '5 ACS' = 5, '6 Neurological Event' = 6, '7 Neoplastic Disease' = 7, '8 Others' = 8, '0 No Complications' = 0 ) x <- c(0, "1,3,6,8,4", 2, "", "8,4", # 3.8,100, "4,6,8,3", "2,3,4,5") #x <- gsub("\\.", ",", x) rslt <- separate_multiple_choice(x , sep = ",", as_logical = TRUE, label = lbl) stp25stat2::Tbll_desc(rslt)
z <- gl(3, 2, 12, labels = c("apple", "salad", "orange")) table(z) levels(z) <- list("veg" = "salad", "fruit" = c("apple", "orange")) table(z) z <- factor_to_dummy(z) table(z) z <- dummy_to_factor(z) table(z)
# NODE EXPLANATION # -------------------------------------------------------------------------------- # (?<= look behind to see if there is: # --------------------------------------------------------------------------- # [\s] any character of: whitespace (\n, \r, \t, \f, and " ") # ----------------------------------------------------------------------------- # ) end of look-behind # -------------------------------------------------------------------------------- # \s* whitespace (\n, \r, \t, \f, and " ") (0 or # more times (matching the most amount possible)) # -------------------------------------------------------------------------------- # | OR # -------------------------------------------------------------------------------- # ^ the beginning of the string # -------------------------------------------------------------------------------- # \s+ whitespace (\n, \r, \t, \f, and " ") (1 or more times # (matching the most amount possible)) # -------------------------------------------------------------------------------- # $ before an optional \n, and the end of the string
Wrap string: split_string(), wrap_string(), wrap_sentence(), wrap_string_at()
Clean up data frame and strings clean_names, cleansing_umlaute, cleansing_umlaute2
fl <- "Raw data/Mai_Markiert_LT_Cases_Jan2018-Dez2022.xlsx" DF <- readxl::read_excel(fl_new, range = "A1:z15", sheet = 1) |> fix_names()
clean_names(tibble::tibble("Öli"=1:3, "p-k"=1:3, "95%-CI"=4:6) )
cleansing_umlaute( " Öäüö? hallo")
wrap_string(), wrap_factor(), wrap_data_label(), wrap_string_at(), split_string()
strg<- c("R is free software and comes with ABSOLUTELY NO WARRANTY.", "You are welcome to redistribute it under certain conditions.") wrap_string(strg, 5) wrap_string(factor(strg)) wrap_factor(factor(strg)) #wrap_data_label(data) wrap_string_at(strg, "and") split_string(strg, "and")
animals <- c("cat", "dog", "mouse", "elephant") stringr::str_flatten_comma(animals, last = " and ")
Prepare internal data with formula prepare_data2
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.