knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "README-"
)
require(stpvers)

stp25aggregate

Generate Random Data Sets

Generates random data sets including: data.frames, lists, and vectors. Readme(https://www.rdocumentation.org/packages/wakefield/versions/0.3.3)] wakefield

Daten Laden

GetData(): Ladet verschiedene Dateiformate von csv bis sav. Tabellen im Text-Format koennen direkt gelesen werden. Zurueckgegeben wird ein data.frame.

xls Verwendet readxl::read_excel

Text direkt im R-File Es lassen sich direkt Daten als Text einlesen dabei kann mit Tabel_Expand = TRUE, id.vars = 1:2 gesteuert werden ob eine Kreuztabelle aufgedroeselt werden soll.

LimeSurvy Hier muss eine Liste uebergeben werden die die Filenamen beinhaltet also GetData(list("file.R","file.txt")). Das erste File ist das R-File mit den Labels das zweite die Daten. Weitere moegliche Parameter sind die Zeichencodierung

setwd("C:/Users/wpete/Dropbox/3_Forschung/R-Project/stp25aggregate")

x <- "fa\xE7ile"

Encoding(x)
Encoding(x) <- "latin1"
x
xx <- iconv(x, "latin1", "UTF-8")
Encoding(c(x, xx))
c(x, xx)

xx<- "Abc ÖÄÜ ÖÄÜ?"

Encoding(xx) <- "bytes"
xx # will be encoded in hex

#Encoding(xx)<-  "latin1"
xx
#iconv(xx, Encoding(xx), "UTF-8")


require(stpvers)
require(readr)

str(GetData("data/Data-default.csv", sep=","))
GetData("data/Data-ansi.csv", sep=",")
GetData("data/Data-utf8.csv", sep=",")


 read.csv("data/Data-default.csv" )
read.csv("data/Data-ansi.csv")
read.csv("data/Data-utf8.csv")

read.csv("data/Data-default.csv" , check.names = FALSE, stringsAsFactors =FALSE)
read.csv("data/Data-ansi.csv", check.names = FALSE)
 read.csv("data/Data-utf8.csv", check.names = FALSE)
guess_encoding("data/Data-default.csv")
read_csv("data/Data-default.csv")

read_csv("data/Data-ansi.csv")
read_csv("data/Data-utf8.csv")
dat<-GetData("
sex treatment control
m  2 3
f  3 4
",
Tabel_Expand = TRUE, id.vars = 1)
head(dat)
#xtabs(~sex +value, dat)
#  library("readxl")

DF <- GetData(
  data_file = 'Raw data/Tabelle Beugesehnen.xlsx',
  raw_data = "Raw data/Beugesehnen-2.Rdata",
  sheet = 1,
  skip = 1
)

Andere Packete für SPSS und XLSX

  library(excel.link)
  library(readxl)

  fil1 <-
    "Raw data/LTx-Datenbank aktuell_1-last_NMPvs non NMP.xlsx"
  dat1 <- readxl::read_excel(fil1, sheet = 1)

  fil2 <- "Raw data/Metra LTX Datenbank 03-04-20.xlsx"
  sheet1 <- "Minimal Data Set Metra Patients"
  sheet2 <- "non transplanted organs"

  dat2 <- excel.link::xl.read.file(
    fil2,
    password = "BC82",
    xl.sheet = "Minimal Data Set Metra Patients",
    top.left.cell = "A2"
  )

library("rio")
 Excel (.xls and .xlsx), using haven::read_excel.
 SPSS (.sav), using haven::read_sav 

Clean Names

clean_names(tibble::tibble("Öli"=1:3, "p-k"=1:3, "95%-CI"=4:6) )

clean_names(
  c(
    "  a", "a  ", "a %", "a",
    "$a", "$$$a", "GDP ($)",
    "GDP (us$)", "a (#)", "a & b",
    "#", "$", "a_cnt", "Aa&Bb",
    "camelCasePhrases", "AlphaBetaGamma",
    "Alpha       Beta", "Beta  !!! Gamma",
    "a + b", "a - b", "a * b", "Ösel"
  ), abbreviate=TRUE
)

Add_rows

df <-   data.frame(
  Source = c("A", "B", "C", "F"),
  x = 1:4,
  y = 1:4,
  stringsAsFactors = FALSE
)

stp25tools::add_row_df(df, c("Erste Zeile" = 1, "Dritte" = 3))
stp25tools::add_row_df(df, c("Erste Zeile" = 1, "letzte" = 5))
stp25tools::add_row_df(df, "Erste Zeile")
stp25tools::add_row_df(df, c("Erste Zeile", "Zweite"))

Data and Variable Transformation Functions

Long/Wide

Umformen von einem Breit-Format nach einem Lang-Format. Melt2 und melt2 sind Erweiterungen der reshape2::melt Funktion. Intern wird melt und dcast verwendet.

df <- tibble::tibble(
   month=c(1,2,3,1,2,3),
   student= gl(2,3, labels =c("Amy", "Bob")),
   A=c(9,7,6,8,6,9),
   B=c(6,7,8,5,6,7)
)
df

Vergleich der Ergebnisse spread vs Wide

    tidyr::spread(df[-4], student, A)

# new style with pivot_wider
df[-4] %>%
  tidyr::pivot_wider(names_from = student, 
                     values_from = A, 
                     names_prefix = "Student_")

    (df_w1 <- Wide(df[-4], student, A))

df %>% Wide( month ~ student, A)


df %>% Wide( student, c(A, B))

Vergleich der Ergebnisse gather vs Long

     tidyr::gather(df_w1,  key = "student", value = "A", Amy, Bob)
     Long(Amy + Bob ~ month, df_w1, key="student", value="A")
     Long(df_w1, id.vars=1, key = "student", value = "A") 

Long mit meheren Parametern

    (df_w2 <- Wide(df, student, c(A, B))) 
# # new style with pivot_longer
     Long(list(A=c("Amy_A", "Bob_A" ), B=c("Amy_B", "Bob_B")), df_w2,
                 by =  ~ month,
                 key = "student",
                 key.levels= c("Amy", "Bob"))

Summarise

mean3<- function(x)round(mean(x, na.rm=TRUE), 1)

    Summarise(A + B ~ student, df, fun=mean3, key = "group", value = "cbc")
    Summarise(A + B ~ student, df, fun=mean3,  margins = TRUE)
    Summarise(A + B ~ student,
                      df,
                      fun=mean3,
                      formula = variable ~ student,
                      margins = TRUE)

Alternative mosaic

require(mosaic)
mean(HELPrct$age)
mean( ~ age, data = HELPrct)

mean3 <- function(x)
  round(mean(x, na.rm = TRUE), 1)
#
# Summarise(daysanysub + dayslink ~ homeless , HELPrct, fun=mean3, key = "group", value = "cbc")
# Summarise(daysanysub + dayslink ~ homeless , HELPrct, fun=mean3,  margins = TRUE)
# Summarise(daysanysub + dayslink ~ homeless ,
#           HELPrct,
#           fun=mean3,
#           formula = variable ~ homeless,
#           margins = TRUE)


#namespace:mosaicCore
Summarise(daysanysub ~ homeless , HELPrct, fun = mean3)
df_stats(daysanysub ~ homeless , HELPrct, mean3, na.action = na.pass)
df_stats(daysanysub ~ homeless , HELPrct, mean3)

Summarise(
  daysanysub + dayslink ~ homeless + sex,
  HELPrct,
  fun = function(x)
    c(m = mean(x, na.rm = TRUE), 
      sd = sd(x, na.rm = TRUE))
)
df_stats(daysanysub + dayslink ~ homeless , HELPrct, mean3, na.action = na.pass)
df_stats(daysanysub + dayslink ~ homeless , HELPrct, mean3)
df_stats(daysanysub + dayslink ~ homeless + sex , HELPrct, mean, sd)

Aufdröseln vom Mehrfachantworten

Die Funktion separate_multiple_choice() transformiert einen String mit Trennzeichen zu einem Multi-Set mit 0 und 1. (Separate multiple choice) der param x ist entweder ein Character oder eine zahl

    dat <-  data.frame(
      Q1 = c(134, NA, 35, 134, 5, 24),
      Q2 = c(
        "Alm Dudler, Essig, Cola",  NA, 
        "Cola, Holer", "Alm Dudler, Cola, 
        Essig","Holer", "Bier, Essig"
      )
    )
    dat
    cbind(dat[-1],
          separate_multiple_choice(dat$Q2))
    dat <- cbind(dat[-2],
                 separate_multiple_choice(dat$Q1,
                                          label = c(
                                            "Alm Dudler", "Bier", "Cola", "Essig", "Holer"
                                          )))

dat

Mittelwerte Addieren

#' Mittelwerte Addieren
#'
#' 
#' @param n 
#' @param m 
#' @param sd 
#'
calc.mean <- function(n, m, sd) {
  j <- length(n) #-- Anzahl an Elementen
  if (length(n) != length(m) |
      length(n) != length(sd))
    stop("Ungleiche Anzahl an n,m oder sd!")

  #print(paste( m ,"-", trunc(m),"=", m - trunc(m)))
  #-- Interne Function berechnet zwei Werte
  calc.mean2 <- function(n, m, sd) {
    var <- sd ^ 2
    n.total <- sum(n)
    n.minus1 <- n - 1
    m.total <- sum(m * n) / n.total
    var.total <-
      1 / (n.total - 1) * (sum(n.minus1 * var)  +   prod(n) / n.total * diff(m) ^
                             2)
    #var3 =  1/(n1+n2-1) *( (n1-1)*var1  +  (n2-1)*var2 + n1*n2/(n1+n2)*((m1-m2)^2)  )
    data.frame(
      n = c(n, n.total),
      m = c(m, m.total),
      sd = c(sd, sqrt(var.total)),
      var = c(var, var.total)
    )
  }

  if (j == 2) {
    zw <- calc.mean2(n, m, sd)
  }
  else{
    n1 <- n[1:2]
    m1 <- m[1:2]
    sd1 <- sd[1:2]
    zw <- calc.mean2(n = n1, m = m1, sd = sd1)
    zw.j <- zw[3, ]
    for (i in 3:j) {
      zw.i <- calc.mean2(
        n  = c(zw.j$n, n[i])
        ,
        m  = c(zw.j$m, m[i])
        ,
        sd = c(zw.j$sd , sd[i])
      )
      zw[i, ] <- zw.i[2, ]
      zw <- rbind(zw, zw.i[3, ])
    }
    zw
  }
  cbind(value = c(1:j, "total") , zw)
}



x1 <- rnorm( 4, 3.0,  2.1)    
x2 <- rnorm(10, 4.0,  2.0) 
x3 <- rnorm(11, 3.5,  3.0)

calc.mean(
  n = c(length(x1), length(x2)),
  m = c(mean(x1), mean(x2)),
  sd = c(sd(x1), sd(x2))
)
mean(c(x1, x2))
sd(c(x1, x2))
var(c(x1, x2))


calc.mean(
  n = c(length(x1), length(x2), length(x3)),
  m = c(mean(x1), mean(x2), mean(x3)),
  sd = c(sd(x1), sd(x2) , sd(x3))
)
mean(c(x1, x2, x3))
sd(c(x1, x2, x3))
var(c(x1, x2, x3))



# Total Asf
calc.mean(
  n = c(22, 26, 22, 26),
  m = c(407, 613, 434, 565) / 100,
  sd = c(258, 236, 345, 253) / 100
)

Funktionen aus anderen Packeten

dplyr::case_when

library(dplyr)

set.seed(0815)
n<-15
df<- data.frame(x=rnorm(n), y=rnorm(n), z=rnorm(n))
df <- df %>%
  mutate(group = case_when(x<0 & y<0 & z<0 ~ "A",  
                           x<0 | y<0 | z<0 ~ "B",  
                           TRUE ~ "C"  
                           ))

df  


stp4/stp25aggregate documentation built on Sept. 17, 2021, 5:34 a.m.