select_variables_sep2018<-function(joined_df) {
db<-nodupdb<-joined_df[!duplicated(joined_df$q_0),]
#1. Bulbar (pyt. 16), upper limb, lower limb onset
# (pyt. 16: limb onset oraz pyt. 48a: cervical=upper limb, lumbar=lower limb);
# Jeśli pacjent w pytaniu 36a ma co innego niż 1, 2, 3 albo 4 – brak danych.
# Jeśli w 16b jest tak, to przyjmujemy Bulbar i koniec.
# Jeśli w 16a jest tak, to patrzymy na pytanie 49a.
# Jeśli 49a = Cervical -> Upper Limb.
# Jeśli 49a = Lumbo-sacral-> Lower Limb. Brak danych w innym razie.
flags2=(db$q_16b=='Yes' & !is.na(db$q_36a)) #bulbat
flags1=(!flags2 & db$q_16a=='Yes' & db$q_49a=='Cervical' & !is.na(db$q_36a)) #lower limb
flags3=(!flags2 & !flags1 & db$q_16a=='Yes' & db$q_49a=='Lumbo-sacral' & !is.na(db$q_36a)) #upper limb
ans=flags1*1 + flags2*2 + flags3*3
db[,iv56:=factor(ifelse(ans==0, NA, ans), labels=c("Bulbar", 'Upper limb', 'Lower limb'), levels=1:3)]
# setattr(db$iv56, 'label', 'Fenotype classification 1')
setattr(db$iv56, 'label', 'Fenotype classification Bulbar/Limb')
# 2. UMN or LMN involvement on onset (pyt. 17).
# Jeśli pacjent w pytaniu 36a ma co innego niż 1, 2, 3 albo 4 – brak danych.
# Dla każdej grupy pyt 17 po jednej grupie fenotypu: UMN, LMN, UMN+LMN
flags1=(!is.na(db$q_36a) & db$q_17=="UMN")
flags2=(!is.na(db$q_36a) & db$q_17=="LMN")
flags3=(!is.na(db$q_36a) & db$q_17=="UMN+LMN")
ans=flags1*1 + flags2*2 + flags3*3
db[,iv57:=factor(ifelse(ans==0, NA, ans), labels=c("UMN", 'LMN', 'UMN+LMN'), levels=1:3)]
#setattr(db$iv57, 'label', 'Fenotype classification 2')
# db[,iv57:=factor( (db$iv57=='UMN')*1+(db$iv57=='LMN')*2, labels=c('UMN', 'LMN'), levels=c(1, 2))]
setattr(db$iv57, 'label', 'Fenotype classification UMN/LMN')
db[,dv1:=db$q_51_1.score]
setattr(db$dv1, 'label', "ALSFRS score")
db[,dv2:=db$q_51_1.1 + db$q_51_1.2 + db$q_51_1.3]
setattr(db$dv2, 'label', "ALSFRS Speech, Salivation and Swallowing")
db[,dv3:=db$q_51_1.4 + db$q_51_1.5 + db$q_51_1.6]
setattr(db$dv3, 'label', "ALSFRS Handwriting, Cutting Food and Hygiene")
db[,dv4:=db$q_51_1.7 + db$q_51_1.8 + db$q_51_1.9]
setattr(db$dv4, 'label', "ALSFRS Turning in Bed, Walking and Climbing Stairs")
db[,dv5:=db$q_51_1.10 + db$q_51_1.11 + db$q_51_1.12]
setattr(db$dv5, 'label', "ALSFRS Dyspnea, Orthopnea and Respiratory insufficiency")
db[,dv6:=db$q_51_1.rate]
setattr(db$dv6, 'label', "ALSFRS rate")
db[,iv7:=(as.numeric(db$q_11 - db$q_5)/365.25)]
setattr(db$iv7, 'label', "Age at first symptoms")
src_vars=c('q_73', 'q_74', 'q_75', 'q_76', 'q_77', 'q_78', 'q_79', 'q_80', 'q_81', 'q_82', 'q_83', 'q_84')
dst_vars=paste0('iv', seq(8, 7 + length(src_vars)))
setattr(db$q_73, 'label', "Blood hypertension")
setattr(db$q_74, 'label', "Diabetes – type I / II")
setattr(db$q_75, 'label', "Hypercholesterolemia")
setattr(db$q_76, 'label', "Hypertriglyceridemia")
setattr(db$q_77, 'label', "Hyperthyroidism")
setattr(db$q_78, 'label', "Hypothyroidism")
setattr(db$q_79, 'label', "Autoimmune rheumatologic disorder")
setattr(db$q_80, 'label', "Autoimmune intestinal disorder")
setattr(db$q_81, 'label', "Stroke")
setattr(db$q_82, 'label', "Heart – ischemia")
setattr(db$q_83, 'label', "Heart – arrhytmia")
setattr(db$q_84, 'label', "Heart – insufficiency")
for(i in seq_along(src_vars)) {
src_var=src_vars[[i]]
dst_var=dst_vars[[i]]
db[,(dst_var):=db[[src_var]]]
}
flags = dplyr::coalesce(db$q_85a_1!='No', FALSE) |
dplyr::coalesce(db$q_85a_2!='No', FALSE) |
dplyr::coalesce(db$q_85a_3!='No', FALSE)
db[,iv20:=factor(flags, labels=c("No cancer", "Any cancer"), levels=c(FALSE,TRUE))]
setattr(db$iv20, 'label', "Cancer")
setattr(db$q_137, 'label', 'Emotional stress in the last 5 years')
db[,iv21:=db$q_137]
setattr(db$q_138, 'label', 'Emotional stress >5 years')
db[,iv22:=db$q_138]
db[,iv23:=factor(db$q_138=='Yes' | db$q_137 == 'Yes', labels=c('No', 'Yes'), levels=c(FALSE, TRUE))]
setattr(db$iv23, 'label', 'Emotional stress')
setattr(db$q_20, 'label', 'Weight loss (>10% initial weight)')
db[,iv24:=db$q_20]
setattr(db$q_21, 'label', 'Emotional lability at onset')
db[,iv25:=db$q_21]
setattr(db$q_22, 'label', 'Which cognitive symptoms at onset')
db[,iv26:=db$q_22]
setattr(db$q_23, 'label', 'Handedness')
db[,iv27:=db$q_23]
setattr(db$q_37, 'label', 'Emotional lability')
db[,iv28:=db$q_37]
setattr(db$q_38.1a, 'label', 'Resting tremor in Upper Limbs')
db[,iv29:=db$q_38.1a]
setattr(db$q_38.1b, 'label', 'Resting tremor in Lower Limbs')
db[,iv30:=db$q_38.1b]
setattr(db$q_23, 'label', 'Emotional lability')
db[,iv31:=factor(db$q_38.1a=='Yes' | db$q_38.1b == 'Yes', labels=c('No', 'Yes'), levels=c(FALSE, TRUE))]
setattr(db$iv31, 'label', 'Resting tremor')
setattr(db$q_38.2a, 'label', 'Bradykinesia in Upper Limbs')
db[,iv32:=db$q_38.2a]
setattr(db$q_38.2b, 'label', 'Bradykinesia in Upper Limbs')
db[,iv33:=db$q_38.2b]
db[,iv34:=factor(db$q_38.2a=='Yes' | db$q_38.2b == 'Yes', labels=c('No', 'Yes'), levels=c(FALSE, TRUE))]
setattr(db$iv34, 'label', 'Bradykinesia')
setattr(db$q_39, 'label', 'Sensory changes')
db[,iv35:=db$q_39]
setattr(db$q_40, 'label', 'Apraxia (facial)')
db[,iv36:=db$q_40]
setattr(db$q_41, 'label', 'Apraxia (upper limbs)')
db[,iv37:=db$q_41]
db[,iv38:=factor(db$q_42, levels = c(1,2), labels=c('Positive', 'Negative'))]
setattr(db$iv38, 'label', 'Postural instability - NURT')
setattr(db$q_43, 'label', 'Broad-based gait')
db[,iv39:=db$q_43]
setattr(db$q_44, 'label', 'Cognition - Qualitative evaluation')
db[,iv40:=db$q_44]
setattr(db$q_89a, 'label', 'Psychiatric medication')
db[,iv41:=db$q_89a]
setattr(db$q_90a, 'label', 'Supplements')
db[,iv42:=db$q_90a]
setattr(db$q_91a, 'label', 'Riluzole')
db[,iv43:=db$q_91a]
setattr(db$q_92a, 'label', 'Antiepileptc drug')
db[,iv44:=db$q_92a]
setattr(db$q_93a, 'label', 'Statins')
db[,iv45:=db$q_93a]
setattr(db$q_94a, 'label', 'NSAID')
db[,iv46:=db$q_94a]
setattr(db$q_95a, 'label', 'Steroids')
db[,iv47:=db$q_95a]
setattr(db$q_96a, 'label', 'Immunosupressors')
db[,iv48:=db$q_96a]
setattr(db$q_100a_1, 'label', 'Familial history of ALS')
db[,iv49:=db$q_100a_1]
setattr(db$iv49, 'f.o.b', 1)
setattr(db$q_101a_1, 'label', 'Familial history of FTD')
db[,iv50:=db$q_101a_1]
setattr(db$iv50, 'f.o.b', 1)
setattr(db$q_102a_1, 'label', 'Familial history of Alzheimer disease')
db[,iv51:=db$q_102a_1]
setattr(db$iv51, 'f.o.b', 1)
setattr(db$q_103a_1, 'label', 'Familial history of Parkinson disease')
db[,iv52:=db$q_103a_1]
setattr(db$iv52, 'f.o.b', 1)
setattr(db$q_104a_1, 'label', 'Family history of multiple sclerosis')
db[,iv53:=db$q_104a_1]
setattr(db$iv53, 'f.o.b', 1)
db[,iv54:=factor(db$q_115=='Yes' |
db$q_116 == 'Yes' |
db$q_117a == 'Yes' |
db$q_117b == 'Yes' |
db$q_117c == 'Yes' |
db$q_118a == 'Yes' |
db$q_118b == 'Yes' |
db$q_118c == 'Yes' , labels=c('No', 'Yes'), levels=c(FALSE, TRUE))]
setattr(db$iv54, 'label', 'Trauma')
db[,iv55:=factor(db$q_119a == 'Yes' |
db$q_119b == 'Yes' |
db$q_119c == 'Yes' |
db$q_120a == 'Yes' |
db$q_120b == 'Yes' |
db$q_120c == 'Yes' |
db$q_121a == 'Yes' |
db$q_121b == 'Yes' |
db$q_122a == 'Yes' |
db$q_122b == 'Yes' |
db$q_123 == 'Yes' |
db$q_124 == 'Yes' |
db$q_125 == 'Yes' |
db$q_126 == 'Yes' |
db$q_127 == 'Yes' |
db$q_128 == 'Yes' |
db$q_129 == 'Yes' |
db$q_130 == 'Yes', labels=c('No', 'Yes'), levels=c(FALSE, TRUE))]
setattr(db$iv55, 'label', 'Surgery')
# db[,dv1:=factor( (db$depvar1=='Bulbar'), labels=c('Not bulbar', 'Bulbar'), levels=c(FALSE, TRUE))]
# setattr(db$dv1, 'label', 'Is fenotype bulbar')
# db[,dv2:=factor( (db$depvar1=='Upper limb'), labels=c('Not upper limb', 'Upper limb'), levels=c(FALSE, TRUE))]
# setattr(db$dv2, 'label', 'Is fenotype upper limb')
# db[,dv3:=factor( (db$depvar1=='Lower limb'), labels=c('Not lower limb', 'Lower limb'), levels=c(FALSE, TRUE))]
# setattr(db$dv3, 'label', 'Is fenotype lower limb')
args<-list(
zn=names(db)[stringr::str_detect(names(db), '^iv.*')],
zz=names(db)[stringr::str_detect(names(db), '^dv.*')])
return(list(db=as_tibble(db)[,c(args$zz, args$zn)], iv_names=args$zn, dv_names=args$zz))
}
regression_recode_ordinal<-function(varnames, depvar, mydt) {
cat('Binning the variables...\n')
woedt <- mydt %>% select_(.dots = varnames)
mybins <- binnr::bin(data = woedt,
y = depvar,
mono = 1, max.bin=20)
woes<-mybins$predict(type = 'woe')
ans<-as.data.table(woes)
danesurowe::copy_dt_attributes(mydt, ans)
return(list(df=ans, woe_info=mybins$variables))
}
padded_count<-function(n, ntotal) {
ndigits<-floor(log10(ntotal))+1
return(stringr::str_pad(n, ndigits, pad='0'))
}
regression_recode_factor_1<-function(varname, dt, keep_nominal=character(0)) {
# tmp_df<-data.frame(x=factor(c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,NA,NA,NA,NA),
# levels=c(1,2,3), labels=c('A', 'B', 'C')))
#as.integer(dt[[varname]])
tmp_fac<-addNA(as.integer(dt[[varname]]), ifany=TRUE)
mylabel<-attr(dt[[varname]], 'label')
if(is.null(mylabel)) {
browser()
}
if(length(unique(tmp_fac))>1) {
tmp_df<-setNames(tibble::tibble(tmp_fac), varname)
tmp_df<-predict(caret::dummyVars(~ ., tmp_df, fullRank=FALSE, levelsOnly = FALSE, drop2nd=TRUE), tmp_df)
to_remove<-caret::nearZeroVar(as.data.table(tmp_df), saveMetrics = FALSE)
if(varname %in% keep_nominal) {
# browser()
for(to_rm in to_remove) {
tmp_fac[as.integer(tmp_fac)==to_rm]<-NA
}
class(tmp_fac)<-class(dt[[varname]])
tmp_fac<-addNA(tmp_fac, ifany = TRUE)
if(any(is.na(levels(tmp_fac)))) {
setattr(tmp_fac, 'levels', c(attr(dt[[varname]], 'levels'), as.character(NA)))
} else {
setattr(tmp_fac, 'levels', attr(dt[[varname]], 'levels'))
}
data.table::setattr(tmp_fac, 'label', mylabel)
setNames(data.table(x=tmp_fac), varname)
#We still need to replace small groups with NA.
} else {
to_keep<-setdiff(seq(ncol(tmp_df)), to_remove)
tmp_df<-data.table::as.data.table(tmp_df[,to_keep, drop=FALSE])
mylabels<-levels(dt[[varname]])
for(i in seq_along(colnames(tmp_df))) {
mycolname<-colnames(tmp_df)[[i]]
if(stringr::str_detect(mycolname, '\\.NA$')) {
data.table::setattr(tmp_df[[i]], 'label', paste0(mylabel, " - NA"))
} else {
a<-as.integer(stringr::str_match(mycolname, '\\.([0-9]+)$')[,2])
data.table::setattr(tmp_df[[i]], 'label', paste0(mylabel, " - ", mylabels[a]))
}
}
tmp_df
}
} else {
data.table()
}
}
# regression_recode_factor_1<-function(varname, dt, count_threshold=1) {
# var<-dt[[varname]]
# mylabel<-danesurowe::GetVarLabel(dt, varname)
# levels<-as_tibble(table(var)) %>% filter(n>count_threshold)
#
# if(nrow(levels)>2) {
# base_level<-arrange(levels, -n)$var[[1]]
# levels<-levels[2:nrow(levels),]
# newname<-paste0(varname,'_', padded_count(1, nrow(levels)))
# newvar<-as.integer(var==levels$var[[1]])
# newvar[is.na(newvar)]<-0
# ans<-data.table(var=newvar)
# data.table::setnames(ans, 'var', newname)
# setattr(ans[[newname]], 'label', paste0(mylabel,' ',levels$var[[1]] ))
# if(nrow(levels)>1) {
# for(i in seq(2, nrow(levels))) {
# newname<-paste0(varname,'_', padded_count(i, nrow(levels)))
# newvar<-as.integer(var==levels$var[[i]])
# newvar[is.na(newvar)]<-0
# ans[,(newname):=newvar]
# setattr(ans[[newname]], 'label', paste0(mylabel,' ',levels$var[[i]] ))
# }
# }
# } else if (nrow(levels)==2) {
# ans<-data.table(.tmp=as.integer(var))
# data.table::setnames(ans, '.tmp', varname)
# } else {
# ans<-data.table(.tmp=rep(NA, nrow(dt)))
# }
# if(sum(is.na(var))>count_threshold) {
# newname<-paste0(varname,'_NA')
# ans[,(newname):=as.integer(is.na(var))]
# setattr(ans[[newname]], 'label', paste0('Braki danych w ',mylabel))
# }
# if('.tmp' %in% names(ans)) {
# ans[,.tmp:=NULL]
# }
# return(ans)
# }
regression_recode_binary_1<-function(varname, dt, count_threshold=1) {
browser()
# cat(varname)
# if(varname=='iv53') {
# browser()
# }
var<-as.numeric(dt[[varname]])
levels<-unique(var)
levels<-levels[!is.na(levels)]
mylabel<-danesurowe::GetVarLabel(dt, varname)
if(length(levels)>1) {
newvar<-as.integer(var==levels[[2]])
ans<-data.table(var=newvar)
danesurowe::copy_var_attributes(dt[[varname]], var_dest_name = 'var', dt_dest = ans)
data.table::setnames(ans, 'var', varname)
} else {
ans<-data.table(.tmp=rep(NA, nrow(dt)))
}
if(sum(is.na(var))>count_threshold) {
newname<-paste0(varname,'_NA')
ans[,(newname):=as.integer(is.na(var))]
setattr(ans[[newname]], 'label', paste0('Braki danych w ',mylabel))
}
if('.tmp' %in% names(ans)) {
ans[,.tmp:=NULL]
}
return(ans)
}
prepare_variables_hybrid<-function(dt, iv_names, dv_name, keep_nominal=character(0)) {
dv<-dt[[dv_name]]
nas<-is.na(dv)
mydt<-as.data.table(dt[!nas,iv_names])
mydt<-danesurowe::copy_dt_attributes(dt, mydt)
dv<-dv[!nas]
data.table::setattr(dv, 'label', Hmisc::label(dt[[dv_name]]))
fobs<-purrr::map_dbl(iv_names, ~danesurowe::GetFOB(mydt[[.]], flag_recalculate_uniques = TRUE))
ordinals<-iv_names[fobs==2]
ans<-list()
if(length(ordinals)>0) {
browser()
bins<-regression_recode_ordinal(ordinals, depvar, mydt)
ans<-c(ans, bins$df)
}
nominals<-iv_names[fobs==1]
if(length(nominals)>0) {
a<-flatten(purrr::map(nominals, regression_recode_factor_1, dt=mydt, keep_nominal=keep_nominal))
if ('V1' %in% names(a)) {
browser()
}
ans<-c(ans,a)
}
binaries<-iv_names[fobs==3]
if(length(binaries)>0) {
a<-flatten(purrr::map(binaries, regression_recode_factor_1, dt=mydt, keep_nominal=keep_nominal))
# a<-flatten(purrr::map(binaries, regression_recode_binary_1, dt=mydt))
ans<-c(ans,a)
}
numerics<-iv_names[fobs==0]
if(length(numerics)>0) {
a<-mydt %>% select_(.dots=numerics) %>% as_tibble() %>% mutate_all(funs(as.numeric))
# a<-flatten(purrr::map(numerics, ~scale(mydt[[.]])))
# browser()
tmp_df<-c(ans,a)
missing_pattern<-mice::md.pattern(a)
ans2<-as.data.table(tmp_df)
imputes<-NULL
if(nrow(missing_pattern)>2) {
if(nrow(mydt)>40000) {
m<-missing_pattern[nrow(missing_pattern),1:ncol(missing_pattern)-1]
vars_miss<-names(m[m>0])
ms<-list()
for(colname in vars_miss) {
var<-ans2[[colname]]
m<-mean(var, na.rm = TRUE)
imputes<-c(imputes, list(sumNA=sum(is.na(var))))
var[is.na(var)]<-m
ms<-c(ms, list(m))
ans2[,(colname):=as.numeric(var)]
}
imputes<-data.frame(varname=vars_miss, missing_count=unlist(imputes), mean=unlist(ms))
ans2<-cbind(ans, imputes)
} else {
library(mice)
imp<-mice::mice(ans2)
ans2<-complete(imp)
ans2<-danesurowe::copy_dt_attributes(data.frame(ans), data.frame(ans2))
}
}
for ( mycolname in numerics) {
mylabel<-attr(mydt[[mycolname]], 'label')
data.table::setattr(ans2[[mycolname]], 'label', mylabel)
}
} else{
ans2<-mydt
}
mydt<-as.data.table(ans2)
return(list(db=mydt, dv=dv))
}
make_ads<-function(dt, iv_names, dv_name, keep_nominal=character(0)) {
rmvar<-caret::nearZeroVar(dt)
groupvar<-dt$iv56
nearZeroVars<-colnames(dt)[ seq_along(colnames(dt)) %in% rmvar]
vars_to_keep<-colnames(dt)[ ! seq_along(colnames(dt)) %in% rmvar]
ads<-dt[c(vars_to_keep, dv_name)]
iv_names=intersect(vars_to_keep, iv_names)
ans<-prepare_variables_hybrid(ads, iv_names, dv_name, keep_nominal=keep_nominal)
depvar<-ans$dv
nas<-is.na(depvar)
# if(length(keep)>0) {
# browser()
# to_keep<-setdiff(colnames(ans$db), keep)
# ans$db<-cbind(ans$db[to_keep], setNames(data.frame(x=dt[keep]), keep))
# }
dvlab<-Hmisc::label(depvar)
depvar<-depvar[!nas]
data.table::setattr(depvar, 'label', dvlab)
cbind(data.frame(dv=depvar), ans$db)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.