# upload raw data; clinical features located on columns 7 - 2271 rawdata <- read.csv("../data/RedCap.csv") rawdata <- rawdata[c(2, 12:2271)] # keep patient ID and clinical features # untargeted CRIC metadata untarmet <- read.csv("../data/CRIC_clinical_creatinine_normalized_samplemetadata.csv", header=FALSE, stringsAsFactors = FALSE)
# functions for later use is.continuous <- function(df, f){ return(length(table(df[,f])) > 10) } factorize <- function(df){ for (factor in 1:ncol(df)) { if (length(table(df[,factor])) == 2) { df[,factor] <- as.factor(df[,factor]) } } return(df) } cat_to_binary <- function(df) { df$sex_v3y0 <- df$sex_v3y0-1 # make sex binary n_factors <- length(df) colnames <- colnames(df) for (factor in 2:ncol(df)) { # for each factor if ((length(table(df[,factor])) < 10) && (length(table(df[,factor])) > 2)) { # if factor is categorical and not already binary new_names <- levels(as.data.frame(table(df[,factor]))$Var1) # save category names as new_names for (name in 1:length(new_names)) { # for each of these category names df[,n_factors + 1] <- as.integer(grepl(name,df[,factor])) # add a column to df with binary form of the cat colnames <- c(colnames, paste0(colnames[factor], "_", name)) # update df column names n_factors <- n_factors + 1 # update number of factors in df } df <- df[-factor] # remove original categorical factor colnames <- colnames[-factor] # update colnames n_factors <- n_factors - 1 # update number of factors in df } } colnames(df) <- colnames return(df) }
# extract patient IDs from untargeted data vid <- substr(untarmet$V2,1,7) pid <- unique(vid) # remove patients in rawdata who are not represented in untarmet, and vice versa if (any(!pid %in% rawdata$patientid)) {print("there are patients in the metadata that are not in the raw data")} no_match <- row.names(rawdata[!rawdata$patientid %in% pid,]) rawdata <- rawdata[-as.integer(no_match),] if (any(!rawdata$patientid %in% pid)) {print("there are patients in the raw data that are not in the metadata")} #test
# amputation rawdata <- rawdata[rawdata$amputation_v3y0 == 0,] # genetic consent rawdata <- rawdata[rawdata$genetic_consent_bl_v3y0 == 1,] # SERMS rawdata <- rawdata[rawdata$serms_v3y0 == 0,]
EGFR rate informs inclusion criteria; only include patients who have 4+ EGFR readings
# Calculate EGFR rate and variance for each patient entry <- c(1:6) #get number of unique visits on file visits<- unique(substring(names(rawdata)[grep(paste("^egfr_ckd_epi", sep = ""), names(rawdata))],14)) #init average yearly egfr exponential rate and volatility Qbar <- rep(0,dim(rawdata)[1]) Qvar <- Qbar #init valid visit counter count <- Qbar #get baseline egfr Q0 <- rawdata$egfr_ckd_epi_v3y0 #get baseline time t0 <- rawdata$days_thisvisit_v3y0 #loop over visits to calculate egfr return for every year for (visit in visits[2:length(visits)]){ Qt <- NA t <- NA #get current year egfr Qt <- rawdata[,paste("egfr_ckd_epi_",visit,sep = "")] #get time this visit t <- rawdata[,paste("days_thisvisit_",visit,sep = "")] #calculate egfr return in units of per year #Qdot <- 365*log(Qt/Q0)/(t-t0) #calculate egfr return in units of per visit Qdot <- log(Qt/Q0) #print(visit) #print(Qdot) #get the valid visit indecies vidx <- which(!is.na(Qdot)) count[vidx] <- count[vidx] + 1 #accumulate first and second moments Qbar[vidx] <- Qbar[vidx]+Qdot[vidx] Qvar[vidx] <- Qvar[vidx]+Qdot[vidx]*Qdot[vidx] #update previous time and egfr Q0 <- Qt t0 <- t } #calculate mean and variance Qbar <- Qbar/count Qvar <- Qvar/count - Qbar*Qbar #update rawdata rawdata$Qbar <- Qbar rawdata$Qvar <- Qvar QQbar<- quantile(rawdata$Qbar,probs = seq(0,1,.25), na.rm = TRUE) QQvar<- quantile(rawdata$Qvar,probs = seq(0,1,.25), na.rm = TRUE)
#init long form data for egfr with baseline visit df <- rawdata[1:2] df$rgroup <- findInterval(rawdata$Qbar, QQbar) df$vgroup <- findInterval(rawdata$Qvar, QQvar) df$mu_egfr_rate <- rawdata$Qbar df$sigma_egfr_rate <-rawdata$Qvar visit <- visits[1] df$egfr <- rawdata[,paste("egfr_ckd_epi_",visit,sep = "")] df$year <- 0 #loop over visits for (i in c(2:length(visits))){ df2 <- rawdata[1:2] df2$rgroup <- findInterval(rawdata$Qbar, QQbar) df2$vgroup <- findInterval(rawdata$Qvar, QQvar) df2$mu_egfr_rate <- rawdata$Qbar df2$sigma_egfr_rate <-rawdata$Qvar df2$egfr <- rawdata[,paste("egfr_ckd_epi_",visits[i],sep = "")] df2$year <- i df <- rbind(df,df2) } # delete df2 rm(df2) # add egfr information to raw data egfr_v0 <- df[df$year == 0,] rawdata$mu_egfr <- egfr_v0$mu_egfr_rate rawdata$sigma_egfr <- egfr_v0$sigma_egfr_rate rawdata$r_group <- egfr_v0$rgroup rawdata$v_group <- egfr_v0$vgroup # drop patients with less than 3 intervals egfr_readings <- count rawdata$egfr_readings <- egfr_readings few_egfr_ids <- rawdata[rawdata$egfr_readings < 3,]$patientid
# save the initial year df_init <- rawdata[,grepl("v3y0", colnames(rawdata))] # find all factors from v3y0 df_init$patientid <- rawdata$patientid # include patient id information df_init <- df_init[c(ncol(df_init), 1:(ncol(df_init)-1))] # drop patients with > 20% missing from initial visit missing20_ids <- c() for (p in 1:nrow(df_init)) { if (sum(is.na(df_init[p,])) / ncol(df_init) > .2) { missing20_ids <- c(missing20_ids, df_init[p,1]) } } # drop the patients df_init <- df_init[!df_init$patientid %in% few_egfr_ids,] df_init <- df_init[!df_init$patientid %in% missing20_ids,] # save egfr data egfr_data <- data.frame(rawdata$patientid, rawdata$mu_egfr, rawdata$sigma_egfr, rawdata$r_group, rawdata$v_group) egfr_data <- egfr_data[egfr_data$rawdata.patientid %in% df_init$patientid,]
Drop features that have been calculated such as survival analyses, were used for screening, are missing more than 20% of entries, etc.
library(knitr) #drop features that are not useful ignore <- c() # drop features that contain > 20% NA or missing values in the initial data set and record which features they are for (factor in 1:ncol(df_init)) { blanks <- sum(is.na(df_init[,factor])) # number of NA entries if (blanks == 0) blanks <- nrow(df_init[df_init[,factor] == "",]) # number of blank entries total <- length(df_init[,factor]) # total number of entries if (blanks/total >= .2) { ignore <- c(ignore, factor) } # ignore factor if >= 20% of entries are blank } # drop calculated features # fram ignore <- c(ignore, grep("fram", names(df_init))) # survival analysis ignore <- c(ignore, grep("sa_", names(df_init))) ignore <- c(ignore, grep("time_", names(df_init))) ignore <- c(ignore, grep("scr_roche", names(df_init))) # side by side time ignore <- c(ignore, grep("side_by_side", names(df_init))) # drop screening factors (should be the same across the board now) ignore <- c(ignore, grep("amputation", names(df_init))) ignore <- c(ignore, grep("genetic_consent", names(df_init))) ignore <- c(ignore, grep("serms", names(df_init))) # drop ions (if present) ignore <- c(ignore, grep("ion_", names(df_init))) # drop certain categorical variables based on distribution ignore <- c(ignore, grep("albuminuria", names(df_init))) # derived ignore <- c(ignore, grep("diabetes", names(df_init))) # everyone has it ignore <- c(ignore, grep("roche", names(df_init))) # derived ignore <- c(ignore, grep("hibp", names(df_init))) # repetitive ignore <- c(ignore, grep("igfr_baseline", names(df_init))) # we use egfr #ignore <- c(ignore, grep("egfr", names(df_init))) # keep this because we associate vars with it at baseline ignore<- c(ignore, grep("egfr_cric", names(df_init))) # we are using egfr ckd bc clinical standard ignore <- c(ignore, grep("esrd", names(df_init))) # drop u 24h proteins because they are measured in urinary_protein (may not be as reliable as extretion rate, which is grams/24hr) ignore <- c(ignore, grep("ualbumin", names(df_init))) ignore <- c(ignore, grep("ucreatinine", names(df_init))) ignore <- c(ignore, grep("uprotein", names(df_init))) # remove waist because it is highly correlated with BMI; fit to a linear model to drive point home lm.res <- lm(waist_v3y0 ~ bmi_v3y0, df_init) kable(data.frame("B" = lm.res$coefficients[2], "p" = summary(lm.res)$coefficients[,4][2] , "r squared" = summary(lm.res)$r.squared), label = "linear model: BMI as a function of waist") ignore <- c(ignore, grep("waist", names(df_init))) # remove MAP because it is calculated ignore <- c(ignore, grep("map", names(df_init))) # drop some education and race categorical vars due to redundancy ignore<- c(ignore, grep("edu_cat_2", names(df_init))) ignore<- c(ignore, grep("edu_cat_1", names(df_init))) ignore<- c(ignore, grep("race_cat_1", names(df_init))) ignore<- c(ignore, grep("race_ethnicity_v3y0", names(df_init))) # drop ignore features from df_init df_init <- df_init[-ignore] print(paste0("Drop ", length(ignore), " features."))
#Turn all categorical variables into binary variables df_init <- cat_to_binary(df_init) rawdata <- cat_to_binary(rawdata)
Drop features that do not predict baseline EGFR well + linear model for continuous factors + wilcox for categorical
library(knitr) # continuous vars df_cont <- data.frame("init" = c(1:ncol(df_init)), "name" = 0, "B" = 0, "p" = 0, "r_squared" = 0) signif <- c() cont_drop <- c() line <- 1 for (factor in 2:ncol(df_init)) { if (is.continuous(df_init, factor)) { df_cont$name[line] <- colnames(df_init)[factor] lm.res <- lm(df_init$egfr_ckd_epi_v3y0 ~ df_init[,factor]) df_cont$B[line] <- lm.res$coefficients[2] df_cont$p[line] <- summary(lm.res)$coefficients[,4][2] df_cont$r_squared[line] <- summary(lm.res)$r.squared if (df_cont$p[line] >= 0.05) { signif <- c(signif, factor) cont_drop <- c(cont_drop, factor) } line <- line + 1 } } df_cont <- df_cont[df_cont$name != 0,] df_cont <- df_cont[-1] print(kable(df_cont), label = description) # categorical vars df_cat <- data.frame("init" = c(1:(ncol(df_init)-1)), "name" = 0, "pval" = 0, "n" = 0, "percent" = 0) df_cat <- df_cat[-1] line <- 1 cat_drop <- c() for (factor in 2:ncol(df_init)) { if (length(table(df_init[,factor])) == 1) {} # drop factors wth no variation else if (!is.continuous(df = df_init, f = factor)) { df_cat$name[line] <- colnames(df_init)[factor] one <- df_init[df_init[,factor] == 1,]$egfr_ckd_epi_v3y0 not_one <- df_init[df_init[,factor] != 1,]$egfr_ckd_epi_v3y0 df_cat$pval[line] <- wilcox.test(one, not_one)$p.value cat_summary <- as.data.frame(table(df_init[,factor], useNA = "always")) df_cat$n[line] <- cat_summary[cat_summary$Var1==1,]$Freq[1] df_cat$percent[line] <- df_cat$n[line]/sum(cat_summary$Freq) * 100 if (df_cat$pval[line] < 0.05) { signif <- c(signif, factor) cat_drop <- c(cat_drop, factor) } line <- line + 1 } } df_cat <- df_cat[df_cat$name != 0,] kable(df_cat) print(paste0(length(cont_drop), " continuous features had a p value of less than 0.05 predicting baseline egfr with the linear model.")) print(paste0(length(cat_drop), " categorical features had a p value of less than 0.05 predicting baseline egfr with the wilcox test."))
Drop features that do not predict baseline A1C well + linear model for continuous factors + wilcox for categorical
library(knitr) # continuous vars df_cont <- data.frame("init" = c(1:ncol(df_init)), "name" = 0, "B" = 0, "p" = 0, "r_squared" = 0) cont_drop <- c() line <- 1 for (factor in 2:ncol(df_init)) { if (is.continuous(df_init, factor)) { df_cont$name[line] <- colnames(df_init)[factor] lm.res <- lm(df_init$hemoglobin_a1c_v3y0 ~ df_init[,factor]) df_cont$B[line] <- lm.res$coefficients[2] df_cont$p[line] <- summary(lm.res)$coefficients[,4][2] df_cont$r_squared[line] <- summary(lm.res)$r.squared if (df_cont$p[line] >= 0.05) { signif <- c(signif, factor) cont_drop <- c(cont_drop, factor) } line <- line + 1 } } df_cont <- df_cont[df_cont$name != 0,] df_cont <- df_cont[-1] print(kable(df_cont), label = description) # categorical vars df_cat <- data.frame("init" = c(1:(ncol(df_init)-1)), "name" = 0, "pval" = 0, "n" = 0, "percent" = 0) df_cat <- df_cat[-1] line <- 1 cat_drop <- c() for (factor in 2:ncol(df_init)) { if (length(table(df_init[,factor])) == 1) {} # drop factors wth no variation else if (!is.continuous(df = df_init, f = factor)) { df_cat$name[line] <- colnames(df_init)[factor] one <- df_init[df_init[,factor] == 1,]$hemoglobin_a1c_v3y0 not_one <- df_init[df_init[,factor] != 1,]$hemoglobin_a1c_v3y0 df_cat$pval[line] <- wilcox.test(one, not_one)$p.value cat_summary <- as.data.frame(table(df_init[,factor], useNA = "always")) df_cat$n[line] <- cat_summary[cat_summary$Var1==1,]$Freq[1] df_cat$percent[line] <- df_cat$n[line]/sum(cat_summary$Freq) * 100 if (df_cat$pval[line] < 0.05) { signif <- c(signif, factor) cat_drop <- c(cat_drop, factor) } line <- line + 1 } } df_cat <- df_cat[df_cat$name != 0,] kable(df_cat) print(paste0(length(cont_drop), " continuous features had a p value of less than 0.05 predicting baseline A1C with the linear model.")) print(paste0(length(cat_drop), " categorical features had a p value of less than 0.05 predicting baseline A1C with the wilcox test."))
print(paste0("total n features: ", length(unique(signif)))) # ensure that age and sex are in signif signif <- c(signif, c(grep("age", names(df_init)), grep("sex", names(df_init)))) include_factors <- names(df_init)[unique(signif)] cont_summary <- data.frame(matrix(nrow = 100, ncol= 4, 0)) colnames(cont_summary) <- c("name", "n", "mean", "std") df_include <- df_init[colnames(df_init)%in%include_factors] i <- 1 for (factor in 1:ncol(df_include)) { if (length(table(df_include[,factor])) > 10) { # factor is continuous cont_summary$name[i] <- colnames(df_include)[factor] cont_summary$n[i] <- sum(!is.na(df_include[,factor])) cont_summary$mean[i] <- mean(df_include[,factor], na.rm = TRUE) cont_summary$std[i] <- sd(df_include[,factor], na.rm = TRUE) i <- i + 1 } else { # factor is categorical cat_summary <- as.data.frame(table(df_include[,factor], useNA = "always")) cat_summary$Percentage <- (cat_summary$Freq*100)/total print(kable(cat_summary, caption = colnames(df_include)[factor])) } } cont_summary <- cont_summary[cont_summary$name != 0,] print(kable(cont_summary, caption = "continuous variable summary"))
Impute missing values ``` {r echo = FALSE, message = FALSE}
library(DMwR)
imputed_patients <- as.integer(rownames(df_include[!complete.cases(df_include),]))
imputed_data <- knnImputation(df_include, k = 10, meth = "median", scale = F)
non_imputed <- na.omit(df_include)
print(paste0(nrow(imputed_data)-nrow(non_imputed), " patients are dropped without imputation"))
```r library(car) transform_continuous_data <- function(df, description, print_summary){ # if lambda needed to transform data is too high and there is not a significant change in normality, do not transform the data summary <- data.frame("init" = c(1:length(df_include)), "name" = 0, "initial_rsquared" = 0, "transformed_rsquared" = 0, "lambda" = 0, "transformed" = 1) summary <- summary[-c(1)] line <- 1 biglambda <- c() untransformed <- c() for (factor in 1:ncol(df)) { if (length(table(df[,factor])) > 10) { # factor is continuous my_object <- df[,factor] name <- colnames(df)[factor] # before transformation qq_before <- qqnorm(my_object, ylab=name, main = paste0(name, " before transformation"), plot.it = FALSE) r_before <- cor(qq_before$x,qq_before$y) bc_dfult <- boxCox(my_object ~ 1, plotit = F, lambda = seq(-5,5,.1), eps = 0, family = "yjPower") bc_df <- data.frame(bc_dfult$x, bc_dfult$y) lambda <- bc_df[with(bc_df, order(-bc_df$bc_dfult.y)),][1,1] # after transformation if (lambda == 0) {transformed <- log(my_object)} if (lambda != 0) {transformed <- (my_object ^ lambda -1) / lambda} qq_after <- qqnorm(transformed, ylab=name, main = paste0(name, " after transformation"), plot.it = FALSE) r_after <- cor(qq_after$x,qq_after$y, use = "na.or.complete") if ((r_after - r_before)/r_before < .05) { # if there is a significant shift in r (don't care about lambda), then transform transformed <- my_object summary$transformed[line] <- 0 untransformed <- c(untransformed, name) # note exceptions } else { df[,factor] <- transformed if (lambda> 1) {biglambda <- c(biglambda, name)} #note big lambdas } summary$name[line] <- name summary$initial_rsquared[line] <- round(r_before*r_before, digits = 3) summary$transformed_rsquared[line] <- round(r_after*r_after, digits = 3) summary$lambda[line] <- lambda #summary$n_imputed[line] <- sum(is.na(df_include[,factor])) #summary$percent_imputed[line] <- round(sum(is.na(df_include[,factor]))/nrow(df_include), digits = 3) summary$minval[line] <- round(min(df[,factor], na.rm = TRUE), digits = 3) summary$maxval[line] <- round(max(df[,factor], na.rm = TRUE), digits = 3) line <- line + 1 # unit scale df df[,factor] <- df[,factor]-min(df[,factor], na.rm = TRUE) df[,factor] <- df[,factor]/max(df[,factor], na.rm = TRUE) } } summary <- summary[summary$name != 0,] # note these exceptions if (print_summary) { print(kable(summary, label = description)) print("The following continuous factors were not transformed because transformation did not significantly shift them towards normality: ") print(untransformed) } return(df) }
# transform the data transformed_imputed_data <- transform_continuous_data(imputed_data, "imputed data", TRUE) transformed_nonimputed_data <- transform_continuous_data(non_imputed, "non imputed data", FALSE)
use a linear model to predict pcs rate
only_imputed<- transformed_imputed_data[rownames(transformed_imputed_data) %in% imputed_patients,] imputed_lm.res <- lm(sf12_pcs_v3y0 ~ . , data = transformed_imputed_data) print(paste0("imputed data r squared: ", summary(imputed_lm.res)$r.squared)) nonimputed_lm.res <- lm(sf12_pcs_v3y0 ~ . , data = transformed_nonimputed_data) print(paste0("non imputed data r squared: ", summary(nonimputed_lm.res)$r.squared)) only_imputed_lm.res <- lm(sf12_pcs_v3y0 ~ . , data = only_imputed) print(paste0("ONLY imputed data r squared: ", summary(only_imputed_lm.res)$r.squared))
# save selected clinical variables with patient ID and pcs rate final_data <- transformed_imputed_data final_data <- data.frame(final_data, egfr_data) final_data <- final_data[c((ncol(final_data)-4):ncol(final_data), 1:(ncol(final_data)-5))] write.csv(final_data, "../data/preprocessing_results.csv", row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.