# Load the RedCap raw data.
# NOTE(review): this comment said clinical features are in columns 7-2271 but
# the code keeps 12:2271 — confirm which range is intended.
rawdata <- read.csv("../data/RedCap.csv")
# Keep the patient ID (column 2) plus the clinical feature columns.
rawdata <- rawdata[c(2, 12:2271)]

# Load the untargeted CRIC sample metadata (file has no header row).
untarmet_path <- file.path(
  "../data/clinical_creatinine_normalized_data",
  "CRIC_clinical_creatinine_normalized_samplemetadata.csv"
)
untarmet <- read.csv(untarmet_path, header = FALSE, stringsAsFactors = FALSE)
# TRUE when column f of df has more than 10 distinct (non-NA) values,
# i.e. the column is treated as continuous rather than categorical.
#
# @param df a data frame
# @param f  a column index or name
# @return logical scalar
is.continuous <- function(df, f) {
  length(table(df[, f])) > 10
}

# Convert every column of df with exactly two distinct (non-NA) values to a
# factor; all other columns are left untouched.
#
# Fixes over the original: seq_len() is safe for zero-column frames
# (1:ncol(df) would iterate over c(1, 0)), and the loop variable no longer
# shadows base::factor().
#
# @param df a data frame
# @return df with binary columns converted to factors
factorize <- function(df) {
  for (col_idx in seq_len(ncol(df))) {
    if (length(table(df[, col_idx])) == 2) {
      df[, col_idx] <- as.factor(df[, col_idx])
    }
  }
  df
}
# Extract the 7-character patient IDs from the untargeted metadata sample IDs.
# `vid` and `patients` are reused later when filtering the untargeted files.
vid <- substr(untarmet$V2, 1, 7)
pid <- unique(vid)

# Report patients present in the metadata but absent from the raw data.
if (any(!pid %in% rawdata$patientid)) {
  print("there are patients in the metadata that are not in the raw data")
}

# Keep only raw-data patients that also appear in the untargeted metadata.
# Subsetting with %in% directly fixes the original's negative-index bug:
# rawdata[-as.integer(no_match), ] evaluates to rawdata[integer(0), ] when
# no rows need removing, which silently drops EVERY row.
rawdata <- rawdata[rawdata$patientid %in% pid, ]
if (any(!rawdata$patientid %in% pid)) {
  print("there are patients in the raw data that are not in the metadata")
}

# Subset the initial visit (v3y0) features, keeping patient ID as column 1.
df_init <- data.frame(rawdata$patientid)
df_init <- data.frame(df_init, rawdata[, grepl("v3y0", colnames(rawdata))])
# Patient IDs to exclude by baseline screening criteria.
drop_patients <- c()

# Amputation at baseline. Exact equality replaces grep(1, ...): grep does
# substring matching on the character form of the values, so it would also
# match entries such as 10, 12, or 21.
amputee_ids <- df_init[which(df_init$amputation_v3y0 == 1), ]$rawdata.patientid

# No genetic consent at baseline (grep(0, ...) had the same substring issue).
no_consent_ids <- df_init[which(df_init$genetic_consent_bl_v3y0 == 0), ]$rawdata.patientid

# On SERMs at baseline.
SERMS_ids <- df_init[which(df_init$serms_v3y0 == 1), ]$rawdata.patientid
The EGFR rate informs the inclusion criteria; only patients with at least 4 EGFR readings are included.
# Calculate EGFR rate and variance for each patient
# NOTE(review): `entry` appears unused anywhere below — candidate for removal.
entry <- c(1:6)
#get number of unique visits on file; visit suffixes ("v3y0", ...) are
#recovered by stripping the fixed 10-character "egfr_cric_" prefix
visits<- unique(substring(names(rawdata)[grep(paste("^egfr_cric", "_v", "[0-9]+", "y", "[0-9]+", sep = ""), names(rawdata))],11))
#init average yearly egfr exponential rate and volatility accumulators
Qbar <- rep(0,dim(rawdata)[1])
Qvar <- Qbar
#init valid visit counter (per patient)
count <- Qbar
#get baseline egfr
Q0 <- rawdata$egfr_cric_v3y0
#get baseline time
t0 <- rawdata$days_thisvisit_v3y0
#loop over follow-up visits, accumulating the per-visit log-return of EGFR
for (visit in visits[2:length(visits)]){
  #get current year egfr
  Qt <- rawdata[,paste("egfr_cric_",visit,sep = "")]
  #get time this visit
  t <- rawdata[,paste("days_thisvisit_",visit,sep = "")]
  #calculate egfr return in units of per year
  #Qdot <- 365*log(Qt/Q0)/(t-t0)
  #calculate egfr return in units of per visit (log-return vs previous visit)
  Qdot <- log(Qt/Q0)
  #print(visit)
  #print(Qdot)
  #get the valid (non-NA) visit indices
  vidx <- which(!is.na(Qdot))
  count[vidx] <- count[vidx] + 1
  #accumulate first and second moments
  Qbar[vidx] <- Qbar[vidx]+Qdot[vidx]
  Qvar[vidx] <- Qvar[vidx]+Qdot[vidx]*Qdot[vidx]
  #update previous time and egfr
  Q0 <- Qt
  t0 <- t
}
#calculate mean and (population) variance: E[Q^2] - E[Q]^2
Qbar <- Qbar/count
Qvar <- Qvar/count - Qbar*Qbar
#update rawdata
rawdata$Qbar <- Qbar
rawdata$Qvar <- Qvar
#quartile breakpoints for the rate and volatility groupings below
QQbar<- quantile(rawdata$Qbar,probs = seq(0,1,.25), na.rm = TRUE)
QQvar<- quantile(rawdata$Qvar,probs = seq(0,1,.25), na.rm = TRUE)
#init long form data for egfr with baseline visit
df <- rawdata[1:2]
df$rgroup <- findInterval(rawdata$Qbar, QQbar)
df$vgroup <- findInterval(rawdata$Qvar, QQvar)
df$mu_egfr_rate <- rawdata$Qbar
df$sigma_egfr_rate <-rawdata$Qvar
visit <- visits[1]
df$egfr <- rawdata[,paste("egfr_cric_",visit,sep = "")]
df$year <- 0
#loop over visits, appending one long-format block of rows per follow-up
for (i in c(2:length(visits))){
  df2 <- rawdata[1:2]
  df2$rgroup <- findInterval(rawdata$Qbar, QQbar)
  df2$vgroup <- findInterval(rawdata$Qvar, QQvar)
  df2$mu_egfr_rate <- rawdata$Qbar
  df2$sigma_egfr_rate <-rawdata$Qvar
  df2$egfr <- rawdata[,paste("egfr_cric_",visits[i],sep = "")]
  df2$year <- i
  df <- rbind(df,df2)
}
# delete df2
rm(df2)
# add egfr information to raw data (year-0 rows carry the per-patient stats)
egfr_v0 <- df[df$year == 0,]
rawdata$mu_egfr <- egfr_v0$mu_egfr_rate
rawdata$sigma_egfr <- egfr_v0$sigma_egfr_rate
rawdata$r_group <- egfr_v0$rgroup
rawdata$v_group <- egfr_v0$vgroup
# drop patients with less than 4 egfr readings
# NOTE(review): count holds the number of valid between-visit returns, so
# readings = count + 1 assumes the baseline reading is always present — confirm.
egfr_readings <- count + 1
rawdata$egfr_readings <- egfr_readings
few_egfr_ids <- rawdata[rawdata$egfr_readings < 4,]$patientid
# Drop patients missing more than 20% of the initial-visit features.
# Vectorized with rowMeans; the original loop also called df_init(p, 1) —
# function-call syntax instead of df_init[p, 1] — which errors at run time.
missing20_ids <- df_init[rowMeans(is.na(df_init)) > .2, 1]

# Apply every exclusion list to the raw data.
rawdata <- rawdata[!rawdata$patientid %in% amputee_ids, ]
rawdata <- rawdata[!rawdata$patientid %in% few_egfr_ids, ]
rawdata <- rawdata[!rawdata$patientid %in% missing20_ids, ]
rawdata <- rawdata[!rawdata$patientid %in% no_consent_ids, ]
rawdata <- rawdata[!rawdata$patientid %in% SERMS_ids, ]

# Rebuild the initial-visit frame from the filtered raw data.
df_init <- data.frame(rawdata$patientid)
df_init <- data.frame(df_init, rawdata[, grepl("v3y0", colnames(rawdata))])

# Save the EGFR summary columns and the surviving patient IDs for later use.
egfr_data <- data.frame(rawdata$mu_egfr, rawdata$sigma_egfr,
                        rawdata$r_group, rawdata$v_group)
patients <- rawdata$patientid
Drop features that were calculated from other features (e.g., survival analyses), were used for screening, are missing more than 20% of entries, etc.
library(knitr)

# Column indices (into df_init) of features to drop; the same features are
# then removed from every other visit in rawdata.
ignore <- c()

# Drop features with >= 20% NA or blank entries in the initial visit.
for (factor in seq_len(ncol(df_init))) {
  blanks <- sum(is.na(df_init[, factor]))  # number of NA entries
  if (blanks == 0) blanks <- nrow(df_init[df_init[, factor] == "", ])  # blank entries
  total <- length(df_init[, factor])       # total number of entries
  if (blanks / total >= .2) {
    ignore <- c(ignore, factor)
  }
}

# Drop calculated features.
ignore <- c(ignore, grep("fram", names(df_init)))          # Framingham scores
ignore <- c(ignore, grep("sa_", names(df_init)))           # survival analysis
ignore <- c(ignore, grep("time_", names(df_init)))
ignore <- c(ignore, grep("scr_roche", names(df_init)))
ignore <- c(ignore, grep("side_by_side", names(df_init)))  # side-by-side time

# Drop screening factors (uniform after the exclusions applied earlier).
ignore <- c(ignore, grep("amputation", names(df_init)))
ignore <- c(ignore, grep("genetic_consent", names(df_init)))
ignore <- c(ignore, grep("serms", names(df_init)))

# Drop ions (if present).
ignore <- c(ignore, grep("ion_", names(df_init)))

# Drop certain categorical variables based on distribution.
ignore <- c(ignore, grep("albuminuria", names(df_init)))
ignore <- c(ignore, grep("diabetes", names(df_init)))
ignore <- c(ignore, grep("roche", names(df_init)))
ignore <- c(ignore, grep("hibp", names(df_init)))
ignore <- c(ignore, grep("igfr_baseline", names(df_init)))
ignore <- c(ignore, grep("egfr", names(df_init)))
ignore <- c(ignore, grep("esrd", names(df_init)))

# Drop 24h urine proteins: already captured by urinary_protein.
ignore <- c(ignore, grep("ualbumin", names(df_init)))
ignore <- c(ignore, grep("ucreatinine", names(df_init)))
ignore <- c(ignore, grep("uprotein", names(df_init)))

# Waist is highly correlated with BMI; fit a linear model to drive the point
# home, then drop waist.
lm.res <- lm(waist_v3y0 ~ bmi_v3y0, df_init)
kable(data.frame("B" = lm.res$coefficients[2],
                 "p" = summary(lm.res)$coefficients[,4][2],
                 "r squared" = summary(lm.res)$r.squared),
      label = "linear model: BMI as a function of waist")
ignore <- c(ignore, grep("waist", names(df_init)))

# Drop the same features from every other visit in rawdata.
# seq_along() avoids the 1:length() empty-vector pitfall, and fixed = TRUE
# stops feature names from being interpreted as regular expressions.
ig_raw <- c()
dropped_names <- names(df_init)[ignore]
for (f in seq_along(dropped_names)) {
  lookfor <- gsub("v3y0", "", dropped_names[f])
  ig_raw <- c(ig_raw, grep(lookfor, names(rawdata), fixed = TRUE))
}
ig_raw <- unique(ig_raw)
# Guard negative indexing: df[-integer(0)] / df[-NULL] misbehave or error.
if (length(ig_raw) > 0) rawdata <- rawdata[-ig_raw]
if (length(ignore) > 0) df_init <- df_init[-ignore]
print(paste0("Drop ", length(ignore), " features."))
Convert categorical data to binary
# Turn all categorical variables into binary (one-hot) indicator variables.
# A column is categorical when it has 3-9 distinct values; each level becomes
# a new 0/1 column named "<original>_<level>", and the original column is
# removed. Column 1 (patient ID) is never converted.
#
# Two defects of the original are fixed here:
#  * it matched levels with grepl() on the LEVEL INDEX, so level "1" also
#    matched values containing "1" (e.g. 10, 12, 21);
#  * it deleted columns by position while iterating the original positions,
#    shifting every subsequent column index.
#
# @param df a data frame whose first column is an identifier
# @return df with categorical columns replaced by indicator columns
cat_to_binary <- function(df) {
  is_categorical <- function(col) {
    k <- length(table(col))
    k > 2 && k < 10
  }
  cat_cols <- names(df)[-1][vapply(df[-1], is_categorical, logical(1))]
  for (col in cat_cols) {
    lvls <- levels(as.data.frame(table(df[[col]]))$Var1)
    for (lvl in lvls) {
      # NA values yield 0, matching the original grepl() behaviour.
      df[[paste0(col, "_", lvl)]] <- as.integer(!is.na(df[[col]]) & df[[col]] == lvl)
    }
    df[[col]] <- NULL
  }
  df
}
df_init <- cat_to_binary(df_init)
rawdata <- cat_to_binary(rawdata)
Random forest as validation for the factor dropping performed later; NAs are excluded.
# Fit a random forest predicting mu EGFR from the baseline features; factors
# whose permutation importance (%IncMSE) exceeds 1.9 are recorded in
# rf_include and protected from being dropped by the association tests below.
library(randomForest)
library(knitr)
df_rf <- factorize(df_init)
# Drop the patient ID column and append the outcome.
df_rf <- df_rf[-1]
df_rf <- data.frame(df_rf, egfr_data$rawdata.mu_egfr)
rf.test <- randomForest(df_rf$egfr_data.rawdata.mu_egfr ~ . , data = df_rf, importance = TRUE, na.action = "na.exclude")
# type = 1 selects the %IncMSE (permutation) importance measure.
mse <- as.data.frame(importance(rf.test, type = 1))
kable(mse)
# The column names used below (X.IncMSE, rownames.mse.) are auto-generated by
# data.frame() from this call — do not rename `mse` without updating them.
mse <- data.frame(rownames(mse), mse)
rf_include <-mse[mse$X.IncMSE > 1.9,]$rownames.mse.
Drop features that do not predict mu EGFR well: a linear model is used for continuous factors and a Wilcoxon test for categorical ones.
library(knitr)

# --- Continuous variables: simple linear model against mu EGFR --------------
df_init <- data.frame(df_init, egfr_data$rawdata.mu_egfr)
df_cont <- data.frame("init" = c(1:ncol(df_init)), "name" = 0, "B" = 0, "p" = 0, "r_squared" = 0)
not_associated <- c()
line <- 1
for (factor in 2:ncol(df_init)) {
  if (is.continuous(df_init, factor)) {
    df_cont$name[line] <- colnames(df_init)[factor]
    lm.res <- lm(df_init$egfr_data.rawdata.mu_egfr ~ df_init[, factor])
    df_cont$B[line] <- lm.res$coefficients[2]
    df_cont$p[line] <- summary(lm.res)$coefficients[, 4][2]
    df_cont$r_squared[line] <- summary(lm.res)$r.squared
    # Keep factors the random forest found important (%IncMSE > 1.9) even when
    # the linear association is not significant.
    if (!any(grepl(names(df_init)[factor], rf_include))) {
      if (df_cont$p[line] >= 0.05) {
        not_associated <- c(not_associated, factor)
      }
    }
    line <- line + 1
  }
}
df_cont <- df_cont[df_cont$name != 0, ]
df_cont <- df_cont[-1]
# Fixed: the original passed `label = description`, but `description` is not
# defined anywhere in this script.
print(kable(df_cont, label = "continuous variables vs mu EGFR"))
cont_drop <- length(not_associated)

# --- Categorical variables: Wilcoxon rank-sum against mu EGFR ---------------
df_cat <- data.frame("init" = c(1:ncol(df_init)), "name" = 0, "pval" = 0, "n" = 0, "percent" = 0)
df_cat <- df_cat[-1]
line <- 1
for (factor in 2:ncol(df_init)) {
  if (length(table(df_init[, factor])) == 1) {
    # No variation at all: drop outright.
    not_associated <- c(not_associated, factor)
  } else if (!is.continuous(df = df_init, f = factor)) {
    df_cat$name[line] <- colnames(df_init)[factor]
    one <- df_init[df_init[, factor] == 1, ]$egfr_data.rawdata.mu_egfr
    not_one <- df_init[df_init[, factor] != 1, ]$egfr_data.rawdata.mu_egfr
    df_cat$pval[line] <- wilcox.test(one, not_one)$p.value
    cat_summary <- as.data.frame(table(df_init[, factor], useNA = "always"))
    df_cat$n[line] <- cat_summary[cat_summary$Var1 == 1, ]$Freq[1]
    df_cat$percent[line] <- df_cat$n[line] / sum(cat_summary$Freq) * 100
    if (!grepl("sex", names(df_init)[factor])) {  # always keep sex
      if (!any(grepl(names(df_init)[factor], rf_include))) {  # keep RF-important factors
        if (df_cat$pval[line] >= 0.05) {
          not_associated <- c(not_associated, factor)
        }
      }
    }
    line <- line + 1
  }
}
df_cat <- df_cat[df_cat$name != 0, ]
kable(df_cat)
cat_drop <- length(not_associated) - cont_drop

# Drop redundant variables per the CRIC codebook:
# race_cat_1 and race_ethnicity carry less information than race_ethnicity_cat2.
not_associated <- c(not_associated, grep("race_cat_1", names(df_init)))
not_associated <- c(not_associated, grep("race_ethnicity_v3y0", names(df_init)))
# edu_cat_1 and edu_cat_2 are redundant.
not_associated <- c(not_associated, grep("edu_cat_2", names(df_init)))
not_associated <- c(not_associated, grep("edu_cat_1", names(df_init)))

# Drop the same features from every visit in rawdata.
# Two bugs fixed from the original: it indexed names(df_init)[ignore] — a
# stale list from an earlier chunk — instead of [not_associated], and it then
# replaced the collected indices with `!is.na(unique(...))`, turning them into
# logicals and corrupting the column drop.
not_assoc_raw <- c()
dropped_names <- names(df_init)[not_associated]
for (f in seq_along(dropped_names)) {
  lookfor <- gsub("v3y0", "", dropped_names[f])
  not_assoc_raw <- c(not_assoc_raw, grep(lookfor, names(rawdata), fixed = TRUE))
}
not_assoc_raw <- unique(not_assoc_raw)
if (length(not_assoc_raw) > 0) rawdata <- rawdata[-not_assoc_raw]
if (length(not_associated) > 0) df_init <- df_init[-not_associated]
print(paste0(cont_drop, " continuous features were dropped because they did not predict egfr well enough and were not found significant by random forest."))
print(paste0(cat_drop, " categorical features were dropped because they did not predict egfr well enough and were not found significant by random forest."))
library(knitr)

# Summary tables: mean/SD for continuous factors, frequency tables for
# categorical ones.
cont_df <- data.frame(matrix(nrow = 100, ncol = 4, 0))
colnames(cont_df) <- c("name", "n", "mean", "std")
i <- 1
for (factor in 2:ncol(df_init)) {
  if (length(table(df_init[, factor])) > 10) {
    # Continuous factor: record n, mean, and standard deviation.
    cont_df$name[i] <- colnames(df_init)[factor]
    cont_df$n[i] <- sum(!is.na(df_init[, factor]))
    cont_df$mean[i] <- mean(df_init[, factor], na.rm = TRUE)
    cont_df$std[i] <- sd(df_init[, factor], na.rm = TRUE)
    i <- i + 1
  } else {
    # Categorical factor: print a frequency table with percentages.
    cat_summary <- as.data.frame(table(df_init[, factor], useNA = "always"))
    # Fixed: the original divided by `total`, a stale loop variable left over
    # from the feature-dropping chunk; use this factor's own entry count.
    cat_summary$Percentage <- (cat_summary$Freq * 100) / sum(cat_summary$Freq)
    print(kable(cat_summary, caption = colnames(df_init)[factor]))
  }
}
cont_df <- cont_df[cont_df$name != 0, ]
print(kable(cont_df, caption = "continuous variable summary"))
# Impute missing values using k-nearest neighbors
library(DMwR)
# Strip the trailing mu-EGFR column appended during the association tests.
df_init <- df_init[-ncol(df_init)]
# Do not use patient id (column 1) in imputation. Use median because there are
# no NAs in categorical vars, but would want weighted average for those.
res <- knnImputation(df_init[-1], k = 10, meth = "median", scale = F)
# save a copy of non imputed data
non_imputed <- df_init[-1]
# add egfr information to non imputed data set
non_imputed$mu_egfr <- egfr_data$rawdata.mu_egfr
non_imputed$sigma_egfr <- egfr_data$rawdata.sigma_egfr
non_imputed$r_group <- egfr_data$rawdata.r_group
non_imputed$v_group <- egfr_data$rawdata.v_group
# remove all rows with NA
non_imputed <- non_imputed[complete.cases(non_imputed),]
# Set aside the four EGFR columns just added (rev() reverses column order, so
# the saved frame holds v_group, r_group, sigma_egfr, mu_egfr — in that order)
# and drop them before the Box-Cox transformation. Downstream code accesses
# these by name, so the reversed order is benign but worth knowing.
non_imputed_egfr_data <- rev(non_imputed)[1:4]
non_imputed <- non_imputed[- c((length(non_imputed) - 3):length(non_imputed))]
#Normalize continuous data with box-cox transformation
# Applies a Yeo-Johnson-family power transform to every continuous column
# (> 10 distinct values) of df, printing a per-factor summary table.
# If the fitted lambda exceeds 1 and the transform does not improve QQ-plot
# normality (correlation) by at least 5%, the column is left untransformed.
#
# @param df          data frame; column 1 is assumed to be an identifier
# @param description label used when printing the summary table
# @return df with continuous columns transformed in place
#
# NOTE(review): the imputation counts near the bottom index the GLOBAL
# df_init at offset factor + 1; this is only aligned when df has the same
# column layout as df_init minus its ID column — confirm before reusing this
# function in another context.
transform_continuous_data <- function(df, description){
  library(car)
  # if lambda needed to transform data is too high and there is not a significant change in normality, do not transform the data
  summary <- data.frame("init" = c(1:200), "name" = 0, "initial_rsquared" = 0, "transformed_rsquared" = 0, "lambda" = 0, "transformed" = 1)
  summary <- summary[-c(1)]
  line <- 1
  biglambda <- c()
  untransformed <- c()
  for (factor in 2:ncol(df)) {
    if (length(table(df[,factor])) > 10) { # factor is continuous
      my_object <- df[,factor]
      name <- colnames(df)[factor]
      # QQ-plot correlation before transformation (closer to 1 = more normal).
      qq_before <- qqnorm(my_object, ylab=name, main = paste0(name, " before transformation"), plot.it = FALSE)
      r_before <- cor(qq_before$x,qq_before$y)
      # Profile the Yeo-Johnson likelihood over lambda in [-5, 5].
      bc_dfult <- boxCox(my_object ~ 1, plotit = F, lambda = seq(-5,5,.1), eps = 0, family = "yjPower")
      bc_df <- data.frame(bc_dfult$x, bc_dfult$y)
      # Lambda with the highest profiled likelihood.
      lambda <- bc_df[with(bc_df, order(-bc_df$bc_dfult.y)),][1,1]
      # after transformation
      # NOTE(review): exact float comparison; lambda comes from a fixed 0.1
      # grid where 0 is representable, but this is fragile if the grid changes.
      if (lambda == 0) {transformed <- log(my_object)}
      if (lambda != 0) {transformed <- (my_object ^ lambda -1) / lambda}
      qq_after <- qqnorm(transformed, ylab=name, main = paste0(name, " after transformation"), plot.it = FALSE)
      r_after <- cor(qq_after$x,qq_after$y, use = "na.or.complete")
      if ((lambda > 1) && ((r_after - r_before)/r_before < .05)) { # if lambda > 1 but there is no significant shift towards normal, use original data
        transformed <- my_object
        summary$transformed[line] <- 0
        untransformed <- c(untransformed, name) # note exceptions
      } else {
        df[,factor] <- transformed
        if (lambda> 1) {biglambda <- c(biglambda, name)} #note big lambdas
      }
      summary$name[line] <- name
      summary$initial_rsquared[line] <- r_before*r_before
      summary$transformed_rsquared[line] <- r_after*r_after
      summary$lambda[line] <- lambda
      # These two assignments create the n_imputed / percent_imputed columns
      # on first use; see the NOTE above about the global df_init dependency.
      summary$n_imputed[line] <- sum(is.na(df_init[,factor+1]))
      summary$percent_imputed[line] <- sum(is.na(df_init[,factor+1]))/nrow(df_init)
      line <- line + 1
    }
  }
  # note these exceptions
  summary <- summary[summary$name != 0,]
  print(kable(summary, label = description))
  print("The following continuous factors were not transformed because transformation did not significantly shift them towards normality: ")
  print(untransformed)
  return(df)
}
# transform the data (Box-Cox normalization of both data sets)
res <- transform_continuous_data(res, "imputed data")
non_imputed <- transform_continuous_data(non_imputed, "non imputed data")
#add egfr values back to result and to non_imputed data
res <- data.frame(res, egfr_data)
non_imputed <- data.frame(non_imputed, non_imputed_egfr_data)
# Sanity check: how much of the variance in mu EGFR do baseline urine albumin
# and creatinine explain on their own?
test.res <- lm(mu_egfr ~ urine_albumin_v3y0 + urine_creatinine_v3y0,
               data = non_imputed)
urine_r2 <- summary(test.res)$r.squared
print(urine_r2)
Use a linear model to predict the EGFR rate.
# Fit mu EGFR on all clinical features for both data sets, excluding the
# other EGFR-derived columns (which would leak the outcome).
imputed_lm.res <- lm(rawdata.mu_egfr ~ . - rawdata.sigma_egfr - rawdata.r_group - rawdata.v_group, data = res)
print(paste0("imputed data r squared: ", summary(imputed_lm.res)$r.squared))
nonimputed_lm.res <- lm(mu_egfr ~ . -sigma_egfr - v_group - r_group, data = non_imputed)
print(paste0("non imputed data r squared: ", summary(nonimputed_lm.res)$r.squared))
# Row-count difference = patients lost to complete-case filtering.
print(paste0("using non imputed data requires ", nrow(res) - nrow(non_imputed), " patients to be dropped due to missing values."))
# Based on the clinical filtering above, restrict the untargeted data and its
# metadata to the included patients.
# load untargeted cric data
untar <- read.csv("../data/clinical_creatinine_normalized_data/CRIC_clinical_creatinine_normalized_data.csv", header=FALSE)
# Transpose so samples are rows, tag each row with its 7-character patient ID,
# and keep only samples whose patient survived the clinical filtering.
t_untar <- as.data.frame(t(untar))
t_untar$vid <- vid
t_untar <- t_untar[t_untar$vid %in% patients, ]
# Drop the helper ID column by NAME rather than the hard-coded position 15435,
# which silently breaks if the feature count ever changes.
t_untar$vid <- NULL
# save untargeted data filtered by patient
filtered_untar <- as.data.frame(t(t_untar))
write.csv(filtered_untar, "../data/clinical_creatinine_normalized_data/cric_normalized_data_filteredByPatient.csv", row.names = FALSE)
# save untargeted metadata filtered by patient
filtered_untarmet <- untarmet
filtered_untarmet$vid <- vid
filtered_untarmet <- filtered_untarmet[filtered_untarmet$vid %in% patients, ]
filtered_untarmet$vid <- NULL
write.csv(filtered_untarmet, "../data/clinical_creatinine_normalized_data/CRIC_clinical_creatinine_normalized_samplemetadata_filteredByPatient.csv")
#save data: final preprocessing outputs, imputed (with patient IDs) and
#non-imputed.
write.csv(data.frame(patients, res), "../data/preprocessingresults.csv", row.names = FALSE)
write.csv(non_imputed, "../data/nonimputed_preprocessingresults.csv", row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.