knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=6, fig.height=4 )
The package 'RIC' includes a simulatd dataset called reg_data
. In this vignette, we explain the steps that was taken to simulated the dataset.
First, we load the package 'MASS', which will enable us to use multivariate sampling.
library(MASS)
Next, we create a function called 'generate_date', which will create a sample dataset with follow-up time, number of events, treatment assignment, and three covariates.
generate_data <- function(n_data = 1000, ln_rate_coeffs = c(intercept=-0.5, tx=log(1/3), c1=log(1), c2=log(2), c3=log(3)), rate_v = 1, cov_mu = c(0,0,0), cov_sigma = rbind(c(1,0,0), c(0,1,0), c(0,0,1))) { tx <- (runif(n_data) > 0.5) * 1 #Covariates can be correlated see cov_sigma input parameter multivariateSample <- mvrnorm(n = n_data, mu = cov_mu, Sigma = cov_sigma) multivariateSample[,1] <- (multivariateSample[,1] > 0.5) * 1 #changing the first covariate to binary #Calculating the actual rate of event for each person rate_mu <- exp(ln_rate_coeffs['intercept'] +ln_rate_coeffs['tx'] * tx +ln_rate_coeffs['c1'] * multivariateSample[,1] +ln_rate_coeffs['c2'] * multivariateSample[,2] +ln_rate_coeffs['c3'] * multivariateSample[,3] ) #Heterogeneity in rate is modelled with a gamma distribution, giving rise to negative binomial regression rates <- rgamma(n_data, rate = rate_mu / rate_v, shape = rate_mu * rate_mu / rate_v) #Follow-up time is cut at one year times <- exp(rnorm(n_data, log(1), 0.1)) times[which(times>1)] <- 1 events <- rpois(n_data,rates*times) out_data <- as.data.frame(cbind(ln_time = log(times), events = events, tx = tx, c1 = multivariateSample[,1], c2 = multivariateSample[,2], c3 = multivariateSample[,3])) return(out_data) }
Next, we create a second function by the name of 'generate_marker', which generates a risk score, as an example of a marker.
generate_marker <- function(formula = events~tx+c1+c2+c3+offset(ln_time), reg_data) { reg <- glm.nb(formula = formula, data = reg_data, link = log) #Risk score is the predicted values from the regression model with tx=0 and follow-up time set to 1 new_data <- as.data.frame(reg_data) new_data[,'tx'] <- 0 new_data[,'time'] <- 1 new_data[,'ln_time'] <- 0 return (predict.glm(reg, newdata = new_data)) }
Now that we have written these two functions, we can use them to generate a simulated dataset with a sample size of 1000:
marker_formula <- events ~ tx + c1 + c2 + c3 + offset(ln_time) sample_size <- 1000 reg_data <- generate_data(sample_size) reg_data[,'x'] <- generate_marker(formula = marker_formula, reg_data)
The simulated dataset is exported with the package, and can be called with RIC::reg_data
.
library(MASS) #Generates a sample dataset with follow-up time, number of events, treatment assignment, and three covariates generate_data<-function(n_data=1000, ln_rate_coeffs=c(intercept=-0.5,tx=log(1/3),c1=log(1),c2=log(2),c3=log(3)),rate_v=1, cov_mu=c(0,0,0), cov_sigma=rbind(c(1,0,0),c(0,1,0),c(0,0,1))) { #Generate covariates - we are generating one binary variable for treatment, and one binary variable and two continuous variables as covariates tx<-(runif(n_data)>0.5)*1 #Covariates can be correlated see cov_sigma input parameter temp<-mvrnorm(n=n_data, mu=cov_mu, Sigma=cov_sigma) temp[,1]<-(temp[,1]>0.5)*1 #changing the first covariate to binary #Calculating the actual rate of event for each person rate_mu<-exp(ln_rate_coeffs['intercept'] +ln_rate_coeffs['tx']*tx +ln_rate_coeffs['c1']*temp[,1] +ln_rate_coeffs['c2']*temp[,2] +ln_rate_coeffs['c3']*temp[,3] ) #Heterogeneity in rate is modelled with a gamma distribution, giving rise to negative binomial regression rates<-rgamma(n_data, rate=rate_mu/rate_v, shape=rate_mu*rate_mu/rate_v) #Follow-up time is cut at one year times<-exp(rnorm(n_data,log(1),0.1)) times[which(times>1)]<-1 events<-rpois(n_data,rates*times) out_data<-as.data.frame(cbind(ln_time=log(times),events=events,tx=tx,c1=temp[,1],c2=temp[,2],c3=temp[,3])) return(out_data) } #Generates the risk score (as an example of a marker) generate_marker<-function(formula=events~tx+c1+c2+c3+offset(ln_time),reg_data) { reg<-glm.nb(formula=formula,data=reg_data,link=log) #Risk score is the predicted values from the regression model with tx=0 and follow-up time set to 1 new_data<-as.data.frame(reg_data) new_data[,'tx']<-0 new_data[,'time']<-1 new_data[,'ln_time']<-0 return(predict.glm(reg,newdata=new_data)) } marker_formula<-events~tx+c1+c2+c3+offset(ln_time) sample_size<-1000 reg_data<-generate_data(sample_size) reg_data[,'x']<-generate_marker(formula=marker_formula,reg_data)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.