#' Fit the Neutral model to B-cell data from busulfan chimeras
#'
#' This function allows to compare model behaviour and statistics for different developmental pathways in B cell compartment using data from BuChi.
#'
#' The basic model is as follws:
#' \deqn{X'(t) = \theta(t) - \lambda X(t)}
#'
#' Neutral model assumes 'homogeneity' in the population of interest considering that all the subsets turnover with a constant, identical rate.
#' Neutral_func solves the system of linear first order ODEs for displaceable donor and host population and fits the 'Neutral model' to the cell counts and normalised donor fractions.
#' Influx of cells from parent population (\eqn{\theta}) changes exponentially with time (mouse age) with the rate \eqn{\nu}.
#' \deqn{\theta(t) = \theta_0 e^-\nu t}
#'
#' Needs the dataframes for Spleen and LN counts in the directory.
#' Data is fitted using Nealder-Mead algorithm in optim.
#' Currently only fits on log transformed cell counts and untransformed donor fractions.
#' 95\% CI is generated using hessian matrix in optim (not by bootstrapping).
#' if mod_diag is set to TRUE, the function will validate whether the best-fit model is behaving normally and is following the basic assumptions of least-square method.
#'
#'
#' @param par_pop column name of the parent population (chracter string e.g. 'SP_T1' or 'LN_FM')
#' @param pop_interest column name of the population of interest (chracter string e.g. 'SP_T1' or 'LN_GC')
#' @param params a vector containing initial parameter values in order c(N0, fd0, theta0, lambda)
#' @param mod_diag Boolean. If TRUE returns residual and QQ plots. By default FALSE.
#'
#'
#' @return AIC value for statistical comparison
#' @return a dataframe containing parameter values with 95\% CI
#' @return plot1 - spline fitted to parent population using linear regression on log cell counts.
#' @return plot2 - Nutral model fitted to cell counts
#' @return plot3 - Neutral model fitted to donor fractions
#' @return plot4 - returns residual and QQ plots in a single page (only if mod_diag == TRUE)
#'
#' @examples neutral_func(compartment= 'Spleen', par_pop = 'SP_T1', pop_interest = 'SP_FM', params = c(7,0.1,5,0.01), mod_diag = TRUE)
#' @export
neutral_func <- function(par_pop, pop_interest, params, mod_diag = NULL){
fancy_scientific <- function(l) {
# turn in to character string in scientific notation
l <- format(l, scientific = TRUE)
# quote the part before the exponent to keep all the digits
l <- gsub("^(.*)e", "'\\1'e", l)
# remove + after exponent, if exists. E.g.: (e^+2 -> e^2)
l <- gsub("e\\+","e",l)
# turn the 'e' into plotmath format
l <- gsub("e", "%*%10^", l)
# convert 1x10^ or 1.000x10^ -> 10^
l <- gsub("\\'1[\\.0]*\\'\\%\\*\\%", "", l)
# return this as an expression
parse(text=l)
}
log10minorbreaks=as.numeric(1:10 %o% 10^(4:8))
# reading the data set from a CSV file
dataframe1 <- read.csv('B_merge_spleen.csv')
dataframe2 <- read.csv('B_merge_ln.csv')
# removing column of indices
dataframe1 <- dataframe1[,-1]
dataframe2 <- dataframe2[,-1]
parpop <- grepl('SP', par_pop)
# fitting a linear model to the parent population of the cell type under study to estimate the rate of input
# selecting the right dataset to depending on the argument par_pop
# Then selcting dataset containing only the counts of parent population and time (mouse age)
if (parpop == TRUE){
parent_pop <- filter(dataframe1, dataframe1$population == par_pop)%>%
select(contains('days'), contains('counts'))
} else {
parent_pop <- filter(dataframe1, dataframe1$population == par_pop)%>%
select(contains('days'), contains('counts'))
}
# using aov function to perform linear regression on the log cell counts of parent population
# linear regression of log cell counts assumes exponential decay of parent population with time
ParPop_LR <- aov(log(cell_counts) ~ mouse.age.days, data = parent_pop)
summary(ParPop_LR)
coef(ParPop_LR)
fitted(ParPop_LR)
confint(ParPop_LR)
# definig a function to depict exponential decay of the population
phi_fun <- function(t,v){
# initial value of the cell counts of the population of interest
# estimated as the intercept in linear regression 'phi_0'
phi_0 = exp(coef(ParPop_LR)[1])
# the slope is also estimated from linear regression 'v'
phi_0 * exp(-v*(t))
}
# the slope is also estimated from linear regression 'v'
v= -coef(ParPop_LR)[2]
# the half life of the parent population according to thee fitted spline
log(2)/v
# defining timescale for the parent population decay
ts <- seq(50,500)
p1 <- ggplot()+
geom_point(data = parent_pop, aes(x= mouse.age.days, y= cell_counts), size=3)+
geom_line(aes(x= ts, phi_fun(ts,v)), color = "orange", size=1.25)+ scale_y_log10(limits=c(1e4,1e7))+
#geom_line(data = parent_pop, aes(x= mouse.age.days, y= exp(fitted(ParPop_LR))), color='blue', size=1)+
labs(title = paste('spline:', par_pop), y='cell counts', x="days post bmt") +
theme_light() + theme(axis.text = element_text(size = 20),
axis.title = element_text(size = 16, face = "bold"),
panel.background = element_rect(colour = "black"),
plot.title = element_text(size=20, hjust = 0.5),
legend.text = element_text(size=22),
legend.title = element_blank())
output1 <- paste('predicted rate of decline =', v, ' (', -confint(ParPop_LR)[2,1], ', ', -confint(ParPop_LR)[2,1], ')', sep = '')
output2 <- paste('predicted cell counts at birth = ', exp(coef(ParPop_LR)[1])/10^6, ' (', exp(confint(ParPop_LR)[1,1])/10^6, ', ', exp(confint(ParPop_LR)[1,2])/10^6, ')', sep = '')
# the system of ODEs for host and donor populations in BuChi
# X_host' == theta_host(t) - lambda * X_host
# X_donor' == theta_donor(t) - lambda * X_donor
# Modifying the data frame to contain transformed cell counts and donor fractions
pop_int <- grepl('SP', pop_interest)
if (pop_int == TRUE){
data_pop <- dataframe1 %>%
filter(population == pop_interest) %>%
filter(donor_fractions <= 1.5) %>%
mutate(cell_counts.log = log(cell_counts),
#donor_fractions.logit = logitTransformation(donor_fractions),
#donor_fractions.asin = AsinTransformation(donor_fractions),
age.at.t0 = age.at.bmt + 12,
days.post.t0 = mouse.age.days - age.at.t0)%>%
select(-contains('bmt'))
} else {
data_pop <- dataframe2 %>%
filter(population == pop_interest) %>%
filter(donor_fractions <= 1.5) %>%
mutate(cell_counts.log = log(cell_counts),
#donor_fractions.logit = logitTransformation(donor_fractions),
#donor_fractions.asin = AsinTransformation(donor_fractions),
age.at.t0 = age.at.bmt + 12,
days.post.t0 = mouse.age.days - age.at.t0)%>%
select(-contains('bmt'))
}
#data_pop[is.na(data_pop)] <- 3
# Sorting the data set in the ascending order of time series
data_pop <- data_pop[order(data_pop$age.at.t0),]
#catgorising age.at.bmt of host mice into distinct age groups to visualise data better
data_pop[, "age_group"] <- NULL
for(i in 1:nrow(data_pop)) {
if(data_pop[i, "age.at.t0"] < 64 )
data_pop[i, "age_group"] <- "age group1: <8 weeks"
else if (data_pop[i, "age.at.t0"] < 124)
data_pop[i, "age_group"] <- "age group2: 8 - 16 weeks"
else if (data_pop[i, "age.at.t0"] > 124)
data_pop[i, "age_group"] <- "age group4: >16 weeks"
}
#X0 = total cell count at the earliest BMT - t0
#phi_0 = number cells maturing into FM at BMT
#Subpopulations to consider : host and donor
#Donor population (displaceable)
Xd <- function(t,tb,p0,l) {
(chi* 10^p0 *exp(-v*(tb-ta))* (exp((l-v)*(t-tb))-1) + (l-v) * Xd_tb(tb,p0,l))/((l-v) * exp(l*(t-tb)))
}
#host populations
#Host population (displacebale)
Xh <- function(t,tb,p0,l) {
((1-chi) * 10^p0 *exp(-v*(tb-ta))* (exp((l-v)*(t-tb))-1) + (l-v) * Xh_tb(tb,p0,l))/((l-v) * exp(l*(t-tb)))
}
## total cell counts
# Initial total naive T cell population at different aBMTs
X_tb <- function(tb,X0,p0,l){
((10^p0*(exp((l-v)*(tb-ta))-1)) + ((l-v) * 10^X0)) / ((l-v) * exp(l*(tb-ta)))
}
X_total <- function(t,tb,X0,p0,l) {
((10^p0*exp(-v*(tb-ta))*(exp((l-v)*(t-tb))-1)) + ((l-v) * X_tb(tb,X0,p0,l))) / ((l-v) * exp(l*(t-tb)))
}
## Normalised donor fraction
# donor fraction at later timepoint other than the earliest bmt
fd_tb <- function(tb,X0,fd0,p0,l){
(10^X0 * fd0 * exp(-v*(tb-ta)))/X_tb(tb,X0,p0,l)
}
# Normalised donor fraction as a function of days post bmt ~ d
fd <- function(d,tb,X0,fd0,p0,l) {
((10^p0*exp(-v*(tb-ta))*(exp((l-v)* d) -1)) + ((l-v) * fd_tb(tb,X0,fd0,p0,l) * X_tb(tb,X0,p0,l))) /
((10^p0*exp(-v*(tb-ta))*(exp((l-v)*d)-1)) + ((l-v) * X_tb(tb,X0,p0,l)))
}
# total cell counts for donor fraction calculation using days post bmt rather than mouse age in days
# mosue age in days can not be used to plot donor fractions since normalised doonr fraction is the function
# of days post bmt and if plotted against mouse age days may not lead to complete equilibration of
# donor fraction to 1
# parameters and initial conditions
ta = 52
#Unknowns - lambda=l, b, N0, fd(0)
LL_func <- function(param, dataset) {
X0 <- param[1]
fd0 <- param[2]
p0 <- param[3]
l <- param[4]
k <- length(param) #numner of unknown parameters
n1 <- nrow(dataset) #number of observations in dataset1
n2 <- nrow(dataset) #number of observations in dataset2
n <- n1+n2
R1 <- sum((dataset$cell_counts.log - log(X_total(dataset$mouse.age.days, dataset$age.at.t0,X0,p0,l)))^2) #SSR for dataset1
R2 <- sum((dataset$donor_fractions - (fd(dataset$days.post.t0, dataset$age.at.t0,X0,fd0,p0,l)))^2) #SSR for dataset2
#log-likelihood ignoring all the terms dependent only on the number of observations n
#matrix multipltication of residual and transpose of residuals
logl <- -(n1/2)*log(R1) - (n2/2)*log(R2)
return(-logl) #since optim minimizes the function by default, ML
}
fit_LLpop <- optim(par=params, fn=LL_func, dataset = data_pop, hessian = TRUE)
fit_LLpop
AIC <- 2 * length(fit_LLpop$params) - 2 * fit_LLpop$value
AIC <- paste('AIC is', AIC)
output <- c('log10(N0)', 'fd0', 'log10(theta0)', 'lambda')
#confidence intervals on the parameter estimates from hessian matrix
fisher_info_LL <- solve(fit_LLpop$hessian) #covariance matrix of the estimates is (asymptotically) the inverse of the Hessian
prop_sigma_LL <- sqrt(diag(fisher_info_LL)) #standard errors are the square roots of the diagonal elements of the covariance
upper_CI_LL <- fit_LLpop$par + 1.96 * prop_sigma_LL #upper bound on parameter estimates
lower_CI_LL <- fit_LLpop$par - 1.96 * prop_sigma_LL #lower bound on parameter estimates
ci_interval_LL <- data.frame(output = output, value=fit_LLpop$par, lower_CI=lower_CI_LL, upper_CI=upper_CI_LL)
output3 <- ci_interval_LL
#ggplot
ts <- seq(50,500,1)
ds <- seq(0,300,1)
p2 <- ggplot() +
geom_point(data =data_pop, aes(mouse.age.days, y=cell_counts, color=age_group), alpha=0.8, size=3) +
scale_color_manual(values= c("#ea2745", "#28af46", "#356af2", "#8637cc"), name = "Host age at BMT")+
labs(title=paste('fitted cell counts:', pop_interest), y=NULL, x="mouse age (days)") + xlim(0,500) +
geom_line(aes(x=ts, y=X_total(ts,68, fit_LLpop$par[1], fit_LLpop$par[3], fit_LLpop$par[4])), color ="darkblue", size=1.2)+
scale_y_continuous(limits=c(1e4, 1e7), trans="log10", breaks=c(1e6, 1e7, 1e8), minor_breaks = log10minorbreaks, labels =fancy_scientific) +
theme_bw()+ guides(color = FALSE)+ theme(axis.text = element_text(size = 24),
axis.title = element_text(size = 22, face = "bold"),
panel.background = element_rect(colour = "black"),
plot.title = element_text(size=22, hjust = 0.5),
legend.text = element_text(size=22),
legend.title = element_text(size = 22))
p3 <- ggplot() +
geom_point(data =data_pop, aes(days.post.t0, y=donor_fractions, color=age_group), alpha=0.7, size=3) +
geom_hline(yintercept=1, color="gray40", size=1, linetype=2) +
scale_color_manual(values= c("#ea2745", "#28af46", "#356af2", "#8637cc"), name = "Host age at BMT")+
labs(title=paste('fitted donor fractions:', pop_interest), y=NULL, x="days post BMT") +
geom_line(aes(x=ds, y=fd(ds,61, fit_LLpop$par[1], fit_LLpop$par[2], fit_LLpop$par[3], fit_LLpop$par[4])), alpha=0.8, color="#ea2745", size=1.2)+
geom_line(aes(x=ds, y=fd(ds,85, fit_LLpop$par[1], fit_LLpop$par[2], fit_LLpop$par[3], fit_LLpop$par[4])), alpha=0.8, color="#28af46", size=1.2)+
geom_line(aes(x=ds, y=fd(ds,147, fit_LLpop$par[1], fit_LLpop$par[2], fit_LLpop$par[3], fit_LLpop$par[4])), alpha=0.8, color="#356af2", size=1.2)+
scale_y_continuous(limits=c(0,1.2), breaks=c(0,0.3,0.6,0.9,1.2))+
theme_bw()+ guides(color = FALSE)+ theme(axis.text = element_text(size = 24),
axis.title = element_text(size = 22, face = "bold"),
panel.background = element_rect(colour = "black"),
plot.title = element_text(size=22, hjust = 0.5),
legend.text = element_text(size=22),
legend.title = element_text(size = 22))
# model diagnostics
#residuals plot
rsd.counts.log <- data_pop$cell_counts.log - log(X_total(data_pop$mouse.age.days, data_pop$age.at.t0, fit_LLpop$par[1], fit_LLpop$par[3],
fit_LLpop$par[4]))
p41 <- ggplot()+
geom_point(aes(x= data_pop$mouse.age.days, y=rsd.counts.log ))+
labs(title= paste("Residuals plot cell counts", pop_interest), y="residuals", x="mouse age (days)") +
geom_hline(yintercept=0, color="darkred") +theme_bw()
p42 <- ggqqplot(rsd.counts.log)
rsd.fd <- data_pop$donor_fractions - (fd(data_pop$days.post.t0, data_pop$age.at.t0,fit_LLpop$par[1], fit_LLpop$par[2],
fit_LLpop$par[3], fit_LLpop$par[4]))
p43 <- ggplot()+
geom_point(aes(x=data_pop$days.post.t0, y=rsd.fd))+
labs(title=paste("Residuals plot donor fractions", pop_interest), y="residuals", x="days post t0") +
geom_hline(yintercept=0, color="darkred") +theme_bw()
p44 <- ggqqplot(rsd.fd)
p4 <- ggarrange(p41,p42,p43,p44, labels = c('A','B','C','D'), ncol = 2, nrow = 2)
warn <- print('check model diagnostics if necessary')
if (is.null(mod_diag)) {
return(list(AIC,output3, p1, p2, p3, warn))
} else if (mod_diag == TRUE) {
return(list(AIC,output3, p1, p2, p3, p4))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.