R/General_Bcell.R

#' Fit the Neutral model to B-cell data from busulfan chimeras
#'
#' This function allows to compare model behaviour and statistics for different developmental pathways in B cell compartment using data from BuChi.
#'
#' The basic model is as follws:
#' \deqn{X'(t) = \theta(t) - \lambda X(t)}
#'
#' Neutral model assumes 'homogeneity' in the population of interest considering that all the subsets turnover with a constant, identical rate.
#' Neutral_func solves the system of linear first order ODEs for displaceable donor and host population and fits the 'Neutral model' to the cell counts and normalised donor fractions.
#' Influx of cells from parent population (\eqn{\theta}) changes exponentially with time (mouse age) with the rate \eqn{\nu}.
#' \deqn{\theta(t) = \theta_0 e^-\nu t}
#'
#' Needs the dataframes for Spleen and LN counts in the directory.
#' Data is fitted using Nealder-Mead algorithm in optim.
#' Currently only fits on log transformed cell counts and untransformed donor fractions.
#' 95\% CI is generated using hessian matrix in optim (not by bootstrapping).
#' if mod_diag is set to TRUE, the function will validate whether the best-fit model is behaving normally and is following the basic assumptions of least-square method.
#'
#'
#' @param par_pop column name of the parent population (chracter string e.g. 'SP_T1' or 'LN_FM')
#' @param pop_interest column name of the population of interest (chracter string e.g. 'SP_T1' or 'LN_GC')
#' @param params a vector containing initial parameter values in order c(N0, fd0, theta0, lambda)
#' @param mod_diag Boolean. If TRUE returns residual and QQ plots. By default FALSE.
#'
#'
#' @return AIC value for statistical comparison
#' @return a dataframe containing parameter values with 95\% CI
#' @return plot1 - spline fitted to parent population using linear regression on log cell counts.
#' @return plot2 - Nutral model fitted to cell counts
#' @return plot3 - Neutral model fitted to donor fractions
#' @return plot4 - returns residual and QQ plots in a single page (only if mod_diag == TRUE)
#'
#' @examples neutral_func(compartment= 'Spleen', par_pop = 'SP_T1', pop_interest = 'SP_FM', params = c(7,0.1,5,0.01), mod_diag = TRUE)
#' @export

neutral_func <- function(par_pop, pop_interest, params, mod_diag = NULL){

fancy_scientific <- function(l) {
    # turn in to character string in scientific notation
    l <- format(l, scientific = TRUE)
    # quote the part before the exponent to keep all the digits
    l <- gsub("^(.*)e", "'\\1'e", l)
    # remove + after exponent, if exists. E.g.: (e^+2 -> e^2)
    l <- gsub("e\\+","e",l)
    # turn the 'e' into plotmath format
    l <- gsub("e", "%*%10^", l)
    # convert 1x10^ or 1.000x10^ -> 10^
    l <- gsub("\\'1[\\.0]*\\'\\%\\*\\%", "", l)
    # return this as an expression
    parse(text=l)
  }

log10minorbreaks=as.numeric(1:10 %o% 10^(4:8))

# reading the data set from a CSV file
dataframe1 <- read.csv('B_merge_spleen.csv')
dataframe2 <- read.csv('B_merge_ln.csv')

# removing column of indices
dataframe1 <- dataframe1[,-1]
dataframe2 <- dataframe2[,-1]

parpop <- grepl('SP', par_pop)


# fitting a linear model to the parent population of the cell type under study to estimate the rate of input
# selecting the right dataset to depending on the argument par_pop
# Then selcting dataset containing only the counts of parent population and time (mouse age)

if (parpop == TRUE){
  parent_pop <- filter(dataframe1, dataframe1$population == par_pop)%>%
    select(contains('days'), contains('counts'))
} else {
  parent_pop <- filter(dataframe1, dataframe1$population == par_pop)%>%
    select(contains('days'), contains('counts'))
}

# using aov function to perform linear regression on the log cell counts of parent population
# linear regression of log cell counts assumes exponential decay of parent population with time
ParPop_LR <- aov(log(cell_counts) ~ mouse.age.days, data = parent_pop)

summary(ParPop_LR)
coef(ParPop_LR)
fitted(ParPop_LR)
confint(ParPop_LR)

# definig a function to depict exponential decay of the population
phi_fun <- function(t,v){

  # initial value of the cell counts of the population of interest
  # estimated as the intercept in linear regression 'phi_0'
  phi_0 = exp(coef(ParPop_LR)[1])

  # the slope is also estimated from linear regression 'v'
  phi_0 * exp(-v*(t))
}

# the slope is also estimated from linear regression 'v'
v= -coef(ParPop_LR)[2]
# the half life of the parent population according to thee fitted spline
log(2)/v

# defining timescale for the parent population decay
ts <- seq(50,500)

p1 <- ggplot()+
  geom_point(data =  parent_pop, aes(x= mouse.age.days, y= cell_counts), size=3)+
  geom_line(aes(x= ts, phi_fun(ts,v)), color = "orange", size=1.25)+ scale_y_log10(limits=c(1e4,1e7))+
  #geom_line(data =  parent_pop, aes(x= mouse.age.days, y= exp(fitted(ParPop_LR))), color='blue', size=1)+
  labs(title = paste('spline:', par_pop), y='cell counts', x="days post bmt") +
  theme_light() + theme(axis.text = element_text(size = 20),
                        axis.title =  element_text(size = 16, face = "bold"),
                        panel.background = element_rect(colour = "black"),
                        plot.title = element_text(size=20, hjust = 0.5),
                        legend.text = element_text(size=22),
                        legend.title = element_blank())


output1 <- paste('predicted rate of decline  =', v, ' (', -confint(ParPop_LR)[2,1], ', ', -confint(ParPop_LR)[2,1], ')', sep = '')
output2 <- paste('predicted cell counts at birth = ', exp(coef(ParPop_LR)[1])/10^6, ' (', exp(confint(ParPop_LR)[1,1])/10^6, ', ', exp(confint(ParPop_LR)[1,2])/10^6, ')', sep = '')


# the system of ODEs for host and donor populations in BuChi
# X_host' == theta_host(t) - lambda * X_host
# X_donor' == theta_donor(t) - lambda * X_donor

# Modifying the data frame to contain transformed cell counts and donor fractions
pop_int <- grepl('SP', pop_interest)

if (pop_int == TRUE){
  data_pop <- dataframe1 %>%
    filter(population == pop_interest) %>%
    filter(donor_fractions <= 1.5) %>%
    mutate(cell_counts.log = log(cell_counts),
           #donor_fractions.logit = logitTransformation(donor_fractions),
           #donor_fractions.asin = AsinTransformation(donor_fractions),
           age.at.t0 = age.at.bmt + 12,
           days.post.t0 = mouse.age.days - age.at.t0)%>%
    select(-contains('bmt'))
} else {
  data_pop <- dataframe2 %>%
    filter(population == pop_interest) %>%
    filter(donor_fractions <= 1.5) %>%
    mutate(cell_counts.log = log(cell_counts),
           #donor_fractions.logit = logitTransformation(donor_fractions),
           #donor_fractions.asin = AsinTransformation(donor_fractions),
           age.at.t0 = age.at.bmt + 12,
           days.post.t0 = mouse.age.days - age.at.t0)%>%
    select(-contains('bmt'))
}



#data_pop[is.na(data_pop)] <- 3
# Sorting the data set in the ascending order of time series
data_pop <- data_pop[order(data_pop$age.at.t0),]


#catgorising age.at.bmt of host mice into distinct age groups to visualise data better
data_pop[, "age_group"] <- NULL

for(i in 1:nrow(data_pop)) {
  if(data_pop[i, "age.at.t0"] < 64 )
    data_pop[i, "age_group"] <- "age group1: <8 weeks"
  else if (data_pop[i, "age.at.t0"] < 124)
    data_pop[i, "age_group"] <- "age group2: 8 - 16 weeks"
  else if (data_pop[i, "age.at.t0"] > 124)
    data_pop[i, "age_group"] <- "age group4: >16 weeks"
}


#X0 = total cell count at the earliest BMT - t0
#phi_0 = number cells maturing into FM at BMT

#Subpopulations to consider : host and donor
#Donor population (displaceable)
Xd <- function(t,tb,p0,l) {
  (chi* 10^p0 *exp(-v*(tb-ta))* (exp((l-v)*(t-tb))-1) + (l-v) * Xd_tb(tb,p0,l))/((l-v) * exp(l*(t-tb)))
}

#host populations
#Host population (displacebale)
Xh <- function(t,tb,p0,l) {
  ((1-chi) * 10^p0 *exp(-v*(tb-ta))* (exp((l-v)*(t-tb))-1) + (l-v) * Xh_tb(tb,p0,l))/((l-v) * exp(l*(t-tb)))
}

## total cell counts

# Initial total naive T cell population at different aBMTs
X_tb <- function(tb,X0,p0,l){
  ((10^p0*(exp((l-v)*(tb-ta))-1)) + ((l-v) * 10^X0)) / ((l-v) * exp(l*(tb-ta)))
}

X_total <- function(t,tb,X0,p0,l) {
  ((10^p0*exp(-v*(tb-ta))*(exp((l-v)*(t-tb))-1)) + ((l-v) * X_tb(tb,X0,p0,l))) / ((l-v) * exp(l*(t-tb)))
}

## Normalised donor fraction

# donor fraction at later timepoint other than the earliest bmt
fd_tb <- function(tb,X0,fd0,p0,l){
  (10^X0 * fd0 * exp(-v*(tb-ta)))/X_tb(tb,X0,p0,l)
}

# Normalised donor fraction as a function of days post bmt ~ d
fd <- function(d,tb,X0,fd0,p0,l) {
  ((10^p0*exp(-v*(tb-ta))*(exp((l-v)* d) -1)) + ((l-v) * fd_tb(tb,X0,fd0,p0,l) * X_tb(tb,X0,p0,l))) /
    ((10^p0*exp(-v*(tb-ta))*(exp((l-v)*d)-1)) + ((l-v) * X_tb(tb,X0,p0,l)))
}

# total cell counts for donor fraction calculation using days post bmt rather than mouse age in days
# mosue age in days can not be used to plot donor fractions since normalised doonr fraction is the function
# of days post bmt and if plotted against mouse age days may not lead to complete equilibration of
# donor fraction to 1


# parameters and initial conditions
ta = 52
#Unknowns - lambda=l, b, N0, fd(0)

LL_func <- function(param, dataset) {
  X0   <- param[1]
  fd0  <- param[2]
  p0   <- param[3]
  l    <- param[4]

  k  <- length(param)           #numner of unknown parameters
  n1 <- nrow(dataset)           #number of observations in dataset1
  n2 <- nrow(dataset)           #number of observations in dataset2
  n  <- n1+n2

  R1 <- sum((dataset$cell_counts.log - log(X_total(dataset$mouse.age.days, dataset$age.at.t0,X0,p0,l)))^2)   #SSR for dataset1
  R2 <- sum((dataset$donor_fractions - (fd(dataset$days.post.t0, dataset$age.at.t0,X0,fd0,p0,l)))^2)  #SSR for dataset2


  #log-likelihood ignoring all the terms dependent only on the number of observations n
  #matrix multipltication of residual and transpose of residuals
  logl <- -(n1/2)*log(R1) - (n2/2)*log(R2)

  return(-logl)     #since optim minimizes the function by default, ML
}

fit_LLpop <- optim(par=params, fn=LL_func, dataset = data_pop, hessian = TRUE)
fit_LLpop
AIC <- 2 * length(fit_LLpop$params) - 2 * fit_LLpop$value
AIC <- paste('AIC is', AIC)

output <- c('log10(N0)', 'fd0', 'log10(theta0)', 'lambda')


#confidence intervals on the parameter estimates from hessian matrix
fisher_info_LL <- solve(fit_LLpop$hessian)  #covariance matrix of the estimates is (asymptotically) the inverse of the Hessian
prop_sigma_LL <- sqrt(diag(fisher_info_LL))  #standard errors are the square roots of the diagonal elements of the covariance
upper_CI_LL <- fit_LLpop$par + 1.96 * prop_sigma_LL  #upper bound on parameter estimates
lower_CI_LL <- fit_LLpop$par - 1.96 * prop_sigma_LL  #lower bound on parameter estimates
ci_interval_LL <- data.frame(output = output, value=fit_LLpop$par, lower_CI=lower_CI_LL, upper_CI=upper_CI_LL)

output3 <- ci_interval_LL


#ggplot
ts <- seq(50,500,1)
ds <- seq(0,300,1)

p2 <- ggplot() +
  geom_point(data =data_pop, aes(mouse.age.days, y=cell_counts, color=age_group), alpha=0.8, size=3) +
  scale_color_manual(values= c("#ea2745", "#28af46", "#356af2", "#8637cc"), name = "Host age at BMT")+
  labs(title=paste('fitted cell counts:', pop_interest),  y=NULL, x="mouse age (days)") + xlim(0,500) +
  geom_line(aes(x=ts, y=X_total(ts,68, fit_LLpop$par[1],  fit_LLpop$par[3], fit_LLpop$par[4])), color ="darkblue", size=1.2)+
  scale_y_continuous(limits=c(1e4, 1e7),  trans="log10", breaks=c(1e6, 1e7, 1e8), minor_breaks = log10minorbreaks, labels =fancy_scientific) +
  theme_bw()+ guides(color = FALSE)+ theme(axis.text = element_text(size = 24),
                                           axis.title =  element_text(size = 22, face = "bold"),
                                           panel.background = element_rect(colour = "black"),
                                           plot.title = element_text(size=22,  hjust = 0.5),
                                           legend.text = element_text(size=22),
                                           legend.title = element_text(size = 22))



p3 <- ggplot() +
  geom_point(data =data_pop, aes(days.post.t0, y=donor_fractions, color=age_group), alpha=0.7, size=3) +
  geom_hline(yintercept=1, color="gray40", size=1, linetype=2) +
  scale_color_manual(values= c("#ea2745", "#28af46", "#356af2", "#8637cc"), name = "Host age at BMT")+
  labs(title=paste('fitted donor fractions:', pop_interest), y=NULL, x="days post BMT") +
  geom_line(aes(x=ds, y=fd(ds,61, fit_LLpop$par[1], fit_LLpop$par[2], fit_LLpop$par[3], fit_LLpop$par[4])), alpha=0.8, color="#ea2745", size=1.2)+
  geom_line(aes(x=ds, y=fd(ds,85, fit_LLpop$par[1], fit_LLpop$par[2], fit_LLpop$par[3], fit_LLpop$par[4])), alpha=0.8, color="#28af46", size=1.2)+
  geom_line(aes(x=ds, y=fd(ds,147, fit_LLpop$par[1], fit_LLpop$par[2], fit_LLpop$par[3], fit_LLpop$par[4])), alpha=0.8, color="#356af2", size=1.2)+
  scale_y_continuous(limits=c(0,1.2),  breaks=c(0,0.3,0.6,0.9,1.2))+
  theme_bw()+ guides(color = FALSE)+ theme(axis.text = element_text(size = 24),
                                           axis.title =  element_text(size = 22, face = "bold"),
                                           panel.background = element_rect(colour = "black"),
                                           plot.title = element_text(size=22,  hjust = 0.5),
                                           legend.text = element_text(size=22),
                                           legend.title = element_text(size = 22))



# model diagnostics

#residuals plot
rsd.counts.log <- data_pop$cell_counts.log - log(X_total(data_pop$mouse.age.days, data_pop$age.at.t0, fit_LLpop$par[1],  fit_LLpop$par[3],
                                                         fit_LLpop$par[4]))
p41 <- ggplot()+
  geom_point(aes(x= data_pop$mouse.age.days, y=rsd.counts.log ))+
  labs(title= paste("Residuals plot cell counts", pop_interest), y="residuals", x="mouse age (days)") +
  geom_hline(yintercept=0, color="darkred") +theme_bw()

p42 <- ggqqplot(rsd.counts.log)

rsd.fd <- data_pop$donor_fractions - (fd(data_pop$days.post.t0, data_pop$age.at.t0,fit_LLpop$par[1], fit_LLpop$par[2],
                                         fit_LLpop$par[3], fit_LLpop$par[4]))
p43 <- ggplot()+
  geom_point(aes(x=data_pop$days.post.t0, y=rsd.fd))+
  labs(title=paste("Residuals plot donor fractions", pop_interest), y="residuals", x="days post t0") +
  geom_hline(yintercept=0, color="darkred") +theme_bw()


p44 <- ggqqplot(rsd.fd)

p4 <- ggarrange(p41,p42,p43,p44, labels = c('A','B','C','D'), ncol = 2, nrow = 2)

warn <- print('check model diagnostics if necessary')


if (is.null(mod_diag)) {

  return(list(AIC,output3, p1, p2, p3, warn))
} else if (mod_diag == TRUE) {

  return(list(AIC,output3, p1, p2, p3, p4))
}
}
sanketrane/SRBcell documentation built on May 29, 2019, 9:11 a.m.