inst/visualise_results.R

# Note that this function only works by printing to a pdf. To print to screen, comment out
# the first line and execute in parts, noting that dev.off() is only activated if an existing
# pdf is open. Moreover, some functions might throw an error due to poor mixing, hence are
# wrapped in try().
visualise_results <- function(results,                  # list; results saved in .RData file generated by mcmc_sampling
                              data_summary,             # list; data .RData file
                              PDF,                      # logical; PDF = TRUE to plot PDF
                              child,                    # logical; child = TRUE if results are generated by mcmc_sampling_child
                              augment_missing_data,     # logical; if true, missing entries are inferred (only if prevalence data or the log likelihood is set to zero (QC check), in which case, always TRUE)
                              Simulated)                # logical; true if data are simulated
{
  #======================================================================================================
  # Pdf and plotting (comment out if plotting manually)
  if(class(PDF) == 'character'){pdf(PDF)}else{stop('Need to plot by hand')}
  #======================================================================================================

  #======================================================================================================
  # Install packages if not already
  installed_packages <- rownames(installed.packages())
  desired_packages <- c('MCMCpack',# For correlation plots
                        'car',     # for conversion between an array and list
                        'plyr',    # For text plot and confidence intervals
                        'gplots',  # For moi and haplotype count plots
                        'lattice', # To generate mcmcAs
                        'vioplot', # For posterior frequency plots
                        'abind',
                        'grDevices')
  uninstalled_desired_packages <- desired_packages[!desired_packages %in% installed_packages]
  if(length(uninstalled_desired_packages) > 0){
    install.packages(uninstalled_desired_packages)
  }
  #======================================================================================================


  #======================================================================================================
  # Load packages if not already
  loaded_packages <- search()
  unloaded_desired_packages <- desired_packages[!paste('package:', desired_packages, sep = '') %in% loaded_packages]
  if(length(unloaded_desired_packages) > 0){
    for(i in 1:length(unloaded_desired_packages)){
      require(unloaded_desired_packages[i], character.only = TRUE)
    }
  }
  #======================================================================================================

  #======================================================================================================
  # Extract meta data
  haplotypes <- processed_data_list$comp_genotypes_character
  no_haplotypes <- processed_data_list$no_haplotypes
  no_markers <- processed_data_list$no_marker_loci
  no_total <- processed_data_list$no_total
  #======================================================================================================


  #======================================================================================================
  # Burnin and pdf thinning (for managable pdf size)
  burnin <- 1:(0.5*mcmc_variable_list$no_traces_preburnin) # remove first half
  plot_thin <- 10
  plot_thin_preburnin <- seq(1, mcmc_variable_list$no_traces_preburnin, plot_thin) # To further thin traces that include burnin
  plot_thin_postburnin <- seq(1, mcmc_variable_list$no_traces_preburnin-max(burnin), plot_thin) # To further thin traces that don't include burnin
  plot_thin_preburnin_noburnin <- seq((max(burnin)+1), mcmc_variable_list$no_traces_preburnin, plot_thin)
  #======================================================================================================

  #========================================================================================================
  # Generate posteriors of interest
  if(mcmc_variable_list$no_mcmc_chains > 1){
    alply_genotype_freq_store_chains_burnin <- alply(results$genotype_freq_store_chains[-burnin,,],3)
  }else{
    alply_genotype_freq_store_chains_burnin <- results$genotype_freq_store_chains[-burnin,,]
  }

  mcmc_frequency_chains <- mcmc.list(lapply(alply_genotype_freq_store_chains_burnin, # 3 for splitting by third dimension (nothing to do with no. of chains)
                                            mcmc,
                                            start = (max(burnin)+1)*mcmc_variable_list$thinning_interval,
                                            end = mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval,
                                            thin = mcmc_variable_list$thinning_interval))

  mcmc_As <- abind(alply(results$genotype_count_store_chains[-burnin,,,],4), along = 1) # Haplotype counts for all chains excluding burn in
  mcmc_mois <- apply(mcmc_As,c(1,2),sum)
  #========================================================================================================

  #===========================================================================================
  # Summary of run variables for text summary (might want to add some more)
  raw_data_plus_latent <- data_summary$Data[,1:no_markers,drop=FALSE]

  ind_partial_initial <- apply(raw_data_plus_latent,1,function(X){99%in%X})

  if(child){
    no_children_TempVar <- strsplit(rownames(raw_data_plus_latent), ':')
    childID <- unique(matrix(unlist(no_children_TempVar), ncol = 2, byrow = TRUE)[,1])
    no_children <- length(childID)
  } else {
    no_children <- no_total
  }

  Variables <- as.matrix(c(no_markers, no_children, no_total, sum(apply(raw_data_plus_latent,1,function(X){99%in%X})),
                           mcmc_variable_list$thinning_interval * mcmc_variable_list$no_traces_preburnin,
                           mcmc_variable_list$thinning_interval, max(burnin)*mcmc_variable_list$thinning_interval))

  rownames(Variables) <- c('No. marker loci','No. children',
                           'No. blood samples','No. partial observations',
                           'No. mcmc iterations overall','Thinning interval','Burn in overall'); colnames(Variables)<-'Value'

  Data_to_print <- as.matrix(table(apply(raw_data_plus_latent,1,paste,sep=' ',collapse=' '))) # Summary of data
  colnames(Data_to_print)<-'No of observations'
  EffSize_pi<-as.matrix(round(effectiveSize(mcmc_frequency_chains),0),ncol=1) # Effective size of the sample (after burnin)
  colnames(EffSize_pi) <- 'Effective posterior sample size'
  #===========================================================================================

  #=====================================================================================================
  # Extract the simulated samples of freq and moi (simulated haploytype counts not used in plots)
  if(Simulated){
    pi_simulated_sample <- data_summary$Sample_haplotype_frequencies[,haplotypes ]
    if(augment_missing_data){
      moi_simulated_sample = data_summary$Data[,'MOI']
    } else {
      moi_simulated_sample = data_summary$Data[!ind_partial_initial,'MOI']}
  } else {
    moi_simulated_sample <- rep(Inf, no_total)
  }
  #=====================================================================================================

  #=====================================================================================================
  # Values for plotting mois and genotype counts and chain cols
  randomly_selected_bloodsamples <- sample(no_total, size = min(no_total, 16))
  mini_title_names <- paste(dimnames(mcmc_As)[[2]],apply(raw_data_plus_latent, 1, paste, sep = ' ', collapse=' '), sep=', ')
  chain_cols <- rainbow(mcmc_variable_list$no_mcmc_chains, alpha = 0.5)
  #=====================================================================================================

  #===========================================================================================
  # Print data summary and effective size to the screen/file
  par(mfcol=c(2,2), mar = c(5,4,1,1))
  textplot(Variables)
  textplot(as.matrix(Data_to_print))
  textplot(EffSize_pi)
  par(mfrow = c(1,1)) # reset
  #===========================================================================================
  #========================================================================================================
  # Classic diagnostic trace plots (burnin removed hence start and end arguments passed to mcmc function via lapply - see xlab of trace plots)
  plot(mcmc_frequency_chains, ask = FALSE, bty = 'n', col = chain_cols, smooth = FALSE)
  try(gelman.plot(mcmc_frequency_chains, transform = FALSE, ask = FALSE, bty = 'n')) # Plot of gelman diagnostic
  autocorr.plot(mcmc_frequency_chains,lag.max=50, ask = FALSE, bty = 'n')
  #========================================================================================================

  #========================================================================================================
  # Plot true values against estimate (having excluded burnin)
  par(mfrow = c(1,1))
  if(Simulated){ # If simulated data

    pi_med <- summary(mcmc_frequency_chains)$quantiles[,3]
    CI_lower<-summary(mcmc_frequency_chains)$quantiles[,1]
    CI_upper<-summary(mcmc_frequency_chains)$quantiles[,5]
    options(warn = -1) # to supress warnings
    plotCI(x = data_summary$Sample_haplotype_frequencies[,haplotypes],
           y = pi_med[haplotypes],ui=CI_upper[haplotypes],li=CI_lower[haplotypes],
           xlab='Sample haplotype frequency',ylab='Estimated haplotype frequency',
           xlim=c(0,1),ylim=c(0,1), gap = 0,
           barcol='grey',pch=19,pt.bg=par("bg"),cex = 0.3,
           bty = 'n')
    lines(seq(0,1,0.1),seq(0,1,0.1))
  }
  #========================================================================================================
  #========================================================================================================
  # Violin plot
  # Generate samples from the dirichlet prior for overlaying when mcmc_variable_list$log_like_zero is true
  priorsamples <- rdirichlet(1000, frequency_list$frequency_hyperparameter) # frequency_hyperparameter
  colnames(priorsamples) <- haplotypes
  plot(NULL, xlim=c(0.5,no_haplotypes + 0.5), ylim=c(0,1.05), xaxt = 'n', yaxt = 'n', xlab = 'Haplotypes', ylab = 'Frequency', cex.lab = 1.2)

  for(chain in 1:mcmc_variable_list$no_mcmc_chains){
    for(haplotype in haplotypes){
      if(mcmc_variable_list$log_like_zero){
        if(chain ==1){
          vioplot(priorsamples[,haplotype], at = which(haplotype == haplotypes), col = 'gray', add = TRUE)
        }
      }
      try(vioplot(results$genotype_freq_store_chains[-burnin, haplotype, chain], at = which(haplotype == haplotypes), col = chain_cols[chain], add = TRUE))
    }
  }
  if(!mcmc_variable_list$log_like_zero & Simulated){
    points(y = data_summary$Population_haplotype_frequencies[,haplotypes], x=1:no_haplotypes, pch = 19, col = 'black')
  }
  box(col = 'white', lwd = 6)
  axis(side = 1, at = 1:no_haplotypes, labels = haplotypes, tick = FALSE, cex.axis = 1, las = 2, line = -1)
  axis(side = 2, at = seq(0,1,0.5), labels = seq(0,1,0.5), cex.axis = 1.2)
  if(!mcmc_variable_list$log_like_zero & Simulated){legend('topleft', legend = c('Frequency used to simulate the data'), pch = c(19), col = 'black', bty = 'n', cex = 1)}
  legend('topright', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains),lty = 'solid', col = chain_cols[1:mcmc_variable_list$no_mcmc_chains], cex = 1, bty = 'n', lwd = 3)
  #========================================================================================================
  #========================================================================================================
  if(!mcmc_variable_list$log_like_zero & Simulated){
    # Violin relative to the simulated sammple frequency
    plot(NULL, xlim=c(0.5,no_haplotypes+0.5), ylim=c(0,1.05), xaxt = 'n', yaxt = 'n', xlab = 'Haplotypes', ylab = 'Frequency', cex.lab = 1.2)
    for(chain in 1:mcmc_variable_list$no_mcmc_chains){
      for(haplotype in haplotypes){
        try(vioplot(results$genotype_freq_store_chains[-burnin, haplotype, chain], at = which(haplotype == haplotypes), col = chain_cols[chain], add = TRUE))}}
    points(y = data_summary$Sample_haplotype_frequencies[,haplotypes],
           x = 1:no_haplotypes, pch = 19, col = 'black')
    box(col = 'white', lwd = 6)
    axis(side = 1, at = 1:no_haplotypes, labels = haplotypes, tick = FALSE, cex.axis = 1, las = 2, line = -1)
    axis(side = 2, at = seq(0,1,0.5), labels = seq(0,1,0.5), cex.axis = 1.2)
    if(Simulated){legend('topleft', legend = c('Simulated sample frequency'), pch = c(19), col = 'black', bty = 'n', cex = 1)}
    legend('topright', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains),lty = 'solid', col = chain_cols[1:mcmc_variable_list$no_mcmc_chains], cex = 1, bty = 'n', lwd = 3)}
  #========================================================================================================

  #========================================================================================================
  # Plot the likelihood overall (including burnin, although ylim based on post burnin)
  # Calculate log_likelihood by taking product over malaria episodes
  log_likelihood_store_chains_sum <- apply(results$log_likelihood_store_chains,c(1,3),sum)

  plot(NULL,  ylim = range(log_likelihood_store_chains_sum[-burnin,])-c(10, 0),
       xaxt = 'n', yaxt = 'n', xlim = c(0, mcmc_variable_list$no_traces_preburnin), bty ='n', ylab = 'Log likelihood',
       xlab = 'Trace', cex.lab = 1.2)

  Seq <- seq(min(log_likelihood_store_chains_sum[-burnin,]),
             max(log_likelihood_store_chains_sum[-burnin,]),
             length.out = 3)

  axis(side = 2, at = Seq, labels = round(Seq,0), cex.axis = 1.2)

  axis(side = 1, at = c(0,mcmc_variable_list$no_traces_preburnin), labels = c(0,mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval), cex.axis = 1.2)

  for(chain in 1:mcmc_variable_list$no_mcmc_chains){ # Add the likelihood

    lines(y = log_likelihood_store_chains_sum[plot_thin_preburnin, chain], x = plot_thin_preburnin,  col = chain_cols[chain])

  }

  legend('bottom',
         legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains),
         lty = 'solid', col = chain_cols, cex = 1, bty = 'n')

  title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
  #========================================================================================================

  #========================================================================================================
  # # Plot the posterior (including burnin, although ylim based on post burnin)
  # plot(NULL, ylim = range(results$log_posterior_store_chains[-burnin,])-c(10, 0), xaxt = 'n', yaxt = 'n',
  #      xlim = c(0, mcmc_variable_list$no_traces_preburnin), bty ='n', ylab = 'Log posterior', xlab = 'Trace', cex.lab = 1.2)
  #
  # Seq <- seq(min(results$log_posterior_store_chains[-burnin,]),
  #            max(results$log_posterior_store_chains[-burnin,]),
  #            length.out = 3)
  #
  # axis(side = 2, at = Seq, labels = round(Seq,0),cex.axis = 1.2)
  # axis(side = 1, at = c(0,mcmc_variable_list$no_traces_preburnin), labels = c(0, mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval), cex.axis = 1.2)
  #
  # for(chain in 1:mcmc_variable_list$no_mcmc_chains){
  #   lines(y = results$log_posterior_store_chains[plot_thin_preburnin, chain],
  #         x = plot_thin_preburnin,
  #         col = chain_cols[chain])
  # }
  # legend('bottom', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains),lty = 'solid', col = chain_cols, cex = 1, bty = 'n')
  #
  # title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
  #========================================================================================================

  #========================================================================================================
  # Traces (including burnin, although ylim based on post burnin)
  plot(NULL, ylim = c(0,1.1), xlim = c(0, mcmc_variable_list$no_traces_preburnin), bty = 'n', ylab = 'Frequency', xlab = 'Trace',
       cex.lab = 1.2, xaxt = 'n', yaxt = 'n')

  axis(side = 1, at = c(0,mcmc_variable_list$no_traces_preburnin), labels = c(0,mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval), cex.axis = 1.2)
  axis(side = 2, at = seq(0,1,length.out = 3), cex.axis = 1.2)

  for(chain in 1:mcmc_variable_list$no_mcmc_chains){
    for(i in 1:no_haplotypes){
      lines(y = as.mcmc(results$genotype_freq_store_chains[plot_thin_preburnin, i, chain]), x = plot_thin_preburnin, col = chain_cols[chain])
    }}

  if(Simulated & !mcmc_variable_list$log_like_zero){
    abline(h = data_summary$Population_haplotype_frequencies)
    legend('topleft', col = 'black', lty = c('solid'),
           legend = c('Frequency used to simulate the data'), cex = 1, bty = 'n', lwd = 1.5)
  }

  legend('topright', col = chain_cols, lty = 'solid', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains), cex = 1, bty = 'n', lwd = 1.5)
  title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
  #========================================================================================================

  #========================================================================================================
  # Denisty plots
  par(mfrow = c(min(4, no_haplotypes/2), 2))

  for(i in 1:no_haplotypes){
    plot(NULL, ylim = c(0, max(density(results$genotype_freq_store_chains[plot_thin_preburnin,i,])$y)+10),
         cex.lab = 1.2, cex.axis = 1.2, xlim = range(results$genotype_freq_store_chains[,i,]), bty = 'n',
         ylab = '', xlab = '')

    title(xlab = sprintf('Haplotype %s frequency',haplotypes[i]),
          ylab = 'Density', line = 2.5)

    for(chain in 1:mcmc_variable_list$no_mcmc_chains){
      lines(density(results$genotype_freq_store_chains[plot_thin_preburnin, i, chain]), col = chain_cols[chain])
    }

    legend('top', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains),lty = 'solid', col = chain_cols, cex = 1, bty = 'n')
    title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
  }
  par(mfrow = c(1,1)) # reset
  #========================================================================================================
  #========================================================================================================
  # Haplotype counts/mois acceptance rate per individual
  if(child){
    thinnin_seq <- plot_thin_preburnin_noburnin
  } else {
    thinnin_seq <- plot_thin_preburnin
  }

  plot(NULL, ylim = c(0,1), xlim = c(min(thinnin_seq),mcmc_variable_list$no_traces_preburnin), xlab = 'Trace', ylab = 'Acceptance rate',
       bty = 'n', cex.lab = 1.2, xaxt = 'n')

  axis(side = 1, at = c(min(thinnin_seq),mcmc_variable_list$no_traces_preburnin), labels = c(min(thinnin_seq),mcmc_variable_list$no_traces_preburnin)*mcmc_variable_list$thinning_interval)

  abline(h = 0, lty = 'dotted')

  for(individual in 1:no_total){
    for(chain in 1:mcmc_variable_list$no_mcmc_chains){
      lines(y = results$acceptance_store_chains[thinnin_seq, individual, chain], x = thinnin_seq, col = chain_cols[chain])
    }
  }
  legend('topright', col = chain_cols, lty = 'solid', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains), cex = 1, bty = 'n', lwd = 1.5)
  title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
  #========================================================================================================
  #   #========================================================================================================
  #   # Correlation (large)
  #   for(chain in 1:mcmc_variable_list$no_mcmc_chains){
  #     post_bin <- results$genotype_freq_store_chains[-unique(c(plot_thin_preburnin, burnin)),,chain]
  #     try(scatterplotMatrix(post_bin[sample(nrow(post_bin), min(100,nrow(post_bin))),]))
  #   }
  #   title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
  #   #========================================================================================================
  #=====================================================================================================
  # plot the haplotype aggregate - thinned by 10 and discluding burn in
  total_no_traces_postburnin <- dim(mcmc_As)[1]

  if(total_no_traces_postburnin > 1000){
    hap_thin <- 100
  }else{
    hap_thin <- 10
  }
  X<-array(NA, dim =c(total_no_traces_postburnin/hap_thin,
                      no_haplotypes,min(16,no_total)),
           dimnames = list(NULL,haplotypes,mini_title_names[randomly_selected_bloodsamples])) # Define X as a rehaplotypes ed version of Trace_genotype_counts in which to store thinned data with burn in disgarded

  for(i in 1:min(16, no_total)){ # Sample <=16 blood samples at random
    X[,,i]<-mcmc_As[seq(1,total_no_traces_postburnin, hap_thin),randomly_selected_bloodsamples[i],]
  } # Not sure if this is necessary anymore

  print(levelplot(X, # levelplot for array
                  aspect='fill',
                  xlab='Iteration',
                  ylab='Haplotype',
                  par.strip.text = list(cex=0.4), # Make pid label smaller
                  col.regions = gray(moi_list$moi_max:0/moi_list$moi_max),
                  cuts = moi_list$moi_max, # No. of moi cuts
                  sub = sprintf('Posterior for %s random blood samples, additionally thinned by %s, post burnin', min(no_total,16), hap_thin),
                  cex = 0.1,
                  cex.title = 0.1,
                  colorkey=list(space='right', # Colour bar parameters
                                col = gray(moi_list$moi_max:0/moi_list$moi_max),
                                at = 0:(moi_max+1),
                                width=0.4,
                                height=1,
                                labels=list(labels = paste('Count: ',0:moi_list$moi_max,sep=''),
                                            at = 0:moi_max+0.5,
                                            cex = 0.5)),
                  scales=list(x=list(tick.number=3, # Axis parameters
                                     label='',
                                     at=1,
                                     alternating=c(1,rep(0,1)), # Control labels
                                     tck=0,# Length of tick marks is zero
                                     rot=90,
                                     cex=0.4),# Rotate Labels
                              y=list(tick.number=no_haplotypes,
                                     alternating=c(1,rep(0,1)),
                                     tck=0,
                                     at=1:length(dimnames(X)[[2]]),
                                     labels=dimnames(X)[[2]],
                                     cex=0.3))))
  #=================================================================================================================
  #==================================================================================================================
  # Posterior distribution over genotypes for 16 randomly selected individuals
  # Remember, the simulated distribution is composed of one or more haplotypes if mixed, hence there is no 'true' haplotype;
  # rather, a combination of true haplotypes.
  haplotype_dataframe<-c()

  for(i in randomly_selected_bloodsamples){
    haplotype_dataframe <- rbind(haplotype_dataframe,
                                 data.frame(hap_counts = colSums(mcmc_As[,i,haplotypes]),
                                            hap_name = haplotypes,
                                            id = as.factor(mini_title_names[i])))}

  print(barchart(hap_counts ~ hap_name|id, data = haplotype_dataframe,
                 par.strip.text=list(cex=0.4), # Make id label smaller
                 stack=FALSE,
                 col='gray',
                 ylab='Haplotype count',
                 xlab='Haplotype',
                 sub = sprintf('Haplotype barchart, %s randomly selected blood samples, post burnin',min(no_total,16)),
                 cex.title=0.5,
                 scales=list(x=list(tck=0,# Length of tick marks is zero
                                    cex=0.6,
                                    rot=54),# Rotate Labels
                             y=list(tck=0,
                                    alternating=c(1,rep(0,1))))))
  #==================================================================================================================
  #==================================================================================================================
  # Plot the change in m for each individual - thinned by 10 and discluding burn in
  print(levelplot(mcmc_mois[seq(1,dim(mcmc_mois)[1],plot_thin), randomly_selected_bloodsamples], # Thin for plotting
                  aspect = 'fill',
                  xlab='Traces (burnin removed, chains collapsed, then further thinned (by 10))',
                  ylab='Blood sample ID',
                  col.regions=gray(7:0/7),
                  cuts=7, # No. of moi cuts
                  sub=sprintf('MOI posterior, %s blood samples, post burnin, additionally thinned by 10',min(no_total,16)),
                  cex.title=0.2,
                  colorkey=list(space='right', # Colour bar parameters
                                tick.number=9,
                                width=0.4,
                                height=1,
                                labels=list(cex=0.5,
                                            at = 1:moi_list$moi_max,
                                            labels=paste('MOI:',1:moi_list$moi_max,sep=''))),  # No MOI=0
                  scales=list(x=list(label='',
                                     at=1,
                                     tck=0,# Length of tick marks is zero
                                     rot=0,
                                     cex=0.6),# Rotate Labels
                              y=list(tck=0,
                                     labels = mini_title_names[randomly_selected_bloodsamples],
                                     cex=0.4))))
  #==================================================================================================================

  #==================================================================================================================
  # Plot the distribution of m for 16 randomly selected bloodsamples
  true_moi<-c()
  if(mcmc_variable_list$log_like_zero){moi_simulated_sample[]<-Inf}

  for(i in randomly_selected_bloodsamples){
    X <- rep('Estimate',moi_list$moi_max)
    if(moi_simulated_sample[i] <= moi_list$moi_max){X[moi_simulated_sample[i]]<-'Simulated'}
    true_moi<-c(true_moi,X)
  }

  moi_counts<-apply(mcmc_mois[,randomly_selected_bloodsamples],2,tabulate,nbins=moi_list$moi_max) # Tabulate the MOIs per individual
  MOI <- data.frame(moi_count = as.vector(moi_counts),   # Create data frame
                    moi = as.factor(rep(1:moi_list$moi_max, min(16, no_total))),
                    pid = rep(mini_title_names[randomly_selected_bloodsamples],rep(moi_list$moi_max, min(16, no_total))),
                    true_moi=true_moi)

  print(barchart(moi_count ~ moi | pid, data = MOI,
                 groups = true_moi,# Highlight true moi
                 auto.key = list(space = "right"),
                 par.strip.text = list(cex = 0.4), # Make pid label smaller
                 stack=TRUE,
                 ylab = 'MOI count',
                 xlab = 'MOI',
                 sub = sprintf('MOI barchart, %s blood samples, post burnin, chains combined', min(no_total,16)),
                 cex.title = 0.1,
                 scales=list(x=list(tck=0,# Length of tick marks is zero
                                    rot=45,
                                    cex=0.6),# Rotate Labels
                             y=list(tck=0,
                                    cex=0.6,
                                    alternating=c(1,rep(0,1))))))
  #==================================================================================================================

  if(child == TRUE){ # Extra plots if child hierachy is included
    #========================================================================================================
    # Population frequency acceptance rate per individual
    results$acceptance_store_population_chains[results$acceptance_store_population_chains == 0] <- NA
    par(mfrow = c(1,1))
    plot(NULL, ylim = c(0,1), xlim = c(max(burnin)+1,mcmc_variable_list$no_traces_preburnin), xlab = 'Trace', ylab = 'Population frequency acceptance rate',
         bty = 'n', cex.lab = 1.2, xaxt = 'n')
    axis(side = 1, at = c((max(burnin)+1),mcmc_variable_list$no_traces_preburnin), labels = c((max(burnin)+1),mcmc_variable_list$no_traces_preburnin)*mcmc_variable_list$thinning_interval)
    for(chain in 1:mcmc_variable_list$no_mcmc_chains){lines(y=results$acceptance_store_population_chains[plot_thin_preburnin_noburnin,chain],
                                                            x = plot_thin_preburnin_noburnin, col = chain_cols[chain])}
    legend('topright', col = chain_cols, lty = 'solid', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains), cex = 1, bty = 'n', lwd = 1.5)
    title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
    #========================================================================================================
    #========================================================================================================
    # Sigma acceptance rate per individual
    par(mfrow = c(1,1))
    plot(NULL, ylim = c(0,1), xlim = c(max(burnin)+1,mcmc_variable_list$no_traces_preburnin), xlab = 'Trace', ylab = 'Dispersion parameter acceptance rate',
         bty = 'n', cex.lab = 1.2, xaxt = 'n')
    axis(side = 1, at = c((max(burnin)+1),mcmc_variable_list$no_traces_preburnin), labels = c((max(burnin)+1),mcmc_variable_list$no_traces_preburnin)*mcmc_variable_list$thinning_interval)
    for(chain in 1:mcmc_variable_list$no_mcmc_chains){lines(y=results$acceptance_store_sigma_chains[plot_thin_preburnin_noburnin, chain],
                                                            x=plot_thin_preburnin_noburnin, col = chain_cols[chain], lty = 'solid')}
    legend('topright', col = chain_cols, lty = 'solid', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains), cex = 1, bty = 'n', lwd = 1.5)
    title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plot_thin), cex.sub = 0.5)
    #========================================================================================================
    #========================================================================================================
    # Classic mcmc plots of sigma (problems with start and end (try more iterations))
    mcmc_sigma_temp0 <- alply(results$sigma_store_chains[-burnin,], 2)
    mcmc_sigma_temp1 <- lapply(mcmc_sigma_temp0, FUN = mcmc, thin = mcmc_variable_list$thinning_interval, start = (max(burnin)*mcmc_variable_list$thinning_interval)+1, end = mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval)
    mcmc_sigma <- mcmc.list(mcmc_sigma_temp1)
    Eff_sigma <- round(effectiveSize(mcmc_sigma),0)
    plot(mcmc_sigma, bty = 'n', density = FALSE, col = chain_cols, ylab = expression(sigma), smooth = FALSE) # No burnin
    legend('topright', col = chain_cols, lty = 'solid', legend = paste('Chain', 1:mcmc_variable_list$no_mcmc_chains), cex = 1, bty = 'n', lwd = 1.5)
    density_sigma_posterior <- density(as.vector(results$sigma_store_chains[-burnin,]))

    density_sigma_prior <- dgamma(density_sigma_posterior$x, shape = sigma_list$sigma_shape_hyperparameter, scale = sigma_list$sigma_scale_hyperparameter)

    plot(density_sigma_posterior, bty = 'n', main = '', xlab = bquote(sigma ~ '( effective sample size' ~.(Eff_sigma)~ ')'), ylim = c(0,max(density_sigma_prior,density_sigma_posterior$y)), lwd = 2)

    if(mcmc_variable_list$log_like_zero){lines(y = density_sigma_prior, x = density_sigma_posterior$x, col = 'green', lwd = 2)}
    if(Simulated & !mcmc_variable_list$log_like_zero){abline(v = data_summary$sigma, lty = 'dashed', lwd = 2)}
    if(Simulated & mcmc_variable_list$log_like_zero){legend('top', legend = c('Posterior distribution', 'Prior distribution'),
                              col = c(1,3), lty = c('solid', 'solid'), bty = 'n', lwd = 2)
    }
    if(Simulated & !mcmc_variable_list$log_like_zero){legend('top', legend = expression('Simulated'~ sigma, 'Posterior distribution'),
                                                            col = c(1,1), lty = c('dashed', 'solid'), bty = 'n', lwd = 2)
    }
    autocorr.plot(mcmc_sigma)
    #========================================================================================================
    #========================================================================================================
    # Plots of frequency_child and sigma
    Xlist0 <- alply(results$genotype_freq_child_store_chains[-burnin,,,],2) # Split by child (value is a list with one entry per child)
    Xlist1 <- lapply(Xlist0, FUN = alply, 3) # For each child entry, split by chain (value is a list of lists, one entry per child, and, per chain, however many entries as chains)
    Xlist2 <- lapply(Xlist1, FUN = lapply, mcmc, start = (max(burnin)*mcmc_variable_list$thinning_interval)+1, end = mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval, thin = mcmc_variable_list$thinning_interval) # Turn each chain entry into an mcmc object
    mcmc_frequency_child_chains <- lapply(Xlist2, FUN = mcmc.list) # For each child, create mcmc.list over chains
    # Classic mcmc plots for a random child
    random_child <- sample(no_children, 1)
    plot(mcmc_frequency_child_chains[[random_child]], bty = 'n', col = chain_cols, smooth = FALSE)

    # Classic mcmc summary for all children combined (an estimate of the frequency population based on the frequency of the children)
    Xlist0 <- alply(results$genotype_freq_child_store_chains[-burnin,,,],2) # Create list of children
    Xlist1 <- abind(Xlist0, along = 3) # Combine all children
    dimnames(Xlist1)[[3]] <- NULL
    Xlist2 <- alply(Xlist1, 3) # Chains
    Xlist3 <- lapply(Xlist2, FUN = mcmc, thin = mcmc_variable_list$thinning_interval) # create mcmc objects
    mcmc_population_child_chains <- mcmc.list(Xlist3)
    summary(mcmc_population_child_chains) # All chains combined

    # Classic mcmc plot for all children combined (thinned before plotting)
    plotting_thinning_interval <- 100
    thin_mcmc_population_child_chains <- seq(1, nrow(mcmc_population_child_chains[[1]]), by = plotting_thinning_interval)
    Xlist0 <- lapply(mcmc_population_child_chains, FUN = function(x){x[thin_mcmc_population_child_chains,]}) # thin
    Xlist1 <- lapply(Xlist0, mcmc, thin = mcmc_variable_list$thinning_interval*plotting_thinning_interval) # reclassify as mcmc object
    mcmc_population_child_chains_thinned = mcmc.list(Xlist1) # re-classify as mcmc list
    plot(mcmc_population_child_chains_thinned, smooth = FALSE, bty = 'n', density = FALSE, col = rainbow(length(mcmc_population_child_chains_thinned), alpha = 0.2))
    title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plotting_thinning_interval), cex.sub = 0.5)
    plot(mcmc_population_child_chains_thinned, bty = 'n', trace = FALSE, xlim = c(0,1)) # All chains combined
    title(sub = sprintf('In addition to the thinning interval used to generate results, thinned by %s', plotting_thinning_interval), cex.sub = 0.5)
    par(mfrow = c(1,1))

    #========================================================================================================
    # Violin of frequency pop estimated from randomly selected children
    # Samples from the dirichlet prior for overlaying when mcmc_variable_list$log_like_zero is true
    randomly_selected_children <- sample(length(mcmc_population_child_chains_thinned),10)
    Cols <- rainbow(length(randomly_selected_children), alpha = 0.5)
    priorsamples <- rdirichlet(1000, frequency_list$frequency_hyperparameter) # frequency_hyperparameter
    colnames(priorsamples) <- haplotypes
    plot(NULL, xlim=c(0.5,no_haplotypes + 0.5), ylim=c(0,1.05), xaxt = 'n', yaxt = 'n', ylab = 'Frequency', cex.lab = 1.2, xlab = 'Haplotypes')
    for(Child in 1:length(randomly_selected_children)){
      for(haplotype in haplotypes){
        if(mcmc_variable_list$log_like_zero){if(Child ==1){vioplot(priorsamples[,haplotype], at = which(haplotype == haplotypes), col = 'gray', add = TRUE)}}
        try(vioplot(mcmc_population_child_chains_thinned[,haplotype][[Child]], at = which(haplotype == haplotypes), col = Cols[Child], add = TRUE))}}
    if(!mcmc_variable_list$log_like_zero & Simulated){points(y = data_summary$Sample_haplotype_frequencies[,haplotypes ], x=1:no_haplotypes, pch = 19, col = 'black')}
    box(col = 'white', lwd = 6)

    axis(side = 1, at = 1:no_haplotypes, labels = haplotypes, tick = FALSE, cex.axis = 1, las = 2, line = -1)
    axis(side = 2, at = seq(0,1,0.5), labels = seq(0,1,0.5), cex.axis = 1.2)
    if(!mcmc_variable_list$log_like_zero){legend('topright', legend = c('Frequency used to simulate the data'), pch = c(19), col = 'black', bty = 'n', cex = 1)}
    legend('topleft', legend = paste('Child', randomly_selected_children),lty = 'solid', col = Cols, cex = 1, bty = 'n', lwd = 3)
    title(sub = sprintf('Thinned by an extra %s', plotting_thinning_interval))
    #========================================================================================================
  }


  #==================================================================================================================
  if(names(dev.cur()) == 'pdf'){dev.off()}

  # if(child == TRUE){ # Extra plots if child is true (too big for pdf)
  # # Classic mcmc plots for all children for a random chain and 3 random haplotypes (massive pdf size hence not printed to pdf)
  # Xlist0 <- alply(results$genotype_freq_child_store_chains[-burnin,,,],4) # Split by chain (value is a list with one entry per chain)
  # Xlist1 <- lapply(Xlist0, FUN = alply, 2) # For each chain, split by child (value is a list of list, one entry per chain, and, per chain, however many entries as children)
  # Xlist2 <- lapply(Xlist1, FUN = lapply, mcmc, start = (0.5*mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval)+1, end = mcmc_variable_list$no_traces_preburnin*mcmc_variable_list$thinning_interval, thin = mcmc_variable_list$thinning_interval) # Turn each chain entry into an mcmc object
  # mcmc_frequency_child_chains <- lapply(Xlist2, FUN = mcmc.list) # For each child, create mcmc.list over chains
  # random_chain <- sample(mcmc_variable_list$no_mcmc_chains, 1) # Pick a random chain
  # random_haplotypes <- sample(no_haplotypes, 3)
  # plot(mcmc_frequency_child_chains[[random_chain]][,random_haplotypes], bty = 'n') # plot
  # }
  #==================================================================================================================
}
artaylor85/FreqEstimationModel documentation built on Feb. 1, 2024, 6:44 p.m.