Note that to get this to really be a vignette, I must insert this at the top:
<!-- %\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{Running on chinook data and comparing to Colony} -->
library(fullsniplings)
So, are going to use the chinook full sibling data:
dim(fs_dev_test_data$chinook_full_sibs_genos)
That is r nrow(fs_dev_test_data$chinook_full_sibs_genos)
at r ncol(fs_dev_test_data$chinook_full_sibs_genos)/2
SNPs.
We can run the chain like this. Note that I have commented out the actual running of the chain.
#chinook_results <- run_mcmc(fs_dev_test_data$chinook_full_sibs_genos, burn_in = 50, num_sweeps = 200) #save(chinook_results, file="chinook_chain_results.rda") # i have commented that out so it doesn't take took much time, and I can get it back quickly # like this: saved_results <- system.file("chinook_chain_results.rda", package="fullsniplings", mustWork = T) load(saved_results)
Here I pull out some code from observe_mcmc.R
to get the hashable names of the
different true sibling groups as a character vector called true_sibgroups_as_strings
.
ped <- fs_dev_test_data$chinook_full_sibs_pedigree ordered.kids <- rownames(fs_dev_test_data$chinook_full_sibs_genos) ped$kididx <- as.integer(factor(ped$Kid, levels = ordered.kids)) - 1 kid.split <- split(ped$kididx, paste(ped$Pa, "---", ped$Ma, sep="")) # here we get a list of all the sibling groups sibgroups <- unname(kid.split[order(sapply(kid.split, length), decreasing=T)]) # and here we can store them as strings: true_sibgroups_as_strings <- sapply(sibgroups, function(x) paste(x, collapse="-"))
And, I also get a list of all the siblings that belong to each individual, in the truth:
# now get a list of the siblings of an individual sg.list <- vector("list", length = nrow(ped)) for(L in kid.split) { for(ind in L) { sg.list[[ind+1]] <- setdiff(L, ind) } }
Now, if Ind
is the base-0 index of an individual then
sg.list[[Ind+1]]
is a vector of the base-0 indices of all of its full siblings
Put the greedy partition results into a table with a short name, then add some columns to it:
crtab <- chinook_results$Partition crtab$Correct <- rownames(crtab) %in% true_sibgroups_as_strings crtab$CumulCorrectSibs <- cumsum(crtab$NumSibs * crtab$Correct) crtab$CumulSibsInIncorrectSibships <- cumsum(crtab$NumSibs * (1-crtab$Correct))
Note that the last line gives us a really punishing measure: if a sibship is incorrect, even if it is only off by one individual, everyone in it is counted as wrong.
There is one sibship with posterior of 1 and it is size 8, but it is wrong. Compare these:
# here is the row that has them crtab["94-105-231-405-529-532-551-847", ] # and here is what the sibship probably should look like: c(94, sg.list[[95]])
I think we have seen these guys before and they are full cousins or something.
We will add some more columns to the table
# this is a list of sibgroups by their base-0 index S <- lapply(strsplit(rownames(chinook_results$Partition), "-"), as.numeric) OmAndCom <- ave_err_commission_and_ommission(S, sg.list) crtab$AveNumNonSibs <- OmAndCom$AveNumNonSibs crtab$AveNumSibsMissing <- OmAndCom$AveNumSibsMissing
This will be a little weird. I will use a previous Colony run, since I don't have time to re-run it. That previous run included 50 extra individuals (some half-siblings) which I will discard from the results which will give us something that is almost comparable.
First I note that I can look at the standard output:
2014-05-29 11:53 /PairX1/--% pwd /Users/eriq/Documents/xp_dev_svn_checkouts/slg_pipe/arena/FRH_ColonyRun/ColonyArea/Collections/AA/PairX1 2014-05-29 11:55 /PairX1/--% gzcat StdoutColony.txt.gz # and find in there the running time: TotalRunningTime(Min)=1.458552E+02
So, running time is 146 minutes, which I can call about 2.5 hours, and that, remember is for the pairwise version of Colony.
# read in the colony sibships colony_sib_file <- system.file("data_files/colony/AA_PairX1.BestFSFamily", package="fullsniplings", mustWork = T) csibs <- SlurpSibships(file=colony_sib_file, skip=1) # now, put the names back on them: load(system.file("data_files/colony/FRH-sibs-for-pipeline.Rda", package="fullsniplings", mustWork = T)) names(fish.names) <- rownames(data) idxs <- 1:nrow(data) names(idxs) <- rownames(data) csibs <- NamizeSibs(csibs, fish.names, idxs)
At this juncture the names of these fish in the colony output are like this: T065899_F_10-6-2010--T030068_?_9-28-2007--T030092_?_9-28-2007
.
So, we can use what is in ped
to go immediately to the base-0 index of each fish, just like we have for the
full-sniplings output:
bz_idx <- ped$kididx # make names for this base-0 index vector by catenating Kid-Pa-Ma names(bz_idx) <- paste(ped$Kid, ped$Pa, ped$Ma, sep="--") # and then make a sibship list of just those indices: colony_sibs <- lapply(csibs$sibs.names, function(x) {z <- unname(bz_idx[x]); z[!is.na(z)]}) # drop the empty ones empties <- sapply(colony_sibs, length)>0 colony_sibs <- colony_sibs[empties] colony_probs <- as.numeric(csibs$probs[empties])
And now we want to make a data frame of those results that is similar to crtab
colony_tab <- data.frame(Posterior = colony_probs, NumSibs = sapply(colony_sibs, length)) rownames(colony_tab) <- sapply(colony_sibs, function(x) paste(sort(x), collapse="-")) # now put them in order by posterior and then size ord <- order(colony_tab$Posterior, colony_tab$NumSibs, decreasing = T) colony_tab <- colony_tab[ord, ] colony_tab$Correct <- rownames(colony_tab) %in% true_sibgroups_as_strings colony_tab$CumulCorrectSibs <- cumsum(colony_tab$NumSibs * colony_tab$Correct) colony_tab$CumulSibsInIncorrectSibships <- cumsum(colony_tab$NumSibs * (1-colony_tab$Correct)) # then get average errors of commission and omission OmAndCom <- ave_err_commission_and_ommission(colony_sibs, sg.list) colony_tab$AveNumNonSibs <- OmAndCom$AveNumNonSibs colony_tab$AveNumSibsMissing <- OmAndCom$AveNumSibsMissing
Here is one for my method
# uncomment the following line if you want to make pdfs on a mac. #quartz(width=10, height=6) sib_result_plot(crtab$NumSibs, crtab$Visits/200, false_positive = crtab$AveNumNonSibs>0.001, lscale = .03, correct = crtab$Correct) dev.copy2pdf(file="fullsniplings-posterior-plot.pdf")
and here is one for colony:
sib_result_plot(colony_tab$NumSibs, colony_tab$Posterior, false_positive = colony_tab$AveNumNonSibs>0.001, lscale = .03, correct = colony_tab$Correct) dev.copy2pdf(file="colony-posterior-plot.pdf") #sib_result_plot(crtab$NumSibs, (crtab$Visits/200) - .4, false_positive = crtab$AveNumNonSibs>0.001, lscale = .03, correct = crtab$Correct, add=T)
How about some plots of cumulative number of individuals put in the right sibship and number put in the wrong sibship:
plot(colony_tab$CumulCorrectSibs, colony_tab$CumulSibsInIncorrectSibships, xlim = c(0, max(crtab$CumulCorrectSibs)), type = "l", lwd = 2, ylab = "Number of Fish Placed in Incorrect or Incomplete Sibling Groups", xlab = "Number of Fish Placed in Correct Sibing Groups") lines(crtab$CumulCorrectSibs, crtab$CumulSibsInIncorrectSibships, lwd = "2", col = "orange") legend("topleft", legend = c("COLONY", "FULLSNIPLINGS"), col = c("black", "orange"), lty = "solid", lwd = 2 ) text(x = 0, y = 175, "Totals:" ) text(x = 0, y = 150, paste("FULLSNIPLINGS: ", crtab[nrow(crtab), "CumulCorrectSibs"], "correct and ", crtab[nrow(crtab), "CumulSibsInIncorrectSibships"], "incorrect."), pos = 4 ) text(x = 0, y = 125, paste("COLONY: ", colony_tab[nrow(colony_tab), "CumulCorrectSibs"], "correct and ", colony_tab[nrow(colony_tab), "CumulSibsInIncorrectSibships"], "incorrect."), pos = 4 ) dev.copy2pdf(file = "cumul-correct-vs-incorrect.pdf")
We can also think about the cumulative number if individuals that have been placed in sibships that have at least one non sib, as a function of the total number of individuals placed correctly into their sibships.
crcum <- cumsum(crtab$NumSibs * (crtab$AveNumNonSibs > 0.01)) colcum <- cumsum(colony_tab$NumSibs * (colony_tab$AveNumNonSibs > 0.01)) plot(crtab$CumulCorrectSibs, crcum, type = "l", lwd=2, col = "orange", ylab = "Number of Fish Grouped With Non-Siblings", xlab = "Number of Fish Placed in Correct Sibing Groups") lines(colony_tab$CumulCorrectSibs, colcum, lwd = 2) legend("topleft", legend = c("COLONY", "FULLSNIPLINGS"), col = c("black", "orange"), lty = "solid", lwd = 2 ) text(x = 0, y = 48, " Totals:" ) text(x = 0, y = 44, paste("FULLSNIPLINGS: ", crtab[nrow(crtab), "CumulCorrectSibs"], "correct and ", max(crcum), "grouped with a non-sib"), pos = 4 ) text(x = 0, y = 38, paste("COLONY: ", colony_tab[nrow(colony_tab), "CumulCorrectSibs"], "correct and ", max(colcum), "grouped with a non-sib"), pos = 4 ) dev.copy2pdf(file = "cumul-correct-vs-grouped-with-non-sib.pdf")
And also cumulative number of individuals placed in sibgroups that are missing at least one of their true sibs:
plot(crtab$CumulCorrectSibs, cumsum(crtab$NumSibs * (crtab$AveNumSibsMissing > 0.01)), type = "l", lwd=2, col = "orange", ylim=c(0,sum(colony_tab$NumSibs * (colony_tab$AveNumSibsMissing > 0.01)))) lines(colony_tab$CumulCorrectSibs, cumsum(colony_tab$NumSibs * (colony_tab$AveNumSibsMissing > 0.01)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.