In mahendra-mariadassou/L1fdvGenetics: Companion self-learning R package to the Genetics Project of FDV bachelor

library(learnr)
### Functions --------------------------
create_fake <- function() {
  data <- data.frame(
    rat  = 1:100, 
    Genotype  = rep(c("G0", "G1"), times = c(49, 51)), 
    Phenotype = factor(sample(c("Low", "High"), size = 100, replace = TRUE), levels = c("Low", "High")))
  tab <- with(data, table(Genotype, Phenotype))
  deltap <- -100 * diff(tab[, 2] / rowSums(tab))
  print(tab)
  cat(paste0("Delta p = ", round(deltap, digits = 2), "%"))
  invisible(deltap)
}
create_fake_cont <- function() {
  data <- data.frame(
    rat  = 1:100, 
    Genotype  = rep(c("G0", "G1"), times = c(49, 51)), 
    Phenotype = rnorm(100, mean = 10.21, sd = 1.08))
  deltamu <- diff(with(data, tapply(X = Phenotype, INDEX = Genotype, FUN = mean)))
  cat(paste0("Delta mu = ", round(deltamu, digits = 2)))
  invisible(deltamu)
}
log_likelihood_H0 <- function(mu = 10, sigma = 1) {
  sum(dnorm(data_cont$Phenotype, mean = mu, sd = sigma, log = TRUE))
}
log_likelihood_H1 <- function(mu0 = 10, mu1 = 10, sigma = 1) {
  with(data_cont, 
       sum(dnorm(Phenotype[Genotype == "G0"], mean = mu0, sd = sigma, log = TRUE)) + 
         sum(dnorm(Phenotype[Genotype == "G1"], mean = mu1, sd = sigma, log = TRUE))
  )
}
### data ------------------------------------
# set.seed(2012)
# data <- data.frame(rat  = 1:100, 
#                    Genotype  = sample(c("G0", "G1"), size = 100, replace = TRUE), 
#                    Phenotype = factor(sample(c("Low", "High"), size = 100, replace = TRUE), levels = c("Low", "High")))
# Genotype <- sample(c("G0", "G1"), size = 100, replace = TRUE) 
# data_cont <- data.frame(rat  = 1:100, 
#                    Genotype  = Genotype,
#                    Phenotype = rnorm(100, 10, 1) + ifelse(Genotype == "G0", 0.5, 0))
data <- structure(list(Rat = 1:100, Genotype = structure(c(2L, 1L, 
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 1L), .Label = c("G0", "G1"), class = "factor"), Phenotype = structure(c(2L, 
2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 
1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 
1L, 2L, 1L), .Label = c("Low", "High"), class = "factor")), class = "data.frame", row.names = c(NA, 
-100L))
Genotype <- c("G0", "G1", "G0", "G0", "G1", "G1", "G1", "G0", "G1", "G1", 
"G1", "G1", "G0", "G1", "G1", "G0", "G1", "G0", "G1", "G0", "G0", 
"G1", "G1", "G0", "G0", "G1", "G1", "G0", "G0", "G0", "G0", "G0", 
"G1", "G0", "G1", "G1", "G0", "G1", "G0", "G0", "G0", "G1", "G1", 
"G0", "G0", "G0", "G1", "G0", "G1", "G0", "G0", "G1", "G1", "G0", 
"G0", "G0", "G0", "G0", "G1", "G0", "G1", "G0", "G1", "G1", "G1", 
"G1", "G0", "G1", "G1", "G1", "G1", "G1", "G0", "G0", "G1", "G1", 
"G0", "G1", "G0", "G0", "G1", "G0", "G0", "G0", "G1", "G1", "G1", 
"G0", "G1", "G0", "G1", "G0", "G1", "G1", "G1", "G1", "G0", "G1", 
"G0", "G0")
data_cont <- structure(list(Rat = 1:100, Genotype = structure(c(1L, 2L, 
1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 
1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 
1L, 1L), .Label = c("G0", "G1"), class = "factor"), Phenotype = c(9.93734092284792, 
10.0548659404013, 10.4215516706911, 11.6381450846651, 8.74185446056025, 
10.4611997336135, 8.05276399846212, 11.1150308690493, 11.9985652013207, 
9.61956928875065, 9.26046906692287, 10.4727744963503, 12.4675501750474, 
9.69775819853524, 10.6544037091517, 10.8126772600706, 9.79313109845306, 
9.25220042461025, 11.2820476689928, 11.6266954320819, 8.6283804660775, 
8.00496534983642, 10.9575888584778, 10.5892593381063, 10.7711606735996, 
9.29939254059148, 8.60956254494855, 11.2366683342963, 12.339738650315, 
9.35626230756599, 10.8052458754541, 9.38268386473913, 10.2010042546926, 
12.2530916649827, 10.7820958747831, 11.5020214591256, 8.75399262028028, 
10.4772015744214, 10.2598624792032, 11.117157662582, 12.2862191973056, 
6.78925644314705, 8.44306462727756, 10.4141317381677, 12.5971160044294, 
10.6915251703253, 7.54714404206572, 11.5177547499849, 9.86939216703955, 
10.874850607998, 10.5553335548083, 10.9354233599546, 11.2003272130828, 
11.4471039138719, 11.5593890685936, 9.53822989272397, 9.27902841060648, 
10.166931222005, 11.6008349802568, 12.3437714394999, 8.05183536400729, 
10.1256855775324, 9.52486044644431, 9.4941808279217, 11.7748865296171, 
9.56756862085787, 10.5101454872055, 8.651924297414, 9.93229886391162, 
10.5303077528798, 10.1504897408479, 8.57953875122042, 9.46878233927001, 
10.5647336608206, 10.5302482547551, 11.6552175793199, 9.87240555992876, 
9.34364943707429, 10.3080589602983, 9.43929946872875, 9.48704460603593, 
9.7325552505353, 10.0195890393412, 9.83830983666413, 9.39409702815054, 
11.0951295518432, 9.43630513858789, 11.1294633606095, 10.3072857605579, 
8.82007290277783, 10.839027154399, 10.616956137227, 9.60540096295152, 
10.6913338487242, 10.5297884075298, 9.15731217518669, 10.4207978576609, 
9.695185585511, 10.0440365954716, 10.435352524834)), class = "data.frame", row.names = c(NA, 
-100L))
# set.seed(1)
# values <- replicate(expr = create_fake(), n = 200)
values <- c(G1 = -7.96318527410965, G1 = 7.88315326130452, G1 = -5.76230492196879, 
G1 = 7.8031212484994, G1 = -0.200080032012806, G1 = -18.0872348939576, 
G1 = -1.84073629451781, G1 = 3.88155262104842, G1 = 4.12164865946378, 
G1 = 10.0840336134454, G1 = 0.200080032012795, G1 = -9.92396958783513, 
G1 = 5.92236894757903, G1 = 12.2849139655862, G1 = -2.08083233293317, 
G1 = 11.5646258503401, G1 = -0.200080032012806, G1 = 1.84073629451781, 
G1 = 8.36334533813525, G1 = 13.9255702280912, G1 = -5.76230492196879, 
G1 = -3.96158463385354, G1 = -7.8031212484994, G1 = 4.04161664665867, 
G1 = -8.12324929971989, G1 = -5.92236894757904, G1 = -12.0448179271709, 
G1 = 11.6446578631453, G1 = -9.84393757503002, G1 = 6.08243297318927, 
G1 = -3.88155262104842, G1 = -2.00080032012805, G1 = 4.12164865946378, 
G1 = -9.84393757503002, G1 = -12.0448179271709, G1 = -4.04161664665866, 
G1 = 13.8455382152861, G1 = -2.24089635854342, G1 = -3.80152060824329, 
G1 = -11.8047218887555, G1 = 14.0856342537015, G1 = -12.0448179271709, 
G1 = 16.2064825930372, G1 = -20.0480192076831, G1 = 16.046418567427, 
G1 = 0.200080032012795, G1 = 0.120048019207686, G1 = 2.72108843537415, 
G1 = -17.9271708683473, G1 = 10.2440976390556, G1 = -10.1640656262505, 
G1 = -10.1640656262505, G1 = 1.60064025610244, G1 = -3.88155262104842, 
G1 = 2.16086434573829, G1 = -2.1608643457383, G1 = 15.8863545418167, 
G1 = 9.76390556222489, G1 = 18.2472989195678, G1 = -8.20328131252501, 
G1 = -16.2865146058423, G1 = -4.20168067226891, G1 = -5.68227290916367, 
G1 = -10.2440976390556, G1 = 4.04161664665867, G1 = -9.84393757503002, 
G1 = -8.04321728691477, G1 = -2.00080032012805, G1 = 2.00080032012805, 
G1 = -0.0400160064025601, G1 = 9.60384153661464, G1 = -3.96158463385354, 
G1 = -3.96158463385354, G1 = 4.36174469787916, G1 = -7.96318527410965, 
G1 = 4.04161664665867, G1 = 4.12164865946378, G1 = 1.76070428171268, 
G1 = 24.2096838735494, G1 = -11.484593837535, G1 = 3.96158463385354, 
G1 = 6.40256102440977, G1 = -16.2865146058423, G1 = -4.04161664665866, 
G1 = 12.0448179271709, G1 = 17.7671068427371, G1 = -6.00240096038415, 
G1 = 8.04321728691476, G1 = 5.76230492196879, G1 = -16.1264505802321, 
G1 = 10.3241296518607, G1 = 15.9663865546218, G1 = -1.68067226890757, 
G1 = -4.28171268507403, G1 = 7.64305722288915, G1 = 2.08083233293318, 
G1 = -8.04321728691477, G1 = 10.0040016006402, G1 = -8.04321728691477, 
G1 = -6.40256102440976, G1 = -2.32092837134854, G1 = -1.76070428171269, 
G1 = -12.2048819527811, G1 = 0.200080032012795, G1 = -2.24089635854342, 
G1 = -5.68227290916367, G1 = 3.96158463385354, G1 = 13.6854741896759, 
G1 = -2.08083233293317, G1 = -20.2080832332933, G1 = -2.1608643457383, 
G1 = 6.00240096038416, G1 = -1.84073629451781, G1 = -25.8503401360544, 
G1 = -10.0040016006403, G1 = 2.08083233293318, G1 = -6.1624649859944, 
G1 = -3.88155262104842, G1 = 0.0400160064025656, G1 = 7.8031212484994, 
G1 = 5.84233693477391, G1 = -8.28331332533013, G1 = -23.8895558223289, 
G1 = 7.96318527410965, G1 = 1.84073629451781, G1 = 12.124849939976, 
G1 = 14.2456982793117, G1 = 4.12164865946378, G1 = 0.0400160064025656, 
G1 = -2.08083233293317, G1 = -20.1280512204882, G1 = 5.92236894757903, 
G1 = 11.8847539015606, G1 = -14.2456982793117, G1 = 5.84233693477391, 
G1 = 6.24249699879952, G1 = 2.08083233293318, G1 = 11.8047218887555, 
G1 = -2.00080032012805, G1 = 6.00240096038416, G1 = 1.76070428171268, 
G1 = 2.32092837134854, G1 = 0.0400160064025656, G1 = 5.84233693477391, 
G1 = 1.68067226890756, G1 = -6.00240096038415, G1 = -6.40256102440976, 
G1 = 2.16086434573829, G1 = 22.0088035214086, G1 = 11.8847539015606, 
G1 = 23.9695878351341, G1 = -9.84393757503002, G1 = -13.5254101640656, 
G1 = 0.200080032012795, G1 = -7.88315326130452, G1 = -12.124849939976, 
G1 = 3.8015206082433, G1 = -13.6854741896759, G1 = -6.00240096038415, 
G1 = 3.96158463385354, G1 = 16.046418567427, G1 = 13.6054421768708, 
G1 = -0.200080032012806, G1 = -7.96318527410965, G1 = -5.84233693477391, 
G1 = -10.2440976390556, G1 = -8.20328131252501, G1 = 15.8863545418167, 
G1 = -8.20328131252501, G1 = 9.76390556222489, G1 = 13.9255702280912, 
G1 = 11.7246898759504, G1 = 7.88315326130452, G1 = -0.0400160064025601, 
G1 = -7.88315326130452, G1 = 9.92396958783513, G1 = -4.04161664665866, 
G1 = -6.32252901160464, G1 = 5.84233693477391, G1 = -2.64105642256903, 
G1 = 2.16086434573829, G1 = -2.08083233293317, G1 = -2.1608643457383, 
G1 = 6.00240096038416, G1 = 3.8015206082433, G1 = -10.2440976390556, 
G1 = -10.0040016006403, G1 = -6.40256102440976, G1 = 19.6478591436575, 
G1 = 0.120048019207686, G1 = 14.0856342537015, G1 = -5.92236894757904, 
G1 = 14.2456982793117, G1 = 16.046418567427, G1 = 1.76070428171268, 
G1 = -5.68227290916367, G1 = -4.36174469787916, G1 = 0.0400160064025656, 
G1 = 10.0040016006402, G1 = -3.72148859543818)
# set.seed(1)
# values_cont <- replicate(expr = create_fake_cont(), n = 400)
## Options
values_cont <- c(G1 = 0.168874928042785, G1 = 0.0424472307158261, G1 = 0.207148048914965, 
G1 = -0.285269291944129, G1 = 0.39202991779255, G1 = 0.131232512500929, 
G1 = -0.0903680374141231, G1 = 0.289504708094434, G1 = 0.341499541472631, 
G1 = 0.00500205697282396, G1 = 0.285554107314825, G1 = 0.177814975292648, 
G1 = -0.153630970551768, G1 = -0.369772457693715, G1 = 0.209907337699034, 
G1 = 0.340080346679901, G1 = 0.29620902264246, G1 = -0.171844817972852, 
G1 = -0.0826279031872321, G1 = 0.12731242301972, G1 = -0.334175105835156, 
G1 = -0.19177945733942, G1 = -0.00242497180618351, G1 = -0.197921495977811, 
G1 = -0.167084188602518, G1 = 0.137149755622799, G1 = 0.215485041912951, 
G1 = -0.19657686522978, G1 = 0.0837060500230606, G1 = -0.126225078355295, 
G1 = 0.221279414904743, G1 = 0.198465552248829, G1 = -0.265993039269379, 
G1 = -0.0954136711886076, G1 = -0.184542814221194, G1 = 0.187345445257844, 
G1 = -0.127325951615042, G1 = 0.044305572964646, G1 = 0.146685082580891, 
G1 = 0.0509769616322728, G1 = -0.204773004116495, G1 = 0.159731938400665, 
G1 = 0.22603358464257, G1 = -0.102520420168485, G1 = 0.255817138485554, 
G1 = 0.185839248541916, G1 = -0.405998674114199, G1 = 0.0175082044056829, 
G1 = 0.104002089904423, G1 = -0.437450635522705, G1 = 0.369579257537307, 
G1 = -0.120455523125964, G1 = -0.116199027940995, G1 = 0.137601845706367, 
G1 = 0.144826460724129, G1 = -0.110541577445705, G1 = 0.188905861880034, 
G1 = 0.399365728736422, G1 = -0.320676746359743, G1 = -0.129888116862764, 
G1 = 0.281580700597855, G1 = 0.372725045010027, G1 = 0.191966440172212, 
G1 = -0.257341587662012, G1 = -0.0682595143833034, G1 = 0.373107447371101, 
G1 = 0.421366403491222, G1 = 0.280994544843868, G1 = 0.260543630190226, 
G1 = -0.203660867047468, G1 = -0.0201393335006745, G1 = -0.101303389249962, 
G1 = 0.093610293786492, G1 = -0.168783974770626, G1 = 0.485324306146802, 
G1 = -0.15235461940212, G1 = -0.105221340206388, G1 = 0.0276463582384903, 
G1 = -0.171384951794026, G1 = -0.119107011307065, G1 = -0.119659068843694, 
G1 = -0.304817105886373, G1 = 0.147997382916786, G1 = 0.17239430706508, 
G1 = 0.0474412765868184, G1 = -0.294607756332365, G1 = -0.238880215442968, 
G1 = 0.227341271208935, G1 = -0.0135269163938752, G1 = -0.111079333761392, 
G1 = 0.112836019587286, G1 = 0.0677840854667249, G1 = 0.250726911936678, 
G1 = 0.0292948206801054, G1 = 0.109437902964343, G1 = -0.288080420771244, 
G1 = 0.153502897202488, G1 = -0.0410735946269298, G1 = 0.225697007869343, 
G1 = 0.0770485825321412, G1 = -0.260473792634917, G1 = 0.290606585768463, 
G1 = 0.0796518643846831, G1 = 0.392432844748109, G1 = 0.120338046000205, 
G1 = 0.515714696965256, G1 = 0.0442951320973783, G1 = -0.0992792894937615, 
G1 = 0.312610739623684, G1 = 0.220092140081949, G1 = 0.142825932474882, 
G1 = -0.133191482893931, G1 = -0.358182977158087, G1 = 0.0143391982496084, 
G1 = -0.230532856752401, G1 = -0.00525476760205912, G1 = -0.365356513675435, 
G1 = 0.0743004614113829, G1 = 0.0348149409675997, G1 = 0.094657351971291, 
G1 = 0.161723122901847, G1 = -0.346996164930001, G1 = -0.185883062650243, 
G1 = -0.140080383737779, G1 = 0.100523434804929, G1 = -0.033815026137848, 
G1 = -0.0564541839659789, G1 = 0.207420192147994, G1 = 0.171214415066972, 
G1 = 0.00421274179931608, G1 = -0.217575034369126, G1 = 0.0747287324370998, 
G1 = -0.272637286976494, G1 = 0.0554442780179301, G1 = -0.190380297820118, 
G1 = -0.269928664597129, G1 = 0.16566639884697, G1 = 0.333328023707589, 
G1 = 0.350773785961328, G1 = -0.382316683800772, G1 = 0.0917709619805862, 
G1 = 0.160102205893669, G1 = -0.0696134812668081, G1 = 0.00472535523560857, 
G1 = 0.324254234980508, G1 = -0.212014300820988, G1 = 0.0195100113455382, 
G1 = 0.362518968547242, G1 = -0.217369991736748, G1 = 0.367146104294969, 
G1 = -0.0697602553060701, G1 = -0.080235914494212, G1 = 0.214181045475273, 
G1 = 0.143586506946665, G1 = 0.232745558540399, G1 = 0.0058673037949859, 
G1 = -0.0682041465014169, G1 = 0.00246861122462505, G1 = -0.184798733621729, 
G1 = 0.0181253922503117, G1 = -0.323517977147075, G1 = 0.0182931174583487, 
G1 = -0.0203401874713993, G1 = 0.242654299953703, G1 = -0.104258272660372, 
G1 = -0.199107883487585, G1 = 0.022103325705972, G1 = 0.412533429216699, 
G1 = -0.0964392758515249, G1 = 0.284573884220295, G1 = -0.220797491532061, 
G1 = 0.190726624520444, G1 = 0.0125962468171199, G1 = 0.0368437456489215, 
G1 = 0.060232716903009, G1 = 0.275778315530777, G1 = 0.489137143350119, 
G1 = -0.286171923783487, G1 = 0.0499933620941775, G1 = -0.224049487904855, 
G1 = -0.0599592674948362, G1 = -0.0586531410585938, G1 = -0.0805361166488314, 
G1 = 0.361433244866069, G1 = -0.381915990919641, G1 = 0.198819512125205, 
G1 = 0.340724736068989, G1 = -0.342736360064462, G1 = 0.157201239276519, 
G1 = 0.04092041960274, G1 = -0.312841937208328, G1 = 0.316511165919575, 
G1 = 0.351656551984364, G1 = -0.0908732329744186, G1 = 0.0490370774951678, 
G1 = -0.289244850963851, G1 = -0.248025736188417, G1 = -0.416067065849866, 
G1 = -0.0839599669261748, G1 = 0.278705140873953, G1 = -0.151756379650806, 
G1 = 0.118558481104731, G1 = 0.0977632579155259, G1 = -0.283035195465319, 
G1 = -0.338295832225603, G1 = -0.194206012677295, G1 = -0.268327005844025, 
G1 = 0.137952871152827, G1 = -0.113720629210563, G1 = 0.101006307305218, 
G1 = -0.354600899654752, G1 = 0.112085835120748, G1 = 0.0933327350412707, 
G1 = 0.0438067125531187, G1 = -0.411673534922135, G1 = 0.085451741551184, 
G1 = -0.124433140163639, G1 = 0.326886660595445, G1 = -0.0447807025697493, 
G1 = -0.166217794970448, G1 = -0.135303175259669, G1 = 0.104490730712026, 
G1 = -0.330859179245669, G1 = -0.142864230716595, G1 = -0.214395198900577, 
G1 = -0.153065236404332, G1 = 0.0907710252021001, G1 = 0.240239748590085, 
G1 = 0.160822512801239, G1 = -0.184645159585115, G1 = -0.00172416957140875, 
G1 = 0.0604022213917172, G1 = 0.339347668424802, G1 = 0.554764924390584, 
G1 = 0.261950835473101, G1 = -0.0411770369715558, G1 = -0.270019944319845, 
G1 = 0.174946059483416, G1 = -0.184996545826563, G1 = -0.503684042985288, 
G1 = 0.0460269832060245, G1 = -0.0246148154102421, G1 = -0.264382645424988, 
G1 = 0.225359500133923, G1 = -0.0303564166640697, G1 = -0.267624071686077, 
G1 = -0.0114764281891304, G1 = 0.184317904891017, G1 = 0.252409274585041, 
G1 = 0.0746816084718152, G1 = -0.0886766818047668, G1 = 0.0764909087414942, 
G1 = -0.101276787448924, G1 = -0.171446943064046, G1 = 0.272337307388145, 
G1 = -0.182072786460642, G1 = 0.169104358630202, G1 = 0.0382040903808871, 
G1 = -0.0804314863959572, G1 = 0.0142332343709004, G1 = -0.248385076227081, 
G1 = -0.175421663562581, G1 = 0.10816182379366, G1 = 0.0774590763990854, 
G1 = -0.0387438334392716, G1 = -0.218171958995075, G1 = -0.142967573607532, 
G1 = 0.116203393255603, G1 = -0.0452001853410202, G1 = 0.0842893842216839, 
G1 = -0.306605021270409, G1 = 0.118735441408411, G1 = 0.163141263717458, 
G1 = 0.165993988251767, G1 = -0.315410749923362, G1 = 0.320370553328488, 
G1 = -0.0840705012132865, G1 = 0.158545518170909, G1 = -0.252250806860966, 
G1 = 0.205480635396595, G1 = 0.131890351098502, G1 = -0.0233340032909606, 
G1 = 0.721128237487218, G1 = 0.322783411797602, G1 = -0.264838195578308, 
G1 = 0.252737097092078, G1 = -0.0269664151565738, G1 = -0.316668244804626, 
G1 = 0.186396868334439, G1 = -0.0280302682954368, G1 = 0.127772314309324, 
G1 = 0.112839472917706, G1 = 0.172654158633884, G1 = 0.0334995352389225, 
G1 = -0.219565928823098, G1 = 0.272000505509338, G1 = 0.481540002432279, 
G1 = -0.190102299170062, G1 = -0.245766572112567, G1 = -0.168077194373502, 
G1 = 0.115735495919148, G1 = 0.204560058204033, G1 = -0.440209599924758, 
G1 = -0.44262560243325, G1 = 0.059762839502099, G1 = -0.0986221377326437, 
G1 = 0.413029236915195, G1 = -0.0320404334260793, G1 = 0.0676428482965594, 
G1 = 0.254054467333598, G1 = 0.261663117749503, G1 = -0.136170396259141, 
G1 = 0.0130732086458156, G1 = 0.037531835868986, G1 = -0.00954384304903577, 
G1 = -0.300849158817194, G1 = 0.185056478768955, G1 = -0.132811564047294, 
G1 = -0.384015154742164, G1 = -0.0147305723967488, G1 = 0.0523486495343146, 
G1 = -0.267523486846347, G1 = 0.395636802410072, G1 = -0.0771404619957856, 
G1 = 0.221731384617071, G1 = 0.108348261735246, G1 = -0.514666774566589, 
G1 = 0.0116918295177406, G1 = -0.057917402823259, G1 = 0.0901149410162212, 
G1 = -0.289624322226976, G1 = -0.252491805531877, G1 = 0.0762893030675258, 
G1 = 0.0203239654826461, G1 = -0.125012639931148, G1 = 0.23902831385691, 
G1 = 0.235831157297184, G1 = 0.104948412123438, G1 = -0.152266182784494, 
G1 = -0.142474697968208, G1 = 0.395484428407968, G1 = 0.17139148591626, 
G1 = -0.248832162945439, G1 = 0.0672342607885419, G1 = -0.104069736433562, 
G1 = 0.0676145763406151, G1 = 0.0684896593856994, G1 = -0.665274422735157, 
G1 = 0.466027149328546, G1 = -0.321189647887817, G1 = -0.537411413386142, 
G1 = -0.00866065931720783, G1 = -0.297005180220978, G1 = -0.14452947410293, 
G1 = 0.121841152229599, G1 = 0.132684749675409, G1 = -0.286412328311767, 
G1 = -0.0222142524685367, G1 = 0.0893258495477625, G1 = -0.0411574930852545, 
G1 = -0.0613674011009628, G1 = 0.30749773038211, G1 = 0.322000681112478, 
G1 = -0.0512419818484826, G1 = 0.381166579568717, G1 = 0.280201662809533, 
G1 = -0.0582262564526452, G1 = 0.176803466712448, G1 = -0.00274279215491191, 
G1 = 0.315480928010297, G1 = 0.0718646620020369, G1 = 0.135352053817659, 
G1 = 0.329561718317686, G1 = 0.0992923054207964, G1 = -0.203882090478974, 
G1 = -0.292893639305518, G1 = 0.235947954857037, G1 = -0.168871433690587, 
G1 = 0.205141770660067, G1 = -0.0105467777611938, G1 = 0.213416608919202, 
G1 = -0.0769663613352787, G1 = -0.148412052377454, G1 = -0.378119498678878, 
G1 = -0.321412684590163, G1 = -0.0503310869045919, G1 = 0.131558017687491, 
G1 = -0.0844644558437864, G1 = -0.165263502828834, G1 = -0.0197702232680506, 
G1 = 0.0400377521419095, G1 = -0.235598667324236, G1 = -0.0268395209649928, 
G1 = 0.324409391603592, G1 = 0.182173339205601, G1 = 0.0385582467194592, 
G1 = -0.146186624099263, G1 = -0.0390001792322145, G1 = 0.182013445077557, 
G1 = -0.0489743603413029)
checker <- function(label, user_code, check_code, envir_result, evaluate_result, ...) {
  list(message = check_code, correct = TRUE, location = "append")
}
tutorial_options(exercise.timelimit = 60, exercise.checker = checker)
knitr::opts_chunk$set(echo = FALSE)

Introduction

The goal of these tutorials is to help you understand the basics of genetic association testing. We'll start with very basic methods, such as simple marker analyses to introduce the important notions. We'll then introduce genetic maps and show how to compute the frequency of unobserved markers form the frequencies of observed markers. We'll end up with simple interval mapping, a method that combines simple marker analyses and genetic maps to find Quantitative Traits Loci (QTL) when we don't have a priori candidate genes.

Simple Marker Analyses

We're interested in the possible association between a phenotype (say IgG1 levels) and a locus (say the locus corresponding to gene G, with two alleles G0 and G1, or LEW and BN). Assume for now for simplicity that:

gene G was selected based on expert knowledge (or previous results from the literature).
you work under a dominance (rather than additive or recessive) assumption for allele G1 so that the genotype at the locus can be coded G1 if the rat has at a least one copy of allele G1 at gene G (i.e. it is G1\G1, G1\G0 or G0\G1) and coded G0 if both alleles are G0 (i.e. it is G0\G0).

Remark: The dominance assumption is convenient for this tutorial as it allows us to work with only two effective genotypes. There are of courses always 4 (G1\G1, G1\G0, G0\G1 and G0\G0 where the maternal allele comes first) but we can recode them as 2 for our purpose. We could do the same under a recessive assumption (with a different coding) but would need to work with more categories under an additive assumption. We could also work with haploid rats. We'll stick with the dominance assumption for now.

We can rephrase the association study as the following question "Are IgG1 levels different in rats with genotypes G0 and G1?". We'll try to answer that question step-by-step.

Discrete phenotype

We assume first (again for simplicity) that the phenotype is discrete and takes only two values: low and high. To answer our question, we need to collect data first. We consider a population of $n = 100$ rats. For each rat $i \in {1, \dots\, n}$, we measure its

phenotype $x_i$ (IgG1 level): high or low
genotype $g_i$ (under the dominance assumption): G0 or G1

For conciseness, we refer to rats with genotype G0 (resp. G1) as G0 rats (resp. G1 rats). The data might look like this (first 10 rats):

knitr::kable(head(data))

To answer our question, we first summarize the data into a contigency table:

with(data, table(Genotype, Phenotype))

To make sure we read the table correctly, let's answer a few questions:

question(
  "What's the proportion $p_H^0$ of `High` IgG1 levels among `G0` rats?",
  answer("$p_H^0 = 0.51$", correct = TRUE),
  answer("$p_H^0 = 0.50$", correct = FALSE, message = "Compute carefully the number of `G0` rats"),
  allow_retry = TRUE,
  post_message = "This a double entry table. There are 25 `G0` rats with `High` IgG1 levels and 24 `G0` rats with `Low` IgG1 levels. In total there are 24+25=49 `G0` rats, of which a proportion 25/49=0.51 have `High` IgG1 levels."
)

question(
  "What's the proportion $p_H^1$ of `High` IgG1 levels among `G1` rats?",
  answer("$p_H^1 = 0.0.49$", correct = TRUE),
  answer("$p_H^1 = 0.50$", correct = FALSE, message = "Compute carefully the number of `G1` rats"),
  allow_retry = TRUE,
  post_message = "This a double entry table. There are 25 `G0` rats with `High` IgG1 levels and 26 `G0` rats with `Low` IgG1 levels. In total there are 26+25=51 `G1` rats, of which a proportion 25/51=0.49 have `High` IgG1 levels."
)

In the following, I will use $p_H^0$ to denote the true (and unknown) proportion in the full population of G0 rats and $\hat{p}_H^0$ the observed proportion (in our sample). Think of $\hat{p}_H^0$ as a noisy version of $p_H^0$.

Comparing proportions

barplot(height = cbind(c(24, 25), c(26, 25)), names.arg = c("G0", "G1"), 
        xlab = "Genotype", 
        ylab = "Nb. rats"
        )
text(x = c(0.7, 1.9), y = c(25, 27), labels = "High", adj = c(0.5, 0), col = "black")
text(x = c(0.7, 1.9), y = 1, labels = "Low", adj = c(0.5, 0), col = "white")

In this toy dataset, there are 51% of High levels among G0 rats and 49% among G1 rats. The difference is $\Delta\hat{p} = \hat{p}_H^0 - \hat{p}_H^1 = 2\%$.

Based on those results, we might be tempted to conclude that allele $G_0$ increases IgG1 levels. A tiny problem is that we don't known whether $\Delta\hat{p} = 2\%$ is that big of a difference. After all, if two of our rats with Low levels had been G0 instead of G1, the results would have been quite different...

question(
  "Assume that two rats with `Low` levels had been `G0` instead of `G1`. What would be value of $\\Delta \\hat{p}$?",
  answer("$\\Delta \\hat{p} = 2$%", correct = FALSE, message = "You changed the data, $\\Delta \\hat{p}$ should surely change as a result."),
  answer("$\\Delta \\hat{p} = 0$%", correct = FALSE, message = "Try again."),
  answer("$\\Delta \\hat{p} = -2$%", correct = TRUE),
  allow_retry = TRUE,
  post_message = "Changing the genotype of these two rats leads to the same contingency table as before but with rows `G0` and `G1` switched. The value of $\\Delta \\hat{p}$ therefore just changes sign. You can also do the full computation to convince yourself."
)

Assessing significance

We saw in the previous part that $\Delta \hat{p} = 2\%$ is not that big and that we can't draw strong conclusions from it. Indeed changing a mere two data points would revert our conclusions. As we expect our data set to be noisy (some rats might have incorrect genotypes and/or phenotypes), we'd like to know when we can confidently say anything about the genotype - phenotype link from $\Delta \hat{p}$.

We therefore need to determine what constitutes a large difference $\Delta \hat{p}$. In statistical term, it's called assessing significance.

Let's do a thought experiment first. Assume that there are no differences between genotypes G0 and G1 (or in formal terms that $\Delta p = 0$). The (observed) proportion of High levels in the whole population is $\hat{p}_H = \frac{25+25}{25+25+24+26} = \frac{1}{2}$. We had $n_0 = 49$ G0 rats and $n_1 = 51$ G1 and in our original data set. Let's create fake data sets by assuming that each rat has High levels with probability $p_H = 0.5$ (and therefore Low levels with probability $p_L = 0.5$). Although called fake, we could have observed every single one of those data sets if $p_H^0 = p_H^1 = p_H = 0.5$.

The function create_fake() does exactly that: it creates a fake dataset (as stated before), computes its contigency table and compute $\Delta \hat{p}$ on that data set. Run it several times (by clicking the Run code button).

# create_fake <- function() {
#   data <- data.frame(
#     Rat  = 1:100, 
#     Genotype  = rep(c("G0", "G1"), times = c(49, 51)), 
#     Phenotype = factor(sample(c("Low", "High"), size = 100, replace = TRUE), levels = c("Low", "High")))
#   tab <- with(data, table(Genotype, Phenotype))
#   deltap <- -100 * diff(tab[, 2] / rowSums(tab))
#   print(tab)
#   cat(paste0("Delta p = ", round(deltap, digits = 2), "%"))
#   invisible(deltap)
# }

create_fake()

Compare (mentally) the value $\Delta \hat{p}$ computed on the original data set to the range of values computed on those fake data sets.

question("Would you say $\\Delta \\hat{p}$ is", 
         answer("Within the range", correct = TRUE), 
         answer("Outside of the range", correct = FALSE), 
         post_message = "$\\Delta \\hat{p}$=2% is well within the range: most values are between -14% and 14%."
         )

Assessing significance (II)

The fake datasets from the previous section informed us that, even when there are no differences between G0 and G1, it's quite common to observe values of $\Delta \hat{p}$ as low as -15% and as high as 15% based on sheer luck (see following graph, based on 200 calls to create_fake()). $\Delta \hat{p} = 2$% is therefore quite typical and not large enough to conclude that G0 and G1 are different.

hist(values, main = NULL, xlab = expression(Delta*hat(p)~from~fake~samples), freq = TRUE)
abline(v = 2, col = "red")
text(x = 2, y = 30, font = 2, col = "red", expression(Delta*hat(p)==2), adj = c(0, 0.5))

question("Based on the previous graph, what ranges of values would you consider atypical for $\\Delta \\hat{p}$?", 
         answer("[-100, -20]", correct = TRUE), 
         answer("(-20, -10]"),
         answer("(-10, -10)"), 
         answer("(10, 20]"),
         answer("(20, 100]", correct = TRUE),
         allow_retry = TRUE, 
         post_message = "Values of $\\Delta \\hat{p}$ larger than 18% and smaller than -18% are observed less than 5% of the time. They are therefore quite atypical if `G0` and `G1` are not associated to IgG1 levels."
         )

Assessing significance (III)

Based on the previous graph, we can formulate the following rules:

If $|\Delta \hat{p}| < 18$, we can't conclude on differences between G0 and G1 and therefore on association between either genotype and either phenotype.
If $\Delta \hat{p} > 18$, G0 is significantly associated with High levels (and by symmetry G1 is significantly associated with Low levels)
If $\Delta \hat{p} < -18$, G0 is significantly associated with Low levels (and by symmetry G0 is significantly associated with High levels)

Given our sample size ($n = 100$) and our data set, we conclude in this example that there is no association between the genotype and the phenotype.

Remark: What we just did to find associations is called a test of equality of proportions and can be used in a much broader context. Formally and with our notation, we tried to compare the so-called null hypothesis $H_0: p_H^0 = p_H^1$ against the so-called alternative hypothesis $H_1: p_H^0 \neq p_H^1$ using the parameter $\Delta p = p_H^0 - p_H^1$. Since the true value of $\Delta p$ is unkown, we estimated it from the observations by means of $\Delta \hat{p}$. We then ran simulations to find the typical range of $\Delta \hat{p}$ under $H_0$ and look whether $\Delta \hat{p}$ fell in that range or not. Keep a few of those terms in mind, you'll meet them again next year...

Continuous phenotype

We now assume that the phenotype is continuous and takes values in $\mathbb{R}$ (or $\mathbb{R}_+$ or some interval). No matter the nature of our phenotype, we still need to collect data.

We consider a population of $n = 100$ rats. For each rat $i \in {1, \dots\, n}$, we measure its

phenotype $x_i$ (IgG1 level): any value in $\mathbb{R}$
genotype $g_i$ (allele of gene G): G0 or G1

The data might look like this (first 10 rats):

knitr::kable(head(data_cont))

Since we're looking at continuous values, we need a different summary. Let's compute the average IgG1 level in each genotype.

with(data_cont, tapply(X = Phenotype, INDEX = Genotype, FUN = mean))

and have a look at the raw data

boxplot(Phenotype ~ Genotype, data = data_cont, col = c("lightgreen", "lightblue"), 
        outline = FALSE)
with(data_cont, 
     points(x = ifelse(Genotype == "G0", 1, 2) + rnorm(100, sd = 0.05), 
            y = data_cont$Phenotype, 
            pch = 19, cex = 0.8, 
            col = ifelse(Genotype == "G0", "darkgreen", "darkblue"))
     )

Note $\mu_0$ and $\mu_1$ the (unkown) average values for each genotype. Noting $\Delta \mu = \mu_0 - \mu_1$, we'd like to test $H_0: \Delta \mu = 0$ against $H_1: \Delta \mu \neq 0$ as before.

Once again, since we don't have access to the true values $\mu_0$ and $\mu_1$, we need to replace them by noisy versions $\hat{\mu}_0$ and $\hat{\mu}_1$ estimated from our dataset.

quiz(caption = "Estimators", 
     question("Pick the right value for $\\hat{\\mu}_0$", 
              answer("9.88", message = "You may have mixed up `G0` and `G1`"), 
              answer("10.55", TRUE), 
              answer("0.67"), 
              answer("-0.67"), 
              random_answer_order = TRUE, 
              allow_retry = TRUE), 
     question("Pick the right value for $\\hat{\\mu}_1$", 
              answer("10.55", message = "You may have mixed up `G0` and `G1`"), 
              answer("9.88", TRUE), 
              answer("0.67"), 
              answer("-0.67"), 
              random_answer_order = TRUE, 
              allow_retry = TRUE), 
     question("Pick the right value for $\\Delta \\hat{\\mu}$", 
              answer("9.88"), 
              answer("10.55"), 
              answer("0.67", TRUE), 
              answer("-0.67", message = "You may have mixed up `G0` and `G1`"), 
              random_answer_order = TRUE, 
              allow_retry = TRUE) 
     )

Comparing means

Our best estimate of $\Delta \mu$ from the data is $\Delta \hat{\mu} = 0.67$. We'll use the same procedure as before to asses if this value is small and typical under $H_0$ (in the sense that we're likely to observe on real data sets if $\Delta \mu = 0$) or large and atypical under $H_0$ (in the sense that we're unlikely to observe on real data sets if $\Delta \mu = 0$).

In our data set, we have:

$n_0 = 49$
$n_1 = 51$
$\hat{\mu} = 10.21$ (average over rats from both genotypes, assuming $\Delta \mu = 0$).

To repeat the procedure used in the discrete setting, we need to create fake data sets with 49 rats from G0, 51 from G1 and all phenotypes sampled from the same distribution (\emph{i.e.} independently of the genotype). Unlike the discrete setting, where the value was either Low or High, the value can be any number here so we need to specify a proper probability distribution on $\mathbb{R}$. We'll use the famous gaussian distribution.

par(xpd = TRUE, mar = c(2, 2, 2, 2))
curve(dnorm, from = -3, to = 3, main = "Gaussian density", 
      axes = FALSE, xlab = NA, ylab = NA)
abline(h = 0, v = 0)
points(0, 0, pch = 19)
text(x = 0, y = 0, labels = expression(mu), pos = 1, cex = 2)
arrows(x0 = -1, x1 = 1, y0 = dnorm(1), code = 3, length = 0.1, lwd = 2)
text(x = 0, y = dnorm(1), pos = 3, cex = 2, labels = expression(2*sigma))

The gaussian distribution has two parameters: - a mean $\mu$ that we can set equal to $\hat{\mu} = 10.21$ - a variance $\sigma^2$.

We can estimate $\sigma^2$ in one step from the whole dataset, or in two steps from each genotype first and then pool the results. For reasons that will be much clearer next year when you study gaussian variables and the $t-test$, it's smarter to use the two-steps estimator.

The variance in genotype G0 is $\hat{\sigma}^2_0 = 1.02$
The variance in genotype G1 is $\hat{\sigma}^2_1 = 1.15$
The pooled variance is $\hat{\sigma}^2 = \frac{(n_0 - 1)\hat{\sigma}^2_0 + (n_1 - 1)\hat{\sigma}^2_1}{n_0 + n_1 - 2} = 1.09$

Compare that to the one-step estimator $\hat{\sigma}^2 = 1.13$ computed directly on the whole population.

Remarks Note that $\hat{\sigma}^2$ is nothing but a weighted average of $\hat{\sigma}^2_0$ and $\hat{\sigma}^2_1$. The weights are not exactly $n_0$ and $n_1$ but rather $n_0 - 1$ and $n_1 - 1$ for reasons that would hard to understand right now but will make sense next year (in a nutshell, you pay the price of estimating the mean phenotype of each group, which costs you one degree of freedom, don't worry if it doesn't make sense yet).

We can now create many fake data sets using create_fake_cont(). Run the function several times to get a feeling of the typical values.

# create_fake_cont <- function() {
#   data <- data.frame(
#     Rat  = 1:100, 
#     Genotype  = rep(c("G0", "G1"), times = c(49, 51)), 
#     Phenotype = rnorm(100, mean = 10.21, sd = 1.08))
#   deltamu <- diff(with(data, tapply(X = Phenotype, INDEX = Genotype, FUN = mean)))
#   cat(paste0("Delta mu = ", round(deltamu, digits = 2)))
#   invisible(deltamu)
# }

create_fake_cont()

Compare (mentally) the value $\Delta \hat{\mu}$ computed on the original data set to the range of values computed on those fake data sets.

question("Would you say that $\\Delta \\hat{\\mu}$ is", 
         answer("Within the range", correct = FALSE), 
         answer("Outside of the range", correct = TRUE), 
         post_message = "$\\Delta \\hat{\\mu}$=0.67 is outside the range: we need to be lucky to observe greater values."
         )

Assessing significance (II)

The fake datasets from the previous section informed us that, when there are no differences between G0 and G1, it's quite uncommon to observe values of $\Delta \hat{\mu}$ below -0.4 and above 0.4 (see graph, based on 400 calls to create_fake_cont()). $\Delta \hat{\mu} = 0.67$ is therefore quite atypical and very unlikely to be observed by chance if $\Delta \mu=0$ is true. Among the 400 data sets simulated under $H_0$, only 1 resulted in a value $\Delta \hat{\mu}$ larger than 0.67.

hist(values_cont, main = NULL, xlab = expression(Delta*hat(mu)~from~fake~samples), 
     freq = TRUE, breaks = 40)
abline(v = 0.67, col = "red")
text(x = 0.67, y = 30, font = 2, col = "red", expression(Delta*hat(mu)==0.67), adj = c(1, 0.5))

question("Based on the previous graph, what ranges of values would you consider atypical for $\\Delta \\hat{\\mu}$?", 
         answer("<-0.4", correct = TRUE), 
         answer("[-0.4, -0.2)"),
         answer("[-0.2, 0.2]"), 
         answer("(0.2, 0.4]"),
         answer("$\\geq$ 0.4", correct = TRUE),
         allow_retry = TRUE, 
         post_message = "Values of $\\Delta \\hat{\\mu}$ larger than 0.42 and smaller than -0.42 are observed less than 5% of the time. They are therefore quite atypical if `G0` and `G1` are the same average IgG1 level."
         )

Remark Our observed $\Delta \hat{\mu} = 0.67$ is unlikely to arise in data sets purposefully generated under $\Delta \mu = 0$. We observe as extreme values ($\geq 0.67$ in absolute value) in only 1 in 400 fake samples. In statistical terms, the p value is 1/400 (or 0.0025). Keep that concept in mind, it will be handy next year and when reading research articles.

Assessing significance (III)

Given our sample size ($n = 100$) and our data set, we conclude in this example that G0 is associated to higher IgG1 levels but we were able to formalize our intuition.

Remark: What we just did to find associations is called a test of equality of means and can be used in a much broader context. When the phenotypes have a Gaussian distribution, we don't actually need to generate fake data sets as we have access to analytical formula to calibrate the range of typical/atypical values. This is much cheaper from a computational point of view and very useful when you're looking at very atypical values. You'll again study this in more details next year.

Towards Interval Mapping

In the previous section, we presented the principle of association-testing in simple settings. We built rules to help us decide whether a loci was siginificantly associated to a phenotypic response.

In this section, we go further and quantify the evidence supporting this association. To do so, we need to compute and compare two quantities: - the probability of observing our datset under $H_0$ (i.e if there is no effect of the genotype at locus $G$ on IgG1 levels) - the probability of observing our datset under $H_1$ (i.e if there is an effect of the genotype at locus $G$ on IgG1 levels)

For ease of exposition, we'll focus on the continuous setting and introduce the concepts of likelihood and odds-ratio.

Gaussian density

When comparing the average IgG1 levels under genotype G0 and G1, we assumed that the phenotypes followed a gaussian distribution: - with mean $\mu_0$ and variance $\sigma^2$ for genotype G0 - with mean $\mu_1$ and variance $\sigma^2$ for genotype G1

par(xpd = TRUE, mar = c(2, 2, 2, 2))
curve(dnorm(x, mean = 0.5), 
      from = -4, to = 4, main = "Gaussian density for G0 and G1", 
      axes = FALSE, xlab = NA, ylab = NA, col = "lightgreen", 
      lwd = 3)
curve(dnorm(x, mean = 0.0), add = TRUE, col = "lightblue", lwd = 3)
abline(h = 0)
abline(v = 0.5, col = "lightgreen", lty = 2, lwd = 3)
points(0.5, 0, pch = 19, col = "lightgreen")
text(x = 0.5, y = 0, labels = expression(mu[0]), adj = c(0, 1), cex = 2, col = "lightgreen")
abline(v = 0, col = "lightblue", lty = 2, lwd = 3)
points(0, 0, pch = 19, col = "lightblue")
text(x = 0, y = 0, labels = expression(mu[1]), adj = c(1, 1), cex = 2, col = "lightblue")
arrows(x0 = -0.5, x1 = 1.5, y0 = dnorm(1), code = 3, length = 0.1, lwd = 2)
text(x = 0.5, y = dnorm(1), pos = 3, cex = 2, labels = expression(2*sigma))
text(x = 4, y = 0, labels = "Phenotype", pos = 1)

A gaussian variable $X$ with mean $\mu$ and variance $\sigma^2$ is noted $X \sim \mathcal{N}(\mu, \sigma^2)$ and has density (remember your homework?): $$ f_{\mu,\sigma^2}(x) = \frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(x - \mu)^2}{2\sigma^2}} $$ The formula may look daunting at first but it corresponds to a bell-shaped curve (like the two shown above). A few important points: - the density is unimodal (it has a unique maximum) at $x=\mu$ - the density goes to $0$ when $|x|$ goes to $\infty$. - the spread of the density is proportional to $\sigma$.

Informally, the density tells us what values of $X$ we're likely to observe. For any interval of infinitesimal length $dx$: $$ \mathbb{P}(X \in [x, x+dx]) = f_{\mu,\sigma^2}(x) dx $$

quiz(caption = "Properties of gaussian densities", 
     question(
       "Are you more likely to observe values of X", 
       answer("Close to $\\mu$", correct = TRUE), 
       answer("Far from $\\mu$", correct = FALSE), 
       random_answer_order = TRUE, allow_retry = TRUE,
       post_message = "Since the density is highest closer to $\\mu$, you're more likely to observe values close to $\\mu$." 
       ), 
     question(
       "When $\\sigma$ increases, the range of values you're likely to observe", 
       answer("increases", correct = TRUE), 
       answer("decreases", correct = FALSE), 
       random_answer_order = TRUE, allow_retry = TRUE,
       post_message = "When $\\sigma$ increases, the spread of the density increases. The range of value with relatively high density (the ones you're likely to observe) therefore increases." 
     )
)

Likelihood function

If we have independent variables $X_1$ and $X_2$ with means $\mu_1$ and $\mu_2$ and identical variance $\sigma^2$, we have a similar formula: $$ \mathbb{P}(X_1 \in [x_1, x_1+dx_1]; X_2 \in [x_2, x_2+dx_2]) = \mathbb{P}(X_1 \in [x_1, x_1+dx_1])\mathbb{P}(X_2 \in [x_2, x_2+dx_2]) = f_{\mu_1,\sigma^2}(x_1)f_{\mu_2,\sigma^2}(x_2)dx_1 dx_2 $$

This formula generalizes to an arbitrary number $n$ of independent variables. It also reveals a strong connection between Probability of observation and density function. Let's formalize this idea with the likelihood function.

Consider a set of $n$ independent observations $(x_1, \dots, x_n)$ from gaussian variables $\mathcal{N}(\mu, \sigma^2)$. The likelihood of $(x_1, \dots, x_n)$

$$ L(x_1, \dots, x_n; \mu, \sigma) = f_{\mu,\sigma^2}(x_1) \times \dots \times f_{\mu,\sigma^2}(x_n) = \prod_{i=1}^n f_{\mu,\sigma^2}(x_i) $$

Up to an irrelevant infinitisemal multiplicative factor, $L(x_1, \dots, x_n; \mu, \sigma)$ is the probability of observing $(x_1, \dots, x_n)$ when sampling $n$ independent gaussian variables $\mathcal{N}(\mu, \sigma^2)$.

Remark: Since $(x_1, \dots, x_n)$ depend only on the data, we often only consider $L$ as a function of $\mu$ and $\sigma$. Furthermore and since $L$ is a product of many small terms, it is often more convenient in practice to compute and work with the log-likelihood rather than with the likelihood.

Likelihood under $H_0$ (I)

Under $H_0$, all observations come from the same distribution $\mathcal{N}(\mu, \sigma^2)$. We don't know the true values of $\mu$ and $\sigma^2$ but we can nevertheless compute the log-likelihood $L_{H_0}$ of our observations for test values.

# log_likelihood_H0 <- function(mu = 10, sigma = 1) {
#   sum(dnorm(data_cont$Phenotype, mean = mu, sd = sigma, log = TRUE))
# }

Using log_likelihood_H0(), we can try to find a combination of parameters that maximizes the (log-)likelihood. Using log_likelihood_H0() in the following snippet, find the optimal values of $\mu$ and \sigma$.

log_likelihood_H0(mu = 10, sigma = 1)
log_likelihood_H0(mu = 15, sigma = 5)

Likelihood under $H_0$ (II)

This is of course a maximisation problem. The optimal values $\hat{\mu}$ and $\hat{\sigma}$ can be found by setting the partial derivatives $\frac{\partial L(\mu, \sigma)}{\partial \mu}$ and $\frac{\partial L(\mu, \sigma)}{\partial \sigma}$ to $0$ and solving the equations for $\mu$ and $\sigma$. If you're brave enough, you can do that (working with the log-likelihood makes derivations much easier) and find the exact relation between $\hat{\mu}, \hat{\sigma}$ and $(x_1, \dots, x_n)$.

We look at the whole curve $(\mu, \sigma) \mapsto L_{H_0}(\mu, \sigma)$ using a contour plot. Dark colors correspond to high values of $L_{H_0}$ and light colors colors to low values and levels of the function are shown as blacks contour.

f <- Vectorize(log_likelihood_H0)
x <- seq(5, 15, length.out = 101)
y <- seq(0.8, 1.4, length.out = 101)
z <- outer(x, y, 
           FUN = function(x, y) {f(mu = x, sigma = y)}
)
par(mar = c(4, 4, 2.5, 1))
image(x, y, z, 
      xlab = expression(mu), 
      ylab = expression(sigma), 
      main = expression(logL[H[0]](mu, sigma)))
contour(x, y, z, levels = c(-200, seq(-300, by = -300, length.out = 5)), add = TRUE)
points(x = 10.21, y = 1.13, pch = 19, col = "white")
segments(x0 = 10.21, y0 = 0, y1 = 1.13, lty = 2, col = "white")
text(x = 10.21, y = 1, labels = expression(hat(mu)==10.21), pos = 4, col = "white")
segments(x0 = 5, x1 = 10.21, y0 = 1.13, lty = 2, col = "white")
text(x = 7, y = 1.13, labels = expression(hat(sigma)==1.13), pos = 3, col = "white")

The log-likelihood is maximized for $\hat{\mu} = 10.21$ and $\hat{\sigma} = 1.13$. Do those values remind you of something?

For illustrative purpose, we also look at the transect $\mu \mapsto L_{H_0}(\mu, \sigma)$ for $\sigma = \hat{\sigma} = 1.13$.

par(mar = c(4.3, 4.3, 1, 1), xpd = FALSE)
f <- Vectorize(log_likelihood_H0, vectorize.args = "mu")
curve(f(mu = x, sigma = 1.13), from = 5, to = 15,
      xlab = expression(mu),
      ylab = expression(logL[H[0]](mu,sigma==1.13)))
abline(v = 10.21, lty = 2)
text(x = 10.21, y = -1100, labels = expression(hat(mu)==10.21),
     pos = 4)

We call $L_{H_0}(\hat{\mu}, \hat{\sigma})$ the likelihood of $H_0$.

Likelihood under $H_1$ (I)

We can do the same thing under $H_1$ with a a few notable differences. - there are now two distributions $\mathcal{N}(\mu_0, \sigma^2)$ and $\mathcal{N}(\mu_1, \sigma^2)$ - the log-likelihood should treat rats with genotype G0 and G1 separately.

We thus consider the following function

$$ L_{H_1}(x_1, \dots, x_n; \mu_0, \mu_1, \sigma) = \prod_{i:g_i = G0} f_{\mu_0,\sigma^2}(x_i) \prod_{i: g_i = G1} f_{\mu_1,\sigma^2}(x_i) $$

question("How are the 100 rats split across the 2 products?",
         answer("49 / 51", correct = TRUE), 
         answer("50 / 50", message = "Really? How many rats with genotype `G0` do we have."), 
         answer("51 / 49", message = "Maybe you mixed up `G0` and `G1`"), 
         allow_retry = TRUE, random_answer_order = TRUE)

Again, we don't know the true values of $\mu_0$, $\mu_1$ and $\sigma^2$ but we can nevertheless compute the log-likelihood $L_{H_1}$ of our observations for test values and try to maximize it.

# log_likelihood_H1 <- function(mu0 = 10, mu1 = 10, sigma = 1) {
#   with(data_cont, 
#        sum(dnorm(Phenotype[Genotype == "G0"], mean = mu0, sd = sigma, log = TRUE)) + 
#          sum(dnorm(Phenotype[Genotype == "G1"], mean = mu1, sd = sigma, log = TRUE))
#   )
# }

log_likelihood_H1(mu0 = 10, mu1 = 10, sigma = 1)
log_likelihood_H1(mu0 = 15, mu1 = 15, sigma = 5)

Likelihood under $H_1$ (II)

Remark: This is again of course a maximisation problem that you can solve by setting the partial derivatives to $0$ if you're brave enough.

We look at the whole curve $(\mu_0, \mu_1) \mapsto L_{H_1}(\mu_0, \mu_1, \sigma)$ for $\sigma = 1.08$ (value found in the previous section) using a contour plot. Dark colors correspond to high values of $L_{H_1}$ and light colors colors to low values and levels of the function are shown as blacks contour.

f <- Vectorize(log_likelihood_H1, vectorize.args = c("mu0", "mu1"))
x <- seq(5, 15, length.out = 101)
y <- seq(5, 15, length.out = 101)
z <- outer(x, y, 
           FUN = function(x, y) {f(mu0 = x, mu1 = y, sigma = 1.08)}
)
par(mar = c(4, 4, 2.5, 1))
image(x, y, z, 
      xlab = expression(mu[0]), 
      ylab = expression(mu[1]), 
      main = expression(logL[H[1]](mu[0], mu[1], sigma==1.08)))
contour(x, y, z, add = TRUE)
points(x = 10.55, y = 9.88, pch = 19, col = "white")
segments(x0 = 10.55, y0 = 5, y1 = 9.88, lty = 2, col = "white")
text(x = 10.55, y = 9, labels = expression(hat(mu)[0]==10.55), pos = 4, col = "white")
segments(x0 = 5, x1 = 10.55, y0 = 9.88, lty = 2, col = "white")
text(x = 9, y = 9.88, labels = expression(hat(mu)[1]==9.88), 
     pos = 3, col = "white")

The log-likelihood is maximized for $\hat{\mu}_0 = 10.55$ and $\hat{\mu}_1 = 9.88$.

Remark Since it's very hard to show the map $(\mu_0, \mu_1, \sigma) \mapsto L_{H_1}(\mu_0, \mu_1, \sigma)$, I cheated and directly plugged in he optimal value of $\sigma$.

We call $L_{H_1}(\hat{\mu}_0, \hat{\mu}_1, \hat{\sigma})$ the likelihood of $H_1$.

Log of Odds Ratio (LOD)

We're now ready to compute the evidence in favor of $H_1$ (versus $H_0$). We call it the LOD score (or Log of Odds Ratio score) and define it as

$$ LOD = \log_{10}\left( \frac{L_{H_1}(\hat{\mu}0, \hat{\mu}_1, \hat{\sigma})}{L{H_0}(\hat{\mu}, \hat{\sigma})} \right) $$

Note that there is a slight abuse of notation here as the $\hat{\sigma}$ in the numerator ($\hat{\sigma}=1.08$) is not the same as the one in the denominator ($\hat{\sigma}=1.13$).

As stated in Lander and Botstein (1989), the LOD score essentially indicates how much more probable the data are to have arisen assuming the presence of a QTL [read $H_1: \Delta \mu \neq 0$] than assuming its absence [read $H_1: \Delta \mu = 0$].

We usually consider LOD higher than 2 or 3 as strong evidence for association between the loci and the phenotype, or in other words, as strong evidence for the presence of a QTL.

question("If we want the presence of a QTL to be 1000 times more probable than its absence, we must choose have:",
         answer("LOD > 2"), 
         answer("LOD > 3", correct = TRUE), 
         answer("LOD > 6.9"), 
         allow_retry = TRUE, random_answer_order = TRUE, 
         post_message = "According to the formula, the fraction should be higher than 1000 and thus the LOD should be higher than $\\log_{10}(1000) = 3$.")

In our data set, the LOD is $2.01$ and the effect of the QTL is $\Delta \hat{\mu} = 0.67$. It's quite small (the coefficient of variation $\frac{\Delta \hat{\mu}}{\hat{\mu}} = \frac{0.67}{10.21} = 0.065$ or 6.5%) but nevertheless significant: you're unlikely to observe it base on pure randomness.

Conclusion

Armed with your new knowledge, you can now test for a genotype / phenotype association at any candidate locus. The next tutorial focuses on testing across many loci.

mahendra-mariadassou/L1fdvGenetics documentation built on May 20, 2021, 10:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

mahendra-mariadassou/L1fdvGenetics
Companion self-learning R package to the Genetics Project of FDV bachelor

In mahendra-mariadassou/L1fdvGenetics: Companion self-learning R package to the Genetics Project of FDV bachelor

Introduction

Simple Marker Analyses

Discrete phenotype

Comparing proportions

Assessing significance

Assessing significance (II)

Assessing significance (III)

Continuous phenotype

Comparing means

Assessing significance (II)

Assessing significance (III)

Towards Interval Mapping

Gaussian density

Likelihood function

Likelihood under $H_0$ (I)

Likelihood under $H_0$ (II)

Likelihood under $H_1$ (I)

Likelihood under $H_1$ (II)

Log of Odds Ratio (LOD)

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

mahendra-mariadassou/L1fdvGenetics Companion self-learning R package to the Genetics Project of FDV bachelor

In mahendra-mariadassou/L1fdvGenetics: Companion self-learning R package to the Genetics Project of FDV bachelor

Introduction

Simple Marker Analyses

Discrete phenotype

Comparing proportions

Assessing significance

Assessing significance (II)

Assessing significance (III)

Continuous phenotype

Comparing means

Assessing significance (II)

Assessing significance (III)

Towards Interval Mapping

Gaussian density

Likelihood function

Likelihood under $H_0$ (I)

Likelihood under $H_0$ (II)

Likelihood under $H_1$ (I)

Likelihood under $H_1$ (II)

Log of Odds Ratio (LOD)

Conclusion

R Package Documentation

Browse R Packages

We want your feedback!

mahendra-mariadassou/L1fdvGenetics
Companion self-learning R package to the Genetics Project of FDV bachelor