knitr::opts_chunk$set(echo = FALSE, warnings = F, message = F)
# Prep stuff
## libraries
library(readxl)
library(ggpubr)
library(cowplot)
library(dplyr)
library(pander,warn.conflicts = F)
library(plotrix)

#load data
file. <- "Meillere_et_al_2015_telomere_data.xlsx"
dat.full <- readxl::read_xlsx(file.,sheet = "orig")

# clean up data
dat.full$cort[is.na(dat.full$cort)] <-NA
dat.full$cort <- as.numeric(dat.full$cort)
dat.full$Group <- with(dat.full,
                       paste(treatment,sex))

dat.full$Group <- gsub("control","Control",dat.full$Group)
dat.full$Group <- gsub("disturbed","Noise exposure",dat.full$Group)

dat.full$Group <- gsub(" F","\nFemales",dat.full$Group)
dat.full$Group <- gsub(" M","\nMales",dat.full$Group)
dat.full$cort.log <- log(dat.full$cort)
dat.full$telomere.length.log <- log(dat.full$telomere.length)


names(dat.full) <- gsub("treatment",
                        "Treatment",
                        names(dat.full))

names(dat.full) <- gsub("sex",
                        "Sex",
                        names(dat.full))
# Set up supset of focal columns
dat.focal.cols <- dat.full[,c("Treatment","Sex","telomere.length","cort")]
dat.foc.cols.rnd <- dat.focal.cols
dat.foc.cols.rnd$telomere.length <- dat.foc.cols.rnd$telomere.length %>% round(2)
dat.foc.cols.rnd <- dat.foc.cols.rnd  %>% arrange(Treatment,telomere.length)

dat.foc.cols.rnd$telo.bin.07 <- ifelse(dat.foc.cols.rnd$telomere.length < 0.8,
                                  "*","")
dat.foc.cols.rnd$telo.bin.08 <- ifelse(dat.foc.cols.rnd$telomere.length < 0.9 &  
                                  dat.foc.cols.rnd$telomere.length >= 0.8,
                                  "*","")
dat.foc.cols.rnd$telo.bin.09 <- ifelse(dat.foc.cols.rnd$telomere.length < 1 &  
                                  dat.foc.cols.rnd$telomere.length >= 0.9,
                                  "*","")

# dat.foc.cols.rnd$telo.bin.10 <- ""
# dat.foc.cols.rnd$telo.bin.11 <- ""

Introduction

In this exercise you will work with raw data from a group of ecologists (Meillere et al. 2015) who study how stressful environmental conditions impact birds, In particular, they examine telomeres to determine if stress is reducing the health of birds that otherwise look fine.

Telomeres long, repetitive sequences of DNA at the end of eukaryotic chromosomes. Telomeres are part of a system that projects chromosomes from being accidentally treated like damage DNA the should be broken down and recycled. Telomeres naturally shorten in most cells during mitosis and once they become too short apoptosis is triggered. In cancer telomere shortening often fails to trigger apoptosis, facilitating uncontrolled cell proliferation.

Telomeres shorten normally over the lifespan of an organism. It has been shown that stressful conditions can accelerate telomere shortening. This phenomena is studied by many branches of biology. Ecologists are interested in telomeres because telomere length can be used to assess the overall health of an organism.

Many factors potentially increase the rate of telomere shortening, including poor nutrition, heat stress, hosting parasites, and living with the risk of being eaten by predators. Conditions created by humans, such as noise and light pollution, also place stress on animals and there is evidence that this can increase the rate of telomere shortening in adult and young organisms.

Assignment

In the table on the following page is some of the original data from the paper "Traffic noise exposure affects telomere length in nestling house sparrows" by Meillere et al. (2015). The exercises below you through a basic work up of this raw data so you can determine if experimental exposure to noisy conditions reduces telomere length in birds.

Read through the following exercise and fill out the worksheet at the end of packet. When prompted enter answers in on the associated TopHat assignment. Be sure to answer any other additional questions on TopHat.

Vocab

Key terms in this exercise include:

telomere, kilobase, standard error, standard deviation, confidence interval, boxplot, errorplot, histogram, corticosterone, sample size, mean, median, mode, outliers, correlation, scatterplot

\newpage

Bird Telomere Length & Corticosterone Data

pander(dat.foc.cols.rnd)

\newpage

Data dictionary

The meaning of the columns:

  1. Treatment = experimental treatment; "control" = exposed only to natural sounds; "disturbed" = exposed to artificial noise
  2. Sex = sex of baby bird (M = male, F = Female)
  3. telomere.length = length of telomeres at the end of the DNA strand (in kilobases). A length of 1.34 is 1.34*1000 = 1340 kilobases
  4. cort = concentration of corticosterone in blood
  5. telo.bin.07 = is the telomere length between 0.7 and 0.799?
  6. telo.bin.08 = is the telomere length between 0.8 and 0.899?
  7. telo.bin.09 = is the telomere length between 0.9 and 0.999?

Summary statistics

[ ] TASK: Determine the following things for these data

  1. Sample size in each group (N; the number of observations)
  2. Minimum: Smallest telomere size
  3. Median: The approximate median value. The median is the "middle value". If you have an odd number of observation it is the exact middle. If you have an even number of observations its the mid-point between the two observations tied for the middle.
  4. Maximum: The Largest telomere
  5. Outliers/Extreme observations Are there any "outliers"? (extreme observations / values MUCH bigger or MUCH smaller than the rest?; Indicate YES/NO)

Write your answers in the table on the worksheet, then enter the appropriate info into the TopHat assignment.

Boxplots & Histograms

Tabulate the number of observations per "bin"

We can assigned each telomere length to a "bin" of similar values. I've decided all telomeres between 0.7 and 0.799 go into the "0.7" bin, all telomeres between 0.8 and 0.899 go in the "0.8" bin, etc. Basically we're rounding down each value to the nearest 0.1. The 0.7, 0.8, and 0.9 bins have already been marked on the original data table as an example.

[ ] TASK: Count up the number of telomeres in each "bin". Put a star next to the mode (most common observation) for each group.

Enter you tabulations into the table on the worksheet.

Building histograms

Histograms are useful for examining the distribution of data. We can turn our tabulations into a histograms For each bin we can fill in 1 box of the grid per observation in that bin. For the "disturbed treatment" I have penciled in 0 boxes for the 0.7 bin (which spans from 0.7 to 0.8 on the graph), 3 boxes for the 0.8 bin, and 4 boxes for the 0.9 bins.

[ ] Task: Make the rest of the histogram for the control and disturbed group. Draw your median as a vertical line through the distribution.

Enter your results into the grids on the attached worksheet.

Interpreting boxplots

Boxplots are another tool for examining the distribution of data. Boxplots have the following elements:

  1. A single line which represents the median (middle value, aka the 50% percentile)
  2. A box which represents observations falling between the 50th & 75th percentile (between the median and the approximate 75th ranked observation). This is the upper quartile (you don't need to know that term but you do need to understand the concept) .
  3. A box which presents observations falling 25th and 50th percentile. This is the lower quartile.
  4. Together the 2 small boxes combine into a larger box. This is the interquartile range and encompass 50% the data. (You don't need to remember this term but you should know what it is conceptually).
  5. A vertical line which extends up from the 75th percentile to approximately to the maximum (outliers are ignored!). This line represents the upper quartile.
  6. A vertical line which extends down from the 25th percentile approximately to the minimum value (outliers are ignored!). This line represents the lower quartile.
  7. Dots for outliers: curiously large or small observations. Note that the definition of outlier is arbitrary and it doesn't mean anything is wrong with the data.

Important note: A confusing aspect about boxplots is that they usually remove outliers from determination of the upper and lower quartiles. The lines that extend up and down therefore don't necessarily go all the way to the true maximum and minimum. Because if they are present outliers are treated separately the top and bottom vertical lines (sometimes called "Whiskers") can be considered the trimmed maximum and trimmed minimum. If present, the outliers get trimmed off and the minimum and maximum re-calculated. (You don't need to know these terms, but do need to understand the concept).

Percentiles: Percentiles relate to the relative rank of an observation. If you have 100 observations, the 25th percentile is 25th ranked observation. The median is the 50th percentile, or middle-ranked. The 75th percentile is the 75th ranked: 75 observations are lower, and 24 are bigger. Percentiles make the most sense where there are 100 observations; the math for how it works for different numbers of observation isn't important for this course. How ties are dealt with also isn't a focus.

Not always, but sometimes boxplots plot points for each observation overlayed on the boxes and vertical lines.

TASKS:

  1. [ ] Label the key parts of the boxplots displayed on the attached worksheet
  2. [ ] Write your median, min, and max values next to the appropriate places on the boxplot.
  3. [ ] Did the computer decide there were outliers? Label them if present.
  4. [ ] Calculate the percentage of data points that fall into the lower boxes.
  5. [ ] Calculate the percentage of data points that fall into the upper boxes.

Plotting means & error bars

Histograms and boxplots are frequently used for understanding the overall distribution of the data and are very good at showing things like the amount of spread or variation in the data.

In publications scientists frequently focus just on the mean value (aka "average") and make plots showing how means vary between groups of interest, such as control vs. experimental groups. When making plots with the mean we usually also draw error bars which extend up and down from the mean. These are sometimes called error plots.

Error bars

Error bars extend above and below a mean. Plots with error bars are frequently called error plots. Its confusing, but the length of a single error bar can represent usually 1 of 3 things:

  1. The standard deviation (SD)
  2. The standard error (SE)
  3. Part of a 95% confidence interval (95% CI).

All of these things are related but unfortunately they indicate different things. In order to accurately interpret scientific results you need to be familiar with each one.

Mean, standard deviation & standard error

The table below has the mean, standard deviation and other summaries for the telomere data. The standard deviation (SD) represents the amount of variation in the data. When lots of data is spread far away from the mean, the standard deviation (SD) is big. When most of the values are the same, the SD is small.

In some scientific fields people make plots showing the mean and draw error bars representing the standard deviation. The top of the upper error bar is at the mean + 1 SD, and the bottom of the lower error bars is at the mean - 1 SD. You see this sometimes in biology but its becoming less common.

The standard error (SE) is the standard deviation divided by the square root of the sample size: SE = SD/sqrt(N). Don't worry - we won't focus on calculating this by hand. The top of the upper error bar is at the mean + 1 SE, and the bottom of the lower error bars is at the mean - 1 SE. The standard error tells us how confident we should be about our estimate of the mean. Frequently in biology plots are made using the standard error but its better to use a confidence interval.

The 95% confidence interval extends 2 x SE above the mean and 2 X SE below. To be specific, the interval is everything from the bottom of the lower bar to the top of the upper bar. One arm of the interval is equal to about twice the standard error. So a confidence interval spans from the mean - 2SE to the mean + 2SE.

Like the SE, a confidence interval tells us how confident we can be that we have a good estimate of the mean. A narrow confidence interval means that our estimate of the mean is probably pretty good and if we repeated the experiment, we'd get a similar value again. A wide confidence interval means we should NOT be too certain about our mean, and if we repeated the experiment we'd likely get a pretty different result.

95% confidence intervals allow us to do a bit of statistics just by eye. When you have the means of 2 groups and their error bars only overlap a bit, statisticians would say the groups are "significantly different" from each other. When there is a significant statistical difference between 2 groups it lends support to the argument that there are relevant biological differences between 2 groups. However, if there is a lot of overlap between the confidence intervals of 2 means, the difference between means is considered non-significant.

Example 95% CI calculation:

Control upper 95% CI = mean + 2 x SE \newline = 0.95 + 2 x 0.044 \newline = 0.95 + 0.088

So the 95% confidence interval extends up from the mean to 1.038

[ ] TASK: Calculate the 95% Confidence Interval (CI) for the 2 treatments using the standard error (SE) and mean provided.

Enter your results into the table on the attached worksheet.

[ ] TASK: Use the mean & 95% confidence interval limits you just calculated to sketch an Error Plot on the attached worksheet.

Scatter plots

So far we have looked at a numeric variable (telomere length) split up into 2 groups. Often we want to examine the relationship between 2 numeric variables. This can be done with a scatterplots. Scatterplots can give you insight into how 2 variables are related.

[ ] TASK: Do you think there is a correlation between the 2 variables in this plot? Is the correlation (relationship) positive (+) or negative (-)?

Look at the graph below and enter your answer into the TopHat assignment.

scatter. <- ggscatter(data = na.omit(dat.foc.cols.rnd),
          y = "telomere.length",
          x = "cort",
          #add = "reg.line",
          #color = white,
          #cor.coef = T,
          title = "Scatter plot: Telomeres vs corticosterone",
          xlab = "Predictor: Blood corticosterone\nconcentration",
          ylab = "Response: Telomere length\n")

scatter.

Be sure to answer any other additional questions on TopHat.


\newpage

Last name:___ First name:__ Email:____

Recitation: 9 am / 10 am / 1 pm / 2 pm

Summary statistics

#Summary stats
summ.tab <- dat.foc.cols.rnd %>% 
  group_by(Treatment) %>% 
  summarise(sample.size = n(),
            min.telomere = min(telomere.length),
            median.telo = median(telomere.length),
            max.telomore = max(telomere.length),
            "outliers?" = "")

summ.tab.blnk <- summ.tab
summ.tab.blnk[,-1] <- "_____"
#output summary stats
pander(summ.tab.blnk)

Tabulate the number of observations per "bin"

## table for tabulation
tabulation.tab <- expand.grid(telomere.bin  = seq(0.7,1.6,0.1),
            N.control = "",
            N.disturbed = "",stringsAsFactors = F)

tabulation.tab[1:3,"N.disturbed"] <- c(0,3,4)
pander(tabulation.tab)

Interpreting boxplots

box. <- ggboxplot(data = dat.foc.cols.rnd,
          y = "telomere.length",
          x = "Treatment",
          #fill = "Sex",
          title = "Boxplot of telomere length",
          xlab= "Predictor: Experimental group",
          ylab= "Response: Telomere length",add = "dotplot")
box.

Building histograms

gg.hist.blank <- gghistogram(data = dat.foc.cols.rnd,
          x = "telomere.length",
          desc_stat = "mean_ci",
          title = "Hist. of __________ treatment \n telomere lengths",
          #fill = "sex",
          xlab  = "X-axis: Variable of interest\nTelomere length in bins",
          ylab = "Y-axis: Number of observations",
          color = "white",
          bins = 30,
          xlim = c(0.7,1.6)) + 
  geom_vline(xintercept = seq(0.7,1.6,0.1)) +
  geom_hline(yintercept = seq(0,10,1)) +
  scale_x_continuous(breaks =  seq(0.7,1.6,0.1)) +
  scale_y_continuous(breaks =  seq(0,10,2)) +
  theme( # text = element_text(size=20),
        axis.text.x = element_text(size = 8)) 

plot_grid(gg.hist.blank,gg.hist.blank)

Calculate the 95% Confidence Interval

#Summary stats
summ.tab2 <- dat.foc.cols.rnd %>% 
  group_by(Treatment) %>% 
  summarise(N = n(),
            mean.telo = min(telomere.length),
            SD = sd(telomere.length),
            SE = std.error(telomere.length),
            "95% CI top" = "",
            "95% CI bottom" = "")
pander(summ.tab2)

N = sample size | SD = standard deviation | SE = standard error | CI = confidence interval

Sketch error plot

error. <- ggerrorplot(data = dat.foc.cols.rnd,
          y = "telomere.length",
          x = "Treatment",
          desc_stat = "mean_ci",
          color = "white"  , 
          title = "Error plot: means & 95% confidence interval",
          xlab= "Predictor: Experimental group\n",
          ylab= "Response: Telomere length\n")

error.


brouwern/sparrowtelomeres documentation built on May 23, 2020, 12:34 a.m.