mpmap_teloscript.md
In jtlovell/jtlTools: John's package with some utilities and required pacakges

title: "Telomere candidate gene search" author: "JT Lovell, EV Shakirov" date: "r format(Sys.time(), '%d %B, %Y')" output: html_document: toc: true theme: united fig_width: 8 fig_height: 8

```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)


### 1. Set up the environment
#### Load required packages, install updates from github
```{r loadPackages, warnings = FALSE, results = 'hide'}
rm(list = ls())

pkg<-c("qtlTools","limmaDE2","knitr","data.table","plyr",
       "ggplot2","reshape2","mpMap")
suppressMessages(sapply(pkg, require, character.only = TRUE))

The Arabidopsis magic population data

```{r loadMAGIC,results = 'hide'} setwd("/Users/John/Desktop/magic") load('/Users/John/Desktop/magic/ArabidopsisExample.RData') tel<-read.delim("~/Desktop/magic/telomere.all.txt")

phe<-armpc$pheno phe$SUBJECT.NAME<-rownames(phe) phe<-merge(phe, tel, by = "SUBJECT.NAME", all.x = T) rownames(phe)<-phe$SUBJECT.NAME phe$SUBJECT.NAME<-NULL phe<-phe[match(rownames(armpc$pheno),rownames(phe)),] phe$mean.trf[is.na(phe$mean.trf)]<-mean(phe$mean.trf) armpc$pheno<-phe


#### The Arabidopsis tair 10 gene annotation
```{r loadTAIR, results = 'hide'}
gff<-read.delim("/Users/John/Desktop/juengerLabProjects/telomere_data/TAIR10_GFF3_genes.gff", header=F) 
colnames(gff)<-c("chr","tair.version","type","start","end","unk","strand","unk2","info")
gff<-gff[gff$type == "gene",]
gff<-gff[gff$chr=="Chr5",]

gff<-gff[!duplicated(gff$info),]
gff$geneID<-gsub("Parent=","",gff$info)
gff$geneID<-gsub("ID=","",gff$geneID)
gff$geneID<-substring(gff$geneID,1,9)
gff$info<-as.character(gff$info)

The parental alleles of the 19 genomes

```{r loadAlleles, results = 'hide'} snps<-fread("/Users/John/Desktop/magic/chr5.alleles.txt") snps<-data.frame(snps)


#### The parental expression counts of the 19 genomes
```{r loadCounts, results = 'hide'}
counts<-read.delim("/Users/John/Desktop/juengerLabProjects/telomere_data/counts19genomes.tab", header=T)
colnames(counts)<-gsub("X.fml.ag.raetsch.nobackup.projects.sequencing_runs.A_thaliana_magic.results.alignments.new2_splice.","",colnames(counts))
colnames(counts)<-gsub("R1.4.nodup.","",colnames(counts))
counts<-counts[,c(1, grep("nodup.RPKM",colnames(counts)))]
colnames(counts)<-gsub(".R2.4.nodup.RPKM","",colnames(counts))

The southern blotting candidate gene results

tdna<-read.csv("/Users/John/Desktop/juengerLabProjects/telomere_figures/southernResults.csv",
               stringsAsFactors=F)
tdna$Gene<-toupper(tdna$Gene)

```{r runMpMap, results = 'hide'} s1 <- mpIM(object=armpc, ncov=1, responsename="mean.trf", dwindow=50, window = 50)


```{r}
tmp<-armpc$prob$Chr5
tmp<-ifelse(tmp<0.5, FALSE, TRUE)
tmp[tmp<0.5]<-FALSE
tmp[tmp>=0.5]<-TRUE
tmp<-data.frame(tmp)

wh <- apply(tmp[,grep("MN5_22714627", colnames(tmp))],1, which.max)
als <- as.numeric(sapply(colnames(tmp[,grep("MN5_22714627", colnames(tmp))])[wh], function(x) strsplit(x,".", fixed = T)[[1]][4]))
aln <- rownames(armpc$founders)
names(als)<-names(wh)
alo <- aln[als]
names(alo)<-names(wh)

al.dt <- data.table(id = names(alo), haplotype = alo)
p.dt <- data.table(id = tel$SUBJECT.NAME, phe = tel$mean.trf)
p <- merge(al.dt, p.dt, by = "id")
p$is.sf2 <- p$haplotype == "Sf"

library(lme4)
library(MuMIn)

#Fit Model
m <- lmer(phe ~ 1 + (1|haplotype), data = p)

#Determine R2:
r.squaredGLMM(m) 

m
mnames<-names(armpc$map$Chr5) 
out<-lapply(mnames, function(x){
  dat<-tmp[,grep(x,colnames(tmp))]
  rownames(armpc$founders)[as.numeric(unlist(apply(dat, 1, which)))]
})
out<-do.call(cbind, out)
out<-data.frame(out)
colnames(out)<-mnames
rownames(out)<-rownames(tmp)
pdat<-data.frame(armpc$pheno,allele = out[,"MN5_22714627"])

Print results

```{r printSummary} kable(summary(s1))


#### Plot the results
```{r plotS1}
par(mfrow=c(2,1))
plot(s1)
plot(s1,chr = 5, lodsupport=5)

Get % variance explained by each QTL

dat<-fillmiss(armpc)
map<-dat$map
grid.mar<-unlist(lapply(map, function(x) {
  grid<-seq(from = 0, to = max(x), by = 5)
  out<-sapply(grid, function(y)  names(x)[which.min(abs(x-y))])
  return(out)
  }))

tomat<-apply(dat$finals, 2, function(x) ifelse(x == min(x), -1, ifelse(x == max(x), 1,0)))
amat<-A.mat(tomat[,grid.mar])
y = dat$pheno$mean.trf  


tmp<-do.call(cbind, armpc$prob)
tmp<-ifelse(tmp<0.5, FALSE, TRUE)
tmp[tmp<0.5]<-FALSE
tmp[tmp>=0.5]<-TRUE
tmp<-data.frame(tmp)
qtlposs<-unlist(attr(res$qtl,"index"))
qtlmars<-names(unlist(map))[qtlposs]
qtlmars<-sapply(qtlmars, function(y) strsplit(y, ".", fixed = T)[[1]][2])
out<-lapply(qtlmars, function(x){
  dat<-as.matrix(tmp[,grep(x,colnames(tmp))])
  line = rownames(dat)
  wh = apply(dat, 1, which)
  wh[sapply(wh,length)==0]<-NA
  wh<-as.numeric(unlist(wh))
  rownames(armpc$founders)[wh]
})
out<-do.call(cbind, out)
out<-data.frame(out)
colnames(out)<-qtlmars
rownames(out)<-rownames(tmp)

qtlMar<-out
identical(rownames(tomat), rownames(qtlMar))

y = dat$pheno$mean.trf
adat<-cbind(y, qtlMar)
fit<-aov(y~MN5_22714627, data = adat)
af <- anova(fit)
afss <- af$"Sum Sq"
print(cbind(af,PctExp=afss/sum(afss)*100))

aovnull<-aov(y~1, data = adat)
fixed.matrix<-model.matrix(form, data = qtlMar2)
test.q5<-rrBLUP::mixed.solve(y=y, K = amat,
                              X = fixed.matrix)
print(test.q5$Vu); print(test.q5$Ve)
print(test.q5$Vu/(test.q5$Vu+test.q5$Ve))

form<-as.formula("~ MASC04211")
fixed.matrix<-model.matrix(form, data = qtlMar2)
test.null<-rrBLUP::mixed.solve(y=y, K = amat)
print(test.null$Vu); print(test.null$Ve)
print(test.null$Vu/(test.null$Vu+test.null$Ve))

Pull out means and ses

```{r getMeans} res<-s1$QTLresults est<-do.call(cbind,res$fndrfx) se<-do.call(cbind,res$se) qtl.est<-est[,attr(res$qtl,"index")$Chr5] founder.id<-rownames(armpc$founders) qtl.se<-se[,attr(res$qtl,"index")$Chr5] qtl.estimate<-data.frame(founder.id,qtl.est,qtl.se, stringsAsFactors=F)


#### Plot means and SE
```{r plotEffect}
ggplot(qtl.estimate, aes(x = founder.id, y = qtl.est, fill = qtl.est))+
  geom_bar(stat="identity", width = .5)+ 
  scale_fill_gradient(low = "grey", high = "red", guide = F)+
  theme_jtlbar()+
  geom_errorbar(aes(ymin = qtl.est-qtl.se, ymax = qtl.est+qtl.se), width = 0)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  labs(x = "MAGIC Founder Parent", y = "QTL effect estimate (+/- SE)",
       title = "Chr 5 QTL Effect")

Pull out all genes in the interval

```{r getInterval} wh<-which.max(-log10(summary(s1)$pvalue)) for(i in c(2.5,5,7.5,10)){ interval<-as.numeric(supportinterval(s1, lodsupport=i)$support[,wh]) print(names(s1$map$Chr5[s1$map$Chr5 %in% interval])) }

intervals<-data.frame(level = c(2.5,7.5), lower = c(22415000,22414731), upper = c(23115566,23115566), stringsAsFactors=F) intervals$width = with(intervals, upper-lower) interval.bp<-c(22397775, 23246248)


### 5. Get statistics for expression and mutations of all genes in the interval
```{r compareFunction, echo = F}
# this function takes the gff, snps from parents and expression counts, plus a window
# it parses the files based on where the genes are and calculates various statistics
#########################################################
#########################################################
compareSF2<-function(gff, snps, counts, lowCI, highCI){
  #prepare annotation - subset to region
  gff_drop<-gff[gff$start>lowCI & gff$end<highCI,]
  cat("comparing SF-2 to the other 19 ecotypes for",nrow(gff_drop),"genes\n")
  genos<-colnames(snps)[6:24]
  narrow<-snps[snps$bp>lowCI & snps$bp<highCI,]
  gff_drop$info<-as.character(gff_drop$info)
  cat("calculating the number of SF-2 private alleles in each gene\n")
  out<-apply(narrow[,genos],1,function(x) {
    sfa<-x[14]
    if(sfa==0){
      return(NA)
    }else{
      s<-sum(x[-14]==sfa)
      return(s)
    }
  })
  narrow$propsf2<-out
  narrow$onlysf<-ifelse(narrow$propsf2==0 & !is.na(narrow$propsf2),1,0)
  #count the number of private alleles to sf2
  n.private_sf2<-sapply(unique(gff_drop$info), function(i){
    wind<-c(gff_drop$start[gff_drop$info==i], gff_drop$end[gff_drop$info==i])
    dat<-narrow[narrow$bp>wind[1] & narrow$bp<wind[2],]

    sum(dat$onlysf)
  })
  gff_drop$n.privateSf2<-n.private_sf2

  #pull out the genes in the region
  genes_drop<-gff_drop
  narrowGenes<-genes_drop$geneID
  genes_drop$gene<-genes_drop$geneID

  #subset the expression counts data by the genes
  counts_5<-counts[counts$gene %in% narrowGenes,]

  #calculate statistics about the distribution of counts
  cat("calculating expression differences between SF-2 and the other ecotypes\n")
  exp.test<-apply(counts_5[,-1], 1, function(x) {
    sf2<-x[14]
    other<-x[-14]
    if(sd(x)<0.000001){
      out.t<-NA
    }else{
      out.t<-t.test(other, mu=sf2)$p.value # one sample t-test where there is some variance
    }
    out.percentile<-sum(other<sf2) #n of lines with expression less than sf2
    c(out.t,out.percentile)
  })
  exp.test<-data.frame(t(exp.test))
  colnames(exp.test)<-c("t.test_pvalue","n_ecotypes_belowSF2")
  exp.test$gene<-counts_5$gene
  exp.test$t.test_qvalue<-p.adjust(exp.test$t.test_pvalue, method="bonferroni")
  exp.test<-exp.test[complete.cases(exp.test),]
  out<-merge(genes_drop, exp.test, by="gene")
  return(out)
}
#########################################################
#########################################################

Calculate stats for all genes in the window, comparing SF-2 to all other parents

dropstats<-compareSF2(gff=gff, snps=snps, counts=counts, 
                      lowCI=min(interval.bp)-100, 
                      highCI=max(interval.bp)+100)
dropstats$priv.cat<-with(dropstats,ifelse(n.privateSf2==0,"same","sf2private"))

ggplot(dropstats, aes(x=n_ecotypes_belowSF2, y=(-log10(t.test_pvalue)), col=priv.cat))+
  geom_point()+
  scale_x_continuous("n ecotype with expression less than SF")+
  scale_y_continuous("-log10 Pvalue of 1-sample t-test for mean 18 ecotypes != SF-2")+
  geom_abline(intercept=-log10(.05/nrow(dropstats)), slope=0, col="red", lty=2)+
  ggtitle(paste("drop7.5 LOD window contains",nrow(dropstats),"genes"))+
  theme_bw()
candidateGenes<-dropstats$gene[dropstats$n.privateSf2>0 & dropstats$t.test_qvalue<0.05]
print(candidateGenes)


boxplot(tdna$mean.TRF ~ tdna$Gene, xaxt = "n", range=0, ylab = "% Increase from Wt (Col-0)")


wt<-tdna[tdna$Gene == "WT-COL-0",]
tdna<-tdna[tdna$Gene != "WT-COL-0",]
mean.wt<-mean(wt$mean.TRF)
se.wt<-sd((wt$mean.TRF - mean.wt)/mean.wt)/sqrt(length(wt$mean.TRF))
tdna$prop.wt<-with(tdna, (mean.TRF-mean.wt)/mean.wt)
tdna.mean.se<-ddply(tdna, .(Gene), summarize,
                    mean = mean(prop.wt),
                    sd = sd(prop.wt),
                    se = sd(prop.wt)/sqrt(length(prop.wt)))

tdna2<-merge(tdna.mean.se, with(gff[gff$geneID %in% tdna.mean.se$Gene,], 
                      data.frame(Gene = geneID, pos = start, stringsAsFactors=F)),
                      by = "Gene")
tdna2$order<-order(tdna2$pos)/nrow(tdna2)

p.min<-min(dropstats$start)
p.max<-max(dropstats$start)
d.diff<-p.max-p.min
tdna2$x<-(tdna2$order*d.diff)+p.min


abline(h = -se.wt, lwd = 3, col = rgb(1,0,0,.5))
with(tdna2, plot(x,mean))
with(tdna2, segments(x0 = x, x1=x, y1 = mean-sd, y0 = mean+sd))
tdna2$mean<-tdna2$mean*100
tdna2$sd<-tdna2$sd*100
tdna2$se<-tdna2$se*100

tdna2$mean<-tdna2$mean*100
tdna2$sd<-tdna2$sd*100
tdna$prop.wt<-tdna$prop.wt*100
se.wt<-se.wt*100

cand = unique(tdna$Gene)

par(mar = c(0, 12, 0, 2) + 0.1, mfrow = c(2,1))
boxplot(tdna$prop.wt ~ tdna$Gene, xaxt = "n", range=0, ylab = "% Increase from Wt (Col-0)")
rect(xleft = -1, xright = 22, ytop = se.wt, ybottom = -se.wt, col = rgb(1,0,0,.5),
     border = NA)
abline(h = 0, lwd = 3, col = rgb(1,0,0,1))

with(dropstats, plot(c(min(start), max(start)),c(0,0),type = "l", ylim = c(-.2,1), axes=F,
                     xlab = "Chr 5 physical position (Mbp)",
                     ylab = NA))
with(dropstats, segments(x0 = start,x1=start,y0 = -0.05, y1 = 0.05))
with(dropstats[dropstats$gene %in% unique(tdna$Gene),], 
     points(x = start, y = rep(.6, length(start)), pch = 8, col = "green"))

with(dropstats[dropstats$n.privateSf2>0,], 
     points(x = start, y = rep(.2, length(start)), pch = 1, col = "darkblue", cex = .5))

with(dropstats[dropstats$n.privateSf2>0 & dropstats$t.test_qvalue<=0.05,], 
     points(x = start, y = rep(.6, length(start)), pch = 8, col = "green"))

with(dropstats[dropstats$n.privateSf2==0 & dropstats$t.test_qvalue<0.05,], 
     points(x = start, y = rep(.4, length(start)), pch = 1, col = "darkred", cex = .5))
with(tdna2, segments(x0 = x, x1 = pos, y0 = .7,y1 = .6, lty = 3))

with(tdna2, text(x = x,y = .72, labels = Gene, srt = 90, adj = 0, cex = .8))

axis(1, at = c(22420000,22760000,23100000), labels = c(22.42000,22.76000,23.10000))
axis(2, at = c(0,.2,.4,.6), 
     labels = c("genes",
                ">=1 SF2 Private SNP", 
                "SF2 Expression P <= 0.05",
                "Private SNP and Expression"),
     las = 2, line=NULL)



par(mar = c(5, 4, 4, 2) + 0.1)

oli2<-counts[counts$gene == "AT5G55920",-1]
oli2<-data.frame(parents = names(oli2), exp = as.numeric(oli2), stringsAsFactors = F)
oli2$rank<-rank(oli2$exp)
with(oli2, plot(rank, exp))
with(oli2, text(x = rank, y = exp, label = parents, adj = c(1,1)))

x=sqrt(mean.trf/100)
head(tel)
hist(tel$mean.trf/1000, breaks = 40)
with(tel, hist(sqrt(mean.trf)*mean.trf,mean.trf))
hist(log2(tel$mean.trf/1000), breaks = 40, xaxt = "n", xlab = "telo.length")
axis(1,at = log2(2:8), labels = 2:8)

jtlovell/jtlTools documentation built on Jan. 17, 2024, 2:53 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jtlovell/jtlTools
John's package with some utilities and required pacakges

mpmap_teloscript.md
In jtlovell/jtlTools: John's package with some utilities and required pacakges

2. Load data

The Arabidopsis magic population data

Add the phenotype data to the MAGIC R object

The parental alleles of the 19 genomes

The southern blotting candidate gene results

3. Run mpMap composite interval mapping

Print results

4. Analyze the effect of the focal QTL

Get % variance explained by each QTL

Pull out means and ses

Pull out all genes in the interval

Calculate stats for all genes in the window, comparing SF-2 to all other parents

6. Examine genes in the interval

R Package Documentation

Browse R Packages

We want your feedback!

jtlovell/jtlTools John's package with some utilities and required pacakges

mpmap_teloscript.md In jtlovell/jtlTools: John's package with some utilities and required pacakges

2. Load data

The Arabidopsis magic population data

Add the phenotype data to the MAGIC R object

The parental alleles of the 19 genomes

The southern blotting candidate gene results

3. Run mpMap composite interval mapping

Print results

4. Analyze the effect of the focal QTL

Get % variance explained by each QTL

Pull out means and ses

Pull out all genes in the interval

Calculate stats for all genes in the window, comparing SF-2 to all other parents

6. Examine genes in the interval

R Package Documentation

Browse R Packages

We want your feedback!

jtlovell/jtlTools
John's package with some utilities and required pacakges

mpmap_teloscript.md
In jtlovell/jtlTools: John's package with some utilities and required pacakges