library(rtracklayer)
library(ShortRead)
library(Biostrings)
library(BSgenome)
## you know what I give up on Bioc and try to go tidy for bio too...
library(biobroom)
library(tidyverse)
## to create the example data
EfalGenome <- readDNAStringSet("/home/ele/ToxoDB-46_EfalciformisBayerHaberkorn1970_Genome.fasta")
EfalAnnotation <- import.gff("/home/ele/ToxoDB-46_EfalciformisBayerHaberkorn1970.gff")
## EfalAnnotationTxDb <- makeTxDbFromGFF("/home/ele/ToxoDB-46_EfalciformisBayerHaberkorn1970.gff")
save(EfalGenome, file="data/EfalGenome.rda", compress='xz')
save(EfalAnnotation, file="data/EfalAnnotation.rda", compress='xz')
## From the original transcriptome work on A. crassus
## if(!exists("VCF")){
## source("scripts/3_SNP_analysis.R")
## }
EfalGenomeNamed <- EfalGenome
names(EfalGenomeNamed) <- gsub("(.*?) .*", "\\1", names(EfalGenomeNamed))
EfalCDS <- subset(EfalAnnotation, type%in%"CDS")
EfalCDSseq <- getSeq(EfalGenomeNamed, EfalCDS)
EfalCDSParented <- as.factor(unlist(EfalCDS@elementMetadata$Parent))
EfalCDSStrand <- as.factor(EfalCDS@strand)
EfalCDSTranscriptStrands <- by(EfalCDSStrand, EfalCDSParented, function(x) unique(as.character(x)))
EfalCDSTranscripts <- by(EfalCDSseq, EfalCDSParented, function(x) do.call(c, x))
transONT <- lapply(seq_along(EfalCDSTranscriptStrands), function(i) {
if(EfalCDSTranscriptStrands[[i]]=="-"){
transcript <- reverseComplement(EfalCDSTranscripts[[i]])
} else {
transcript <- EfalCDSTranscripts[[i]]
}
tran <- as.character(transcript)
codons <- substring(tran,
seq(1, nchar(tran), by=3),
seq(3, nchar(tran), by=3))
ont <- base.ontology.encode(tolower(codons))
otl <- paste0(ont, collapse="")
if(EfalCDSTranscriptStrands[[i]]=="-"){
reverse(otl)
} else {
otl
}
})
## That's a complete mess...
## we'd now need to split again by cds then replace the cds in the genome...
## by(EfalCDSseq, EfalCDSParented, function (x) cumsum(width(x)))
cat("foo")
cat("bar")
## ## BASE ONTOLOGY
## #
## # Code from Mark Blaxter, modified by John Davey,
## # Translated from Perl to R by Emanuel Heitlinger:
## # A phase 1 any change is nonsynonymous
## # B phase 2 any change is nonsynonymous
## # C phase 3 any change is nonsynonymous
## # D phase 1 change to CT is nonsynonymous
## # E phase 2 change to CT is nonsynonymous
## # F phase 3 change to CT is nonsynonymous
## # G phase 1 change to AG is nonsynonymous
## # H phase 2 change to AG is nonsynonymous
## # I phase 3 change to AG is nonsense
## # K phase 1 change to GT is nonsynonymous
## # L phase 2 change to A is nonsense, to anything else is nonsynonymous
## # J phase 3 change to G is nonsynonymous
## # M phase 3 change to G is nonsense, to A is nonsynonymous
## # N phase 3 any change synonymous
## # O phase 1 change to T nonsense, others nonsynonymous
## # P phase 3 change to AG is nonsynonymous
## # Q phase 1 change to T nonsense, to G nonsynonymous
## # R phase 2 change to AG nonsense, others nonsynonymous
## # S phase 3 change to A nonsense, others nonsynonymous
## # T phase 3 change to A nonsense, G nonsynonymous
## # W all changes are unknown # EH added 08/23/2011
## # a g c t
## #
## # a aaa K OBF aga R QBF aca T ABN ata I ABJ
## # aag K OBF agg R KBF acg T ABN atg M ABC
## # aac N ABP agc S ABP acc T ABN atc I ABJ
## # aat N ABP agt S ABP act T ABN att I ABJ
## #
## # g gaa E OBF gga G OBN gca A ABN gta V ABN
## # gag E OBF ggg G ABN gcg A ABN gtg V ABN
## # gac D ABP ggc G ABN gcc A ABN gtc V ABN
## # gat D ABP ggt G ABN gct A ABN gtt V ABN
## #
## # c caa Q OBF cga R QBN cca P ABN cta L GBN
## # cag Q OBF cgg R KBN ccg P ABN ctg L GBN
## # cac H ABP cgc R ABN ccc P ABN ctc L ABN
## # cat H ABP cgt R ABN cct P ABN ctt L ABN
## #
## # t taa * AEF tga * AEC tca S ARN tta L GRF
## # tag * ABF tgg W ALS tcg S ALN ttg L GLF
## # tac Y ABI tgc C ABT tcc S ABN ttc F ABP
## # tat Y ABI tgt C ABT tct S ABN ttt F ABP
base.ontology.encode <- function(x){
## Set up Base Ontology vector
base.ontology.encode.string = c(
"aaa" = "OBF",
"aag" = "OBF",
"aac" = "ABP",
"aat" = "ABP",
"aga" = "QBF",
"agg" = "KBF",
"agc" = "ABP",
"agt" = "ABP",
"aca" = "ABN",
"acg" = "ABN",
"acc" = "ABN",
"act" = "ABN",
"ata" = "ABJ",
"atg" = "ABC",
"atc" = "ABJ",
"att" = "ABJ",
##
"gaa" = "OBF",
"gag" = "OBF",
"gac" = "ABP",
"gat" = "ABP",
"gga" = "OBN",
"ggg" = "ABN",
"ggc" = "ABN",
"ggt" = "ABN",
"gca" = "ABN",
"gcg" = "ABN",
"gcc" = "ABN",
"gct" = "ABN",
"gta" = "ABN",
"gtg" = "ABN",
"gtc" = "ABN",
"gtt" = "ABN",
##
"caa" = "OBF",
"cag" = "OBF",
"cac" = "ABP",
"cat" = "ABP",
"cga" = "QBN",
"cgg" = "KBN",
"cgc" = "ABN",
"cgt" = "ABN",
"cca" = "ABN",
"ccg" = "ABN",
"ccc" = "ABN",
"cct" = "ABN",
"cta" = "GBN",
"ctg" = "GBN",
"ctc" = "ABN",
"ctt" = "ABN",
##
"taa" = "AEF",
"tag" = "ABF",
"tac" = "ABI",
"tat" = "ABI",
"tga" = "AEC",
"tgg" = "ALS",
"tgc" = "ABT",
"tgt" = "ABT",
"tca" = "ARN",
"tcg" = "ALN",
"tcc" = "ABN",
"tct" = "ABN",
"tta" = "GRF",
"ttg" = "GLF",
"ttc" = "ABP",
"ttt" = "ABP",
##
## bad strings
"x"="W",
"xx"="WW",
"xxx"="WWW"
);
## a replace for iupac and other "bad" bases
tr <- function (y) paste0(rep("x", times=nchar(y)), collapse="")
offenders <- which(nchar(gsub("[^agct]", "", x))!=3)
x[offenders] <- sapply(x[offenders], tr)
## do the magic
x <- unlist(x)
base.ontology.encode.string[x]
}
base.ontology.decode = list(
"A" = c("A" = "Nonsynonymous", "C" = "Nonsynonymous",
"G" = "Nonsynonymous", "T" = "Nonsynonymous" ),
"B" = c("A" = "Nonsynonymous", "C" = "Nonsynonymous",
"G" = "Nonsynonymous", "T" = "Nonsynonymous" ),
"C" = c( "A" = "Nonsynonymous", "C" = "Nonsynonymous",
"G" = "Nonsynonymous", "T" = "Nonsynonymous" ),
"D" = c( "A" = "Synonymous", "C" = "Nonsynonymous",
"G" = "Synonymous", "T" = "Nonsynonymous" ),
"E" = c( "A" = "Synonymous", "C" = "Nonsynonymous",
"G" = "Synonymous", "T" = "Nonsynonymous" ),
"F" = c( "A" = "Synonymous", "C" = "Nonsynonymous",
"G" = "Synonymous", "T" = "Nonsynonymous" ),
"G" = c( "A" = "Nonsynonymous", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Synonymous" ),
"H" = c( "A" = "Nonsynonymous", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Synonymous" ),
"I" = c( "A" = "Nonsense", "C" = "Synonymous",
"G" = "Nonsense", "T" = "Synonymous" ),
"J" = c( "A" = "Synonymous", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Synonymous" ),
"K" = c( "A" = "Synonymous", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Nonsynonymous" ),
"L" = c( "A" = "Nonsense", "C" = "Nonsynonymous",
"G" = "Nonsynonymous", "T" = "Nonsynonymous" ),
"M" = c( "A" = "Nonsynonymous", "C" = "Synonymous",
"G" = "Nonsense", "T" = "Synonymous" ),
"N" = c( "A" = "Synonymous", "C" = "Synonymous",
"G" = "Synonymous", "T" = "Synonymous" ),
"O" = c( "A" = "Nonsynonymous", "C" = "Nonsynonymous",
"G" = "Nonsynonymous", "T" = "Nonsense" ),
"P" = c( "A" = "Nonsynonymous", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Synonymous" ),
"Q" = c( "A" = "Synonymous", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Nonsense" ),
"R" = c( "A" = "Nonsense", "C" = "Nonsynonymous",
"G" = "Nonsense", "T" = "Nonsynonymous" ),
"S" = c( "A" = "Nonsense", "C" = "Nonsynonymous",
"G" = "Nonsynonymous", "T" = "Nonsynonymous" ),
"T" = c( "A" = "Nonsense", "C" = "Synonymous",
"G" = "Nonsynonymous", "T" = "Synonymous" ),
"X" = c( "A" = "Nonsense", "C" = "Nonsense",
"G" = "Nonsense", "T" = "Nonsense" ),
"W" = c("A" = NA, "C" = NA,
"G" = NA, "T" = NA , "R" = NA,
"Y" = NA, "S" = NA, "W" = NA,
"K" = NA, "M" = NA, "B" = NA,
"D" = NA, "H" = NA, "V" = NA,
"N" = NA, "X" = NA),
"Y" = c("A" = "low coverage", "C" = "low coverage",
"G" = "low coverage", "T" = "low coverage" , "R" = "low coverage",
"Y" = "low coverage", "S" = "low coverage", "W" = "low coverage",
"K" = "low coverage", "M" = "low coverage", "B" = "low coverage",
"D" = "low coverage", "H" = "low coverage", "V" = "low coverage",
"N" = "low coverage", "X" = "low coverage"),
"Z" = c("A" = "outside ORF", "C" = "outside ORF",
"G" = "outside ORF", "T" = "outside ORF" , "R" = "outside ORF",
"Y" = "outside ORF", "S" = "outside ORF", "W" = "outside ORF",
"K" = "outside ORF", "M" = "outside ORF", "B" = "outside ORF",
"D" = "outside ORF", "H" = "outside ORF", "V" = "outside ORF",
"N" = "outside ORF", "X" = "outside ORF")
);
get.coding.seq <- function(transcript, start, end, strand){
coding <- tolower(substr(transcript, as.numeric(start),
as.numeric(end)))
if(nchar(coding)%%3!=0){
warning("coding region has not multiple of 3 lenght:\n",
transcript, "\tstart:", start,
"\tend:", end, "\tstrand:", strand)
}
if(strand%in%"-"){
coding <- revcom(coding) # revcom on the minus strand
}
return(coding)
}
get.ontology <- function(transcript, start, end, strand) {
utr1 <- substr(transcript, 1,
as.numeric(start) - 1)
utr2 <- substr(transcript,
as.numeric(end)+1,
nchar(transcript))
coding <- get.coding.seq(transcript, start, end, strand)
## split the cds by 3 and get the base ontology for each codon
codons <- substring(coding,
seq(1, nchar(coding), by=3),
seq(3,nchar(coding), by=3))
ont <- lapply(codons, base.ontology.encode)
ontology <- paste(ont, collapse="")
utr1 <- gsub("\\w", "Z", utr1)
utr2 <- gsub("\\w", "Z", utr2)
if(strand%in%"+"){
return(paste(utr1, ontology, utr2, sep=""))
}
### Need to strReverse the wrong way round cds
if(strand%in%"-"){
return(paste(utr1, strReverse(ontology), utr2, sep=""))
}
}
cds.gff.df$base.ontology <-
as.character(apply(cds.gff.df, 1, function (x) {
get.ontology(x["transcript"], x["start"],
x["end"], x["strand"])}))
## all ontologies have the right length
table(nchar(as.character(cds.gff.df$transcript))==
nchar(as.character(cds.gff.df$base.ontology)))
VCF$SNP <- rownames(VCF)
VAR <- merge(cds.gff.df, VCF, by.x = "seqnames", by.y = "V1")
VAR$transcript <- as.character(VAR$transcript)
transversion.transition <- function (from, to){
get.trans <- function (x) {
trans <- summary(as.factor(from):as.factor(to))
trans <- trans[trans!=0]
}
transf <- get.trans(VARobj)
get.vers <- function (x){
transitions <- c("A:G", "G:A", "C:T", "T:C")
sition <- sum(x[names(x)%in%transitions])
version <- sum(x[!names(x)%in%transitions])
res <- cbind(transitions=sition, transversions=version, ratioTS.TV=sition/version)
return(as.data.frame(res))
}
get.vers(transf)
}
transversion.transition(VAR$V4, VAR$V5)
get.effect <- function (ontology, base, to, strand){
if(strand%in%"-"){
to <- revcom(to)
}
code <- unlist(strsplit(ontology, ""))[[base]]
base.ontology.decode[[code]][to]
}
VAR$effect <- apply(VAR, 1, function(x){
get.effect(as.character(x["base.ontology"]),
as.numeric(x["V2"]),
as.character(x["V5"]),
as.character(x["strand"]))
})
tapply(VAR$effect, as.character(VAR$strand), table)
get.sites <- function (ontology, transcript){
s.sites <- sapply(1:length(ontology), function (i) {
split.ont <- unlist(strsplit(ontology[[i]], ""))
split.cod <- unlist(strsplit(transcript[[i]], ""))
decoded <- lapply(split.ont, function (a){
base.ontology.decode[[a]]})
reduced.decoded <- lapply(1:length(decoded), function (x) {
subset(decoded[[x]], names(decoded[[x]])!=toupper(split.cod[[x]]))})
s <- lapply(reduced.decoded, function (w) {
length(w[grepl("Syn", w)])/length(w[grepl("Syn|Non", w)])})
n <- lapply(reduced.decoded, function (w) {
length(w[grepl("Non", w)])/length(w[grepl("Syn|Non", w)])})
nsyn.sites <- sum(unlist(n), na.rm=TRUE)
syn.sites <- sum(unlist(s), na.rm=TRUE)
cbind(nsyn.sites, syn.sites)
})
data.frame(t(s.sites))
}
sites <- get.sites(VAR[!duplicated(VAR$seqnames), "base.ontology"],
VAR[!duplicated(VAR$seqnames), "transcript"])
rownames(sites) <- VAR$seqnames[!duplicated(VAR$seqnames)]
names(sites) <- c("nsyn.sites", "syn.sites")
VAR <- merge(VAR, sites, by.x = "seqnames", by.y = 0)
get.pn.ps <- function(VARobj){
npSNP <- nrow(VARobj)
uni <- !duplicated(as.character(VARobj[,"seqnames" ]))
pN <- nrow(VARobj[grepl("Non*", VARobj$effect),])/
sum(as.numeric(VARobj[uni, "nsyn.sites"]), na.rm=T)
pS <- nrow(VARobj[VARobj$effect=="Synonymous",])/
sum(as.numeric(VARobj[uni, "syn.sites"]), na.rm=T)
pNpS <- pN/pS
return(list(npSNP=npSNP, pN=pN, pS=pS, pNpS=pNpS))
}
pn.ps.overall <- get.pn.ps(VAR)
## use only SNPs which are ther in at least 3 genotypes
VAR.div <- VAR[VAR$SNP%in%rownames(GT[rowSums(GT,na.rm = TRUE)>2, ]),]
get.pn.ps(VAR.div)
pn.ps.list <- by(VAR, VAR$seqnames, function (x){
list(get.pn.ps(x))
})
pn.ps.list.div <- by(VAR.div, VAR.div$seqnames, function (x){
list(get.pn.ps(x))
})
pn.ps.frame <- do.call(rbind, pn.ps.list)
pn.ps.frame <- as.data.frame(apply(pn.ps.frame, 2, unlist))
## pn.ps.frame.div <- do.call(rbind, pn.ps.list.div)
## pn.ps.frame.div <- as.data.frame(apply(pn.ps.frame.div, 2, unlist))
pn.ps.frame$pNpS[is.infinite(pn.ps.frame$pNpS)] <- 7
pn.ps.frame$pNpS[is.nan(pn.ps.frame$pNpS)] <- 0
pn.ps.frame <- pn.ps.frame[!is.na(pn.ps.frame$npSNP),]
pn.ps.transset <- rownames(subset(pn.ps.frame, pNpS>1 & npSNP>3))
pn.ps.geneset <- gsub("_seq\\d+$", "", pn.ps.transset)
## pn.ps.transset.div <- rownames(subset(pn.ps.frame.div, pNpS>1&npSNP>3))
## pn.ps.geneset.div <- gsub("_seq\\d+$", "", pn.ps.transset.div)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.