analysis/07_129_CAST_analysis.md

Analyzing 129 and CAST transcripts

library(Biostrings)
Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min

Loading required package: S4Vectors

Loading required package: stats4


Attaching package: 'S4Vectors'

The following object is masked from 'package:utils':

    findMatches

The following objects are masked from 'package:base':

    expand.grid, I, unname

Loading required package: IRanges

Loading required package: XVector

Loading required package: GenomeInfoDb


Attaching package: 'Biostrings'

The following object is masked from 'package:base':

    strsplit
library(Matrix)
Attaching package: 'Matrix'

The following object is masked from 'package:S4Vectors':

    expand
library(ggplot2)
library(tidyr)
Attaching package: 'tidyr'

The following objects are masked from 'package:Matrix':

    expand, pack, unpack

The following object is masked from 'package:S4Vectors':

    expand
library(dplyr)
Attaching package: 'dplyr'

The following objects are masked from 'package:Biostrings':

    collapse, intersect, setdiff, setequal, union

The following object is masked from 'package:GenomeInfoDb':

    intersect

The following object is masked from 'package:XVector':

    slice

The following objects are masked from 'package:IRanges':

    collapse, desc, intersect, setdiff, slice, union

The following objects are masked from 'package:S4Vectors':

    first, intersect, rename, setdiff, setequal, union

The following objects are masked from 'package:BiocGenerics':

    combine, intersect, setdiff, union

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tibble)
library(DECIPHER)
Loading required package: RSQLite

Loading required package: parallel
library(stringr)

Load in data

tx <- readDNAStringSet('../inst/extdata/emase.pooled.targets.fa')
ensembl_ids <- read.csv('../inst/extdata/ensembl_tx_gene.csv')
tx_names <- tx@ranges@NAMES
tx_df <- tx_names |> 
  as.data.frame() |>
  separate(tx_names, into=c('tx', 'strain'), sep='_') |>
  left_join(ensembl_ids, by = c('tx' = 'ensembl_transcript_id'))
# create an index from the row idx 
tx_df <- tx_df |> 
  rownames_to_column(var = 'idx') |>
  pivot_wider(id_cols = c('tx', 'external_gene_name'), names_from = 'strain', values_from='idx')
colnames(tx_df) <- c('tx', 'gene', 's129', 'sCAST')

Xist example

# all Xist transcripts
tx_df |> filter(gene=='Xist')
# A tibble: 2 × 4
  tx                 gene  s129  sCAST 
  <chr>              <chr> <chr> <chr> 
1 ENSMUST00000153883 Xist  4331  119456
2 ENSMUST00000127786 Xist  69052 184177
# align each one
df <- tx_df |> filter(gene=='Xist')
for (i in 1:nrow(df)) {
  test <- pairwiseAlignment(as.character(tx[[as.integer(df$s129[i])]]), as.character(tx[[as.integer(df$sCAST[i])]]))
  s1 <- alignedPattern(test)
  s2 <- alignedSubject(test)
  sdf <- data.frame(
    idx = seq(1,width(s1)),
    s129 = unlist(str_split(as.character(s1), pattern='')),
    sCAST = unlist(str_split(as.character(s2), pattern=''))
  ) |>
    mutate(aligned = ifelse(s129==sCAST, 'lightgray', 'red')) 

  sdf |>
    pivot_longer(cols = c('s129', 'sCAST')) |>
    ggplot(aes(x = idx, y = factor(name))) +
    geom_vline(data= sdf |> filter(aligned=='red') |> unique(),
               aes(xintercept = idx), color = 'red', linewidth=0.5) +
    scale_x_continuous(breaks = c(1, round(width(s1)/4), round(width(s1)/2), round(width(s1)/4*3), width(s1)), labels = c(1, round(width(s1)/4), round(width(s1)/2), round(width(s1)/4*3), width(s1)), limits = c(1, width(s1))) +
    theme_classic() +
    ylab('') +
    xlab('Position') +
    ggtitle(paste0(df[i,1], ', ', df[i,2], ' mismatch: ', nmismatch(test), '; ins: ', as.integer(nindel(test)@insertion[,2]), 'bp; del:', as.integer(nindel(test)@deletion[,2]), 'bp'))
  ggsave(paste0('figures/07_',df[i,2],'_',df[i,1],'.pdf'),height=1, width=8)
}
plot_tx <- function(g) {
  df <- tx_df |> filter(gene==g)
  for (i in 1:nrow(df)) {
    test <- pairwiseAlignment(as.character(tx[[as.integer(df$s129[i])]]), as.character(tx[[as.integer(df$sCAST[i])]]))
    s1 <- alignedPattern(test)
    s2 <- alignedSubject(test)
    sdf <- data.frame(
      idx = seq(1,width(s1)),
      s129 = unlist(str_split(as.character(s1), pattern='')),
      sCAST = unlist(str_split(as.character(s2), pattern=''))
    ) |>
      mutate(aligned = ifelse(s129==sCAST, 'lightgray', 'red')) 

    print(sdf |>
      pivot_longer(cols = c('s129', 'sCAST')) |>
      ggplot(aes(x = idx, y = factor(name))) +
      geom_vline(data= sdf |> filter(aligned=='red') |> unique(),
                 aes(xintercept = idx), color = 'red', linewidth=0.5) +
      scale_x_continuous(breaks = c(1, round(width(s1)/4), round(width(s1)/2), round(width(s1)/4*3), width(s1)), labels = c(1, round(width(s1)/4), round(width(s1)/2), round(width(s1)/4*3), width(s1)), limits = c(1, width(s1))) +
      theme_classic() +
      ylab('') +
      xlab('Position') +
      ggtitle(paste0(df[i,1], ', ', df[i,2], ' mismatch: ', nmismatch(test), '; ins: ', as.integer(nindel(test)@insertion[,2]), 'bp; del:', as.integer(nindel(test)@deletion[,2]), 'bp')))
    ggsave(paste0('figures/07_',df[i,2],'_',df[i,1],'.pdf'),height=1, width=8)
  }
}
plot_tx('Ptgds')

All transcripts

tx_df$nmismatch <- 0
tx_df$ins_bp <- 0
tx_df$del_bp <- 0
for (i in 1:nrow(tx_df)) {
  if (i%%1000 == 0) {
    message(i)
  }
  test <- try(pairwiseAlignment(as.character(tx[[as.integer(tx_df$s129[i])]]), as.character(tx[[as.integer(tx_df$sCAST[i])]])))
  if (is(test, 'try-error')) {
    tx_df$nmismatch[i] <- NA
    tx_df$ins_bp[i] <- NA
    tx_df$del_bp[i] <- NA
  } else {
    tx_df$nmismatch[i] <- nmismatch(test)
    tx_df$ins_bp[i] <- as.integer(nindel(test)@insertion[,2])
    tx_df$del_bp[i] <- as.integer(nindel(test)@deletion[,2])
  }
}
tx_df$width <- width(tx)[as.integer(tx_df$s129)]
saveRDS(tx_df, file = '07_129_CAST_df.rds')
tx_df <- readRDS('../inst/extdata/07_129_CAST_df.rds')

Summary statistics

tx_df |>
  mutate(frac_mismatch = nmismatch/width,
         frac_mismatch_indel = (nmismatch + ins_bp + del_bp)/width) |>
  summary()
      tx                gene               s129              sCAST          
 Length:115125      Length:115125      Length:115125      Length:115125     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character




   nmismatch           ins_bp            del_bp           width       
 Min.   :  0.000   Min.   :  0.000   Min.   : 0.000   Min.   :     9  
 1st Qu.:  1.000   1st Qu.:  0.000   1st Qu.: 0.000   1st Qu.:   549  
 Median :  4.000   Median :  0.000   Median : 0.000   Median :   945  
 Mean   :  8.383   Mean   :  1.959   Mean   : 1.699   Mean   :  1694  
 3rd Qu.: 11.000   3rd Qu.:  1.000   3rd Qu.: 1.000   3rd Qu.:  2381  
 Max.   :301.000   Max.   :143.000   Max.   :90.000   Max.   :106824  
 NA's   :7         NA's   :7         NA's   :7                        
 frac_mismatch      frac_mismatch_indel
 Min.   :0.000000   Min.   :0.000000   
 1st Qu.:0.001625   1st Qu.:0.002034   
 Median :0.003945   Median :0.005038   
 Mean   :0.005411   Mean   :0.007576   
 3rd Qu.:0.007353   3rd Qu.:0.009804   
 Max.   :0.166667   Max.   :0.553571   
 NA's   :7          NA's   :7
tx_df |>
  ggplot(aes(x=nmismatch/width)) +
  geom_histogram(bins=50) +
  theme_minimal() +
  xlab('n SNPs / transcript length')
Warning: Removed 7 rows containing non-finite values (`stat_bin()`).

tx_df |>
  ggplot(aes(x=(nmismatch+ins_bp+del_bp)/width)) +
  geom_histogram(bins=50) +
  theme_minimal() +
  xlab('(n SNPs + indel bp) / transcript length')
Warning: Removed 7 rows containing non-finite values (`stat_bin()`).

# num transcripts with indels
tx_df |>
  mutate(indel = ifelse(ins_bp>0 | del_bp>0, 'indel', 'no indel')) |>
  group_by(indel) |>
  summarise(n=n())
# A tibble: 3 × 2
  indel        n
  <chr>    <int>
1 indel    46454
2 no indel 68664
3 <NA>         7

Plot number of reads by SNPs

# read in visium cerebellum data
cere4 <- readRDS('results/rctd_cere_4_visium.rds')
gene_counts <- rowSums(cere4@spatialRNA@maternalCounts)+rowSums(cere4@spatialRNA@paternalCounts)
gene_df <- data.frame(gene = names(gene_counts), count = gene_counts)
tx_df <- left_join(tx_df, gene_df, by = 'gene')
dd <- tx_df |>
  group_by(gene) |>
  slice_max(nmismatch, n=1, with_ties = F)

mycor <- cor(log2(dd$nmismatch+1), log2(dd$count+1), use='pairwise.complete')

dd |>
  filter(!is.na(count)) |>
  mutate(has_indel = ifelse(ins_bp>0 | del_bp>0, 'Has indel (46,454)', 'No indel (68,664)')) |>
  filter(!is.na(has_indel)) |>
  ggplot(aes(x = log2(nmismatch+1), y = log2(count+1))) +
  geom_point(alpha=0.05) +
  geom_smooth(method='lm') +
  facet_wrap(has_indel ~ .) +
  theme_classic() +
  xlab('log2(n mismatches + 1)') 
`geom_smooth()` using formula = 'y ~ x'

ggsave('figures/07_129_CAST_nmismatch_vs_count.pdf', height=3, width=6)
`geom_smooth()` using formula = 'y ~ x'

Total number of SNPs in non-zero count genes

dd |>
  filter(count > 0)  |>
  ungroup() |>
  summarise(nsnps = sum(nmismatch, na.rm=T))
# A tibble: 1 × 1
   nsnps
   <dbl>
1 104408


lulizou/spASE documentation built on May 22, 2024, 5:24 a.m.