knitr::opts_knit$set( echo = TRUE, root.dir = getwd(), fig.width = 6, fig.height = 5, warning = FALSE, message = FALSE )
htmltools::img( src = knitr::image_uri(file.path(R.home("doc"), "html", "logo.jpg")), alt = "logo", style = "position:absolute; top:0; right:0; padding:10px;" )
This report includes an overview of all the sequences analyzed. For further information for each animal sequence, please check the reports per folder.name.
library(ggplot2) library(scifer)
Here is attached four plots to compare the filtered sequences by the proposed quality requirements, the graphs on the left are before the filtering and the graphs on the right are after filtering. The code was written to filter the best sequences, if they were repeated, it will select the best quality sequence after comparing both sequences quality scores. The y-axis contain quality score similar to Phread Quality Score, which is logarithmically related to the base-calling error probabilities. Thus, a score of 10 represents a basecalling error probability of 1 in 10 (90% accuracy), a quality score of 20, 1 in 100 (99% accuracy) etc.\ The filtering was based on:
The full script of filtering can be found on the Rmd file.
sf <- summarise_quality( folder_sequences=folder_sequences, secondary.peak.ratio=0.33, trim.cutoff=0.01, processors=processors )
if (any(is.null(dim(sf[["summaries"]])))) { print("No files were processed, knitting was stopped early. Please double check the folder location provided for `folder_sequences` with ab1 files.") knitr::knit_exit() } else if (any(is.null(sf[["summaries"]][["raw.length"]])) | any(is.na(sf[["summaries"]][["raw.length"]]))) { print("Columns inside sf_summaries are empty (NA or NULL). Please double check the folder location provided for `folder_sequences` with ab1 files.") knitr::knit_exit() } else if (all(c("raw.legnth", "trimmed.mean.quality", "trim.start") %in% colnames(sf[["summaries"]]))) { print("Columns exist in sf_summaries and are not empty, knitting can proceed.") }
# create same columns for the data frame not filtered sf$summaries <- sf$summaries %>% mutate( plate = folder.name, well = gsub("\\-.*|\\_.*", "", file.name), well_number = gsub("[a-zA-Z]", "", well), well_letter = gsub("[[:digit:]]", "", well), well = paste0(well_letter, sprintf("%02s", well_number)), ID = gsub("\\-.*|\\_.*", "", folder.name), sequence_id = sub("_R", "", paste(plate, well, sep = "_")) ) %>% select(-c(well_number, well_letter)) sf$summaries[is.na(sf$summaries)] <- 0 # filtering the dataset and piping the results to the next filtering (you should change all the parameters depending on your dataset) sf_filtered <- sf[["summaries"]] %>% # Filter per length, trimmed quality position, and overall trimmed quality. filter( raw.length >= raw_length, trim.start <= trim_start & trim.finish >= trim_finish, trimmed.mean.quality >= trimmed_mean_quality ) %>% # group by file names (important if you have repetead sequences with the same name and you want to use the best one among them) group_by(sequence_id) %>% # among the repeated sequences filter the one with the highest mean quality filter(raw.mean.quality == max(raw.mean.quality)) %>% ungroup() # here is the code used to create the listed S4 objects for secondary peaks detection and fasta file creation pathnames <- as.character(sf_filtered$file.path) sangerseqlisted <- sapply(pathnames, sangerseqR::readsangerseq) sp <- lapply(sangerseqlisted, scifer:::secondary_peaks, ratio = 0.33)
# selecting CDR3 and creating a column in the data frame to say the number of secondary peaks in the CDR3 region df <- lapply(sp, function(x) x[["secondary.peaks"]]) df <- lapply(df, function(x) filter(x, position > cdr3_start & position < cdr3_end)) df <- lapply(df, function(x) ifelse(nrow(x) > 0, nrow(x), 0)) df <- tibble(sec.peak.CDR3=as.numeric(as.character(df)), file.path=names(df)) df <- df %>% filter(sec.peak.CDR3 <= 5) sf_filtered <- merge(sf_filtered, df, by.default = file.path) pathnames <- as.character(sf_filtered$file.path) sangerseqlisted <- sapply(pathnames, sangerseqR::readsangerseq)
g1 <- sf[["summaries"]] %>% ggplot(aes(x=folder.name, y=trimmed.mean.quality)) + geom_boxplot() + ylim(0, 60) + ggtitle("Before filtering") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + labs(y = "Trimmed Mean Quality Score", x = NULL) + theme_bw() g2 <- sf_filtered %>% ggplot(aes(x=folder.name, y=trimmed.mean.quality)) + geom_boxplot() + ylim(0, 60) + ggtitle("After filtering") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + labs(y = NULL, x = NULL) + theme_bw() gridExtra::grid.arrange(g1, g2, nrow = 1)
# graph raw vs trimmed mean quality scores before and after filtering g3 <- sf$summaries %>% ggplot(aes( trimmed.mean.quality, raw.mean.quality )) + geom_point() + ylim(0, 60) + xlim(0, 60) + ggtitle("Not filtered") + theme_bw() g4 <- sf_filtered %>% ggplot(aes( trimmed.mean.quality, raw.mean.quality ), label=file.name) + geom_point() + ylim(0, 60) + xlim(0, 60) + ggtitle("Filtered") + theme_bw() gridExtra::grid.arrange(g3, g4, nrow=1)
Here you can see a table containing the number of repeated sequences, total sequences, filtered sequences, and the percentage of the selected sequences out of the total number of unique sequences (total-repeated).It also contain the quality score per plate, and standard deviation, based on the Phred Quality Score explained above.
options(dplyr.summarise.inform=FALSE) # This code is to create a general table containing the quality report per plate # Calculate the mean quality per folder.name folder.name_scores <- sf_filtered %>% group_by(folder.name) %>% summarise( mean.quality = mean(raw.mean.quality), standard.deviation = sd(raw.mean.quality) ) %>% ungroup() # Add the row containing the total, including the mean quality of all the filtered sequences and its standard deviation folder.name_scores <- folder.name_scores %>% add_row( folder.name="Total", mean.quality=mean(sf_filtered$raw.mean.quality), standard.deviation=sd(sf_filtered$raw.mean.quality) ) options(digits = 4) # Calculate the number of repeated sequences per folder.name n.repeated.seq <- sf$summaries %>% group_by(folder.name) %>% filter(stringr::str_detect(folder.name, "_R")) %>% summarise(n_repeated = n()) %>% data.frame() # calculate the total number of sequences per folder.name total.seq <- sf$summaries %>% group_by(folder.name) %>% summarise(n_total = n()) %>% data.frame() # calculate the number of sequences post filtering per folder.name filtered.seq <- sf_filtered %>% group_by(folder.name) %>% summarise(n_filtered = n()) %>% data.frame() # Merge all the different datasets created above x <- merge(n.repeated.seq, total.seq, by.default=folder.name, all=TRUE) x <- merge(x, filtered.seq, by.default = folder.name, all = TRUE) x[is.na(x)] <- 0 # Caculated the percentage of sequences filtered out of the total unique sequences and add the value in a new column (not repeated) y <- x %>% mutate(n_unique = n_total - n_repeated) %>% mutate(used_percentage = (n_filtered / (n_unique)) * 100) # Add row with the total number plates_used_sequences <- y %>% add_row( folder.name = "Total", n_repeated = sum(y$n_repeat), n_total = sum(y$n_total), n_unique = sum(y$n_unique), n_filtered = sum(y$n_filtered), used_percentage = mean(y$used_percentage) ) # By now you have to tables containing different information per plate, here it just merging them into one single table to plot it table.seq <- merge(plates_used_sequences, folder.name_scores, by.default=folder.name) kableExtra::kable(table.seq) %>% kableExtra::kable_styling(bootstrap_options="striped", full_width=FALSE)
# extract primary basecalls to a dataframe sangerbasecall.string <- sapply(sangerseqlisted, sangerseqR::primarySeq, string=TRUE ) sangerbasecall.string <- sangerbasecall.string %>% data.frame() %>% tibble::rownames_to_column() names(sangerbasecall.string)[names(sangerbasecall.string) == "rowname"] <- "file.path" names(sangerbasecall.string)[names(sangerbasecall.string) == "."] <- "sequence" # Merge data frames to add column with the sequence sf_filtered <- merge(sf_filtered, sangerbasecall.string, by.default = file.path)
If the algorithm detected a secondary peak on CDR3 region, in our case between position 100 and 150, it will plot automatically the chromatogram from the CDR3 region. Below you can see a histogram based on the number of secondary peaks detected inside the CDR3 region. The secondary.peak needed to be at least 1/3 the size (ratio = 0.33) of the primary peak to be considered a true secondary peak. You can check the CDR3 chromatograms in the folder called "chromatograms".
# plot number secondary peaks in CDR3 g5 <- sf_filtered %>% ggplot(aes(sec.peak.CDR3 > 0)) + geom_bar() + xlab("Secondary peaks CDR3 > 0") + theme_bw() g6 <- sf_filtered %>% ggplot(aes(sec.peak.CDR3)) + geom_bar(aes(y=(..count..) / sum(..count..))) + scale_y_continuous(labels=scales::percent) + ylab("relative frequencies") + xlab("Number of secondary peaks CDR3") + theme_bw() gridExtra::grid.arrange(g5, g6, nrow = 1)
# Create chromatograms of CDR3 with higher than one peak chromatogram <- sf_filtered %>% filter(sec.peak.CDR3 > 0) %>% select(file.path) name <- merge(sf_filtered, chromatogram, by.default=file.path) name$sequence_id <- sub("HC_", "", name$sequence_id) chromatogramlisted <- sapply(name$file.path, sangerseqR::readsangerseq) dir.create(paste0(output_dir, "/chromatograms"), showWarnings=FALSE, recursive=TRUE) mapply( FUN=scifer:::secondary_peaks, s=chromatogramlisted, file.prefix=name$sequence_id, MoreArgs=list(output.folder = paste0(output_dir, "/chromatograms")) )
All the informations about the sequence primary basecall, quality scores, folder.name, well, plate, secondary peaks etc. can be found in the csv file created inside the "processing" folder. If there is probing with flow cytometry data index files to be integrated, the csv files and fasta will contain that information, if not, it will be ignored or invalid, it will be ignored.
if (is.null(folder_path_fcs)) { process_fcs_data <- FALSE } else if (is.na(folder_path_fcs)) { process_fcs_data <- FALSE } else if (is.numeric((folder_path_fcs))) { process_fcs_data <- FALSE } else if (isFALSE(folder_path_fcs)) { process_fcs_data <- FALSE } else { process_fcs_data <- TRUE }
processed_index <- fcs_processing(folder_path = folder_path_fcs, compensation = compensation, plate_wells = plate_wells, probe1 = probe1, probe2 = probe2, posvalue_probe1 = posvalue_probe1, posvalue_probe2 = posvalue_probe2) fcs_plot(processed_index) processed_index <- processed_index[["processed_fcs"]] %>% mutate(sequence_id = paste(sample_ID, well_ID, sep = "_"), specificity = gsub("\\.", "", specificity)) %>% relocate(sequence_id)
if (process_fcs_data == TRUE) { sf_filtered_csv <- sf_filtered %>% select(file.path, sequence_id, ID, plate, well, raw.mean.quality, trimmed.mean.quality, sec.peak.CDR3, sequence) %>% left_join(processed_index, by = "sequence_id") %>% mutate(sequence_id = paste(sequence_id, specificity, sep = "_")) sf_filtered_csv %>% relocate(sequence_id, ID, plate, well, specificity, raw.mean.quality, trimmed.mean.quality, sec.peak.CDR3, sequence) %>% write.csv(paste0(output_dir, "sanger_filtered.csv"), row.names = FALSE) } else { sf_filtered_csv <- sf_filtered %>% relocate(file.path, sequence_id, ID, plate, well, raw.mean.quality, trimmed.mean.quality, sec.peak.CDR3, sequence) sf_filtered_csv %>% relocate(sequence_id, ID, plate, well, raw.mean.quality, trimmed.mean.quality, sec.peak.CDR3, sequence) %>% write.csv(paste0(output_dir, "sanger_filtered.csv"), row.names = FALSE) }
A fasta file containing all the filtered sequences was created on the folder called "quality reports".
# Fasta file creation df_to_fasta( sequence_name=sf_filtered_csv$sequence_id, sequence_strings=sf_filtered_csv$sequence, file_name="combined_sequences.fasta", output_dir=output_dir )
for (i in unique(sf_filtered$ID)) { rmarkdown::render(system.file("rmd/HC_individualized_report.Rmd", package = "scifer"), # file 2 output_file=paste(Sys.Date(), "_", i, "_report", ".html", sep = ""), output_dir=output_dir ) }
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.