R/compareTranscripts.R

Defines functions orfSimilarity attrChangeAltSpliced orfDiff

Documented in attrChangeAltSpliced orfDiff orfSimilarity

#' Evaluate changes to ORFs caused by alternative splicing
#' @param orfsX orf information for 'normal' transcripts. Generated by getOrfs()
#' @param orfsY orf information for 'alternative' transcripts. Generated by getOrfs()
#' @param filterNMD filter orf information for transcripts not targeted by nmd first?
#' @param geneSimilarity compare orf to all orfs in gene?
#' @param compareUTR compare UTRs?
#' @param compareBy compare by 'transcript' isoforms or by 'gene' groups
#' @param allORFs orf information for all transcripts for novel sequence comparisons.
#' Generated by getOrfs()
#' @return data.frame with orf changes
#' @export
#' @import stringr
#' @importFrom rtracklayer import
#' @importFrom stats aggregate
#' @family transcript isoform comparisons
#' @author Beth Signal
#' @examples
#'
#' whippetFiles <- system.file("extdata","whippet/",
#' package = "GeneStructureTools")
#' wds <- readWhippetDataSet(whippetFiles)
#' wds <- filterWhippetEvents(wds)
#'
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' transcripts <- gtf[gtf$type=="transcript"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' orfsProteinCoding <- getOrfs(exons[exons$gene_name=="Prex2" &
#' exons$transcript_type=="protein_coding"], BSgenome = g)
#' orfsNMD <- getOrfs(exons[exons$gene_name=="Prex2" &
#' exons$transcript_type=="nonsense_mediated_decay"], BSgenome = g)
#' orfDiff(orfsProteinCoding, orfsNMD, filterNMD=FALSE)
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes="CE",psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, whippetDataSet=wds.exonSkip)
#'
#' orfsSkipped <- getOrfs(ExonSkippingTranscripts[ExonSkippingTranscripts$set=="skipped_exon"],
#' BSgenome = g)
#' orfsIncluded <- getOrfs(ExonSkippingTranscripts[ExonSkippingTranscripts$set=="included_exon"],
#' BSgenome = g)
#' orfDiff(orfsSkipped, orfsIncluded, filterNMD=FALSE)
orfDiff <- function(orfsX,
                    orfsY,
                    filterNMD = TRUE,
                    geneSimilarity = TRUE,
                    compareUTR = TRUE,
                    compareBy="gene",
                    allORFs=NULL){
    if(filterNMD == TRUE){
        orfChanges <- attrChangeAltSpliced(orfsX[which(orfsX$nmd_prob < 0.5),],
                                           orfsY[which(orfsY$nmd_prob < 0.5),],
                                           attribute = "orf_length",
                                           compareBy = compareBy,
                                           useMax = TRUE,
                                           compareUTR = compareUTR)

        orfChanges.filterx <- attrChangeAltSpliced(
            orfsX[which(orfsX$nmd_prob < 0.5),],
            orfsY,
            attribute = "orf_length",
            compareBy = compareBy,
            useMax = TRUE,
            compareUTR = compareUTR)
        orfChanges.filtery <-
            attrChangeAltSpliced(orfsX,
                                 orfsY[which(orfsY$nmd_prob < 0.5),],
                                 attribute = "orf_length",
                                 compareBy = compareBy,
                                 useMax = TRUE,
                                 compareUTR = compareUTR)

        orfChanges.nofilter <-
            attrChangeAltSpliced(orfsX,
                                 orfsY,
                                 attribute = "orf_length",
                                 compareBy = compareBy,
                                 useMax = TRUE,
                                 compareUTR = compareUTR)

        if(nrow(orfChanges) > 0){
            orfChanges$filtered <- "both"
        }
        if(nrow(orfChanges.filterx) > 0){
            orfChanges.filterx$filtered <- "x"
        }
        if(nrow(orfChanges.filtery) > 0){
            orfChanges.filtery$filtered <- "y"
        }
        if(nrow(orfChanges.nofilter) > 0){
            orfChanges.nofilter$filtered <- "none"
        }
        add <- which(!(orfChanges.filterx$id %in% orfChanges$id))
        orfChanges <- rbind(orfChanges, orfChanges.filterx[add,])
        add <- which(!(orfChanges.filtery$id %in% orfChanges$id))
        orfChanges <- rbind(orfChanges, orfChanges.filtery[add,])
        add <- which(!(orfChanges.nofilter$id %in% orfChanges$id))
        orfChanges <- rbind(orfChanges, orfChanges.nofilter[add,])

    }else{
        orfChanges <- attrChangeAltSpliced(orfsX,orfsY,attribute = "orf_length",
                                           compareBy=compareBy,useMax = TRUE,
                                           compareUTR = compareUTR)
        orfChanges$filtered <- FALSE
    }

    hasASidX <- grep("AS", orfsX$id)
    hasLeafIdX <- grep("dnre_", orfsX$id)
    orfsX$spliced_id <- orfsX$gene_id
    orfsX$transcript_id <- orfsX$id
    orfsX$spliced_id[hasASidX] <-
        unlist(lapply(str_split(orfsX$id[hasASidX], " "), '[[', 2))
    orfsX$transcript_id[hasASidX] <-
        unlist(lapply(str_split(orfsX$id[hasASidX], "[+]"), '[[', 1))
    orfsX$spliced_id[hasLeafIdX] <- gsub("dnre_","",
                                         orfsX$spliced_id[hasLeafIdX])
    orfsX$spliced_id[hasLeafIdX] <-
        stringr::str_sub(gsub("\\-[^]]\\:*", ":",
                              paste0(orfsX$spliced_id[hasLeafIdX], ":")), 1, -2)
    hasASidY <- grep("AS", orfsY$id)
    hasLeafIdY <- grep("upre_", orfsY$id)
    orfsY$spliced_id <- orfsY$gene_id
    orfsY$transcript_id <- orfsY$id
    orfsY$spliced_id[hasASidY] <-
        unlist(lapply(str_split(orfsY$id[hasASidY], " "), '[[', 2))
    orfsY$transcript_id[hasASidY] <-
        unlist(lapply(str_split(orfsY$id[hasASidY], "[+]"), '[[', 1))
    orfsY$spliced_id[hasLeafIdY] <- gsub("upre_","",
                                         orfsY$spliced_id[hasLeafIdY])
    orfsY$spliced_id[hasLeafIdY] <-
        stringr::str_sub(gsub("\\-[^]]\\:*", ":",
                              paste0(orfsY$spliced_id[hasLeafIdY], ":")), 1, -2)


    m <- match(paste0(orfsX$spliced_id,"_", orfsX$frame),
               paste0(orfsY$spliced_id,"_",orfsY$frame))
    # check that there are matches if we ignore the frame (i.e 5'utr frame-shifts)
    m.noFrame <- match(paste0(orfsX$spliced_id), paste0(orfsY$spliced_id))
    m[which(is.na(m))] <- m.noFrame[which(is.na(m))]
    if(any(is.na(m))){
        if(!(any(grepl("clu", orfsX$id)) | any(grepl("clu", orfsY$id)))){
            #hasASidX <- grep("[+]", orfsX$id)
            if(length(hasASidX) > 0){
                m2 <- match(paste0(orfsX$transcript_id[hasASidX],"_",
                                   orfsX$frame),
                            paste0(orfsY$transcript_id,"_", orfsY$frame))
                orfsY2 <- orfsY[m2,]
                orfsY2$id <- orfsX$id[hasASidX]
                orfsY2$spliced_id <- orfsX$spliced_id[hasASidX]
                orfsY <- rbind(orfsY, orfsY2)
            }
            #hasASidY <- grep("[+]", orfsY$id)
            if(length(hasASidY) > 0){
                m2 <- match(paste0(orfsY$transcript_id[hasASidY],"_",
                                   orfsY$frame),
                            paste0(orfsX$transcript_id,"_", orfsX$frame))
                orfsX2 <- orfsX[m2,]
                orfsX2$id <- orfsY$id[hasASidY]
                orfsX2$spliced_id <- orfsY$spliced_id[hasASidY]
                orfsX <- rbind(orfsX, orfsX2)
            }

            if(length(hasASidY) == 0 & length(hasASidX) == 0){
                attributeX$id <- attributeX$gene_id
                attributeY$id <- attributeY$gene_id
            }else{
                m2 <- match(paste0(orfsX$spliced_id,"_", orfsX$frame),
                            paste0(orfsY$spliced_id,"_",orfsY$frame))
                orfsX <- orfsX[which(!is.na(m2)),]
                orfsY <- orfsY[m2[which(!is.na(m2))],]
            }
        }
    }

    orfsY$id_with_len <- paste0(orfsY$spliced_id, "_", orfsY$orf_length)
    orfChanges$id_orf_length_y <- paste0(orfChanges$id, "_",
                                         orfChanges$orf_length_bygroup_y)
    my <- match(orfChanges$id_orf_length_y, orfsY$id_with_len)

    orfsX$id_with_len <- paste0(orfsX$spliced_id, "_", orfsX$orf_length)
    orfChanges$id_orf_length_x <- paste0(orfChanges$id, "_",
                                         orfChanges$orf_length_bygroup_x)
    mx <- match(orfChanges$id_orf_length_x, orfsX$id_with_len)

    x <- as.numeric(mapply(function(x,y) orfSimilarity(x,y),
                           orfsX$orf_sequence[mx],
                           orfsY$orf_sequence[my]))

    orfChanges$percent_orf_shared <- x

    maxLength <- apply(orfChanges[,c('orf_length_bygroup_y',
                                     'orf_length_bygroup_x')], 1, max)

    orfChanges$max_percent_orf_shared <-
        (maxLength - abs(orfChanges$orf_length_bygroup_x -
                             orfChanges$orf_length_bygroup_y)) / maxLength

    orfLengthShared <- orfChanges$percent_orf_shared * maxLength

    orfChanges$orf_percent_kept_x <- orfLengthShared /
        orfChanges$orf_length_bygroup_x
    orfChanges$orf_percent_kept_y <- orfLengthShared /
        orfChanges$orf_length_bygroup_y


    if(geneSimilarity == TRUE & !is.null(allORFs)){

        orfChanges$gene_id <- orfsX$gene_id[match(orfChanges$id,
                                                  orfsX$spliced_id)]
        geneMatches <- lapply(orfChanges$gene_id,
                              function(x)
                                  which(!is.na(match(allORFs$gene_id, x))))
        geneMatches <- unlist(geneMatches)

        idMatches <- mapply(function(x,y)
            rep(x,length(y)) ,(1:length(orfChanges$gene_id)),
            geneMatches)

        idMatches.y  <- match(orfChanges$id_orf_length_y[idMatches],
                              orfsY$id_with_len)
        idMatches.x  <- match(orfChanges$id_orf_length_x[idMatches],
                              orfsX$id_with_len)

        orfSimBygene.y <- as.numeric(mapply(function(x,y) orfSimilarity(x,y),
                                            allORFs$orf_sequence[geneMatches],
                                            orfsY$orf_sequence[idMatches.y]))

        orfSimBygene.x <- as.numeric(mapply(function(x,y) orfSimilarity(x,y),
                                            allORFs$orf_sequence[geneMatches],
                                            orfsX$orf_sequence[idMatches.x]))

        orfSimilarity.bygene <-
            data.frame(gene_id=allORFs$gene_id[geneMatches],
                       spliced_id_x=orfsX$spliced_id[idMatches.x],
                       spliced_id_y=orfsY$spliced_id[idMatches.y],
                       similarity_x=orfSimBygene.x,
                       similarity_y=orfSimBygene.y,
                       length_gene=allORFs$orf_length[geneMatches],
                       length_x=orfsX$orf_length[idMatches.x],
                       length_y=orfsY$orf_length[idMatches.y])

        orfSimilarity.bygeneAggX <- aggregate(similarity_x ~ spliced_id_x,
                                              orfSimilarity.bygene, max)
        orfSimilarity.bygeneAggY <- aggregate(similarity_y ~ spliced_id_y,
                                              orfSimilarity.bygene, max)

        orfChanges$gene_similarity_x <-
            orfSimilarity.bygeneAggX$similarity_x[
                match(orfChanges$id, orfSimilarity.bygeneAggX$spliced_id_x)]
        orfChanges$gene_similarity_y <-
            orfSimilarity.bygeneAggY$similarity_y[
                match(orfChanges$id, orfSimilarity.bygeneAggY$spliced_id_y)]

    }

    orfChanges$transcript_id <- NULL
    orfChanges$gene_id <- NULL
    orfChanges$id_orf_length_x <- NULL
    orfChanges$id_orf_length_y <- NULL

    return(orfChanges)
}

#' Evaluate the change in an attribute between a set of 'normal' transcripts and
#' 'alternative' transcripts
#' @param orfsX orf information for 'normal' transcripts. Generated by getOrfs()
#' @param orfsY orf information for 'alternative' transcripts. Generated by getOrfs()
#' @param attribute attribute to compare
#' @param compareBy compare by 'transcript' isoforms or by 'gene' groups
#' @param useMax use max as the summary function when multiple isoforms are aggregated?
#' If FALSE, will use min instead.
#' @param compareUTR compare the UTR lengths between transcripts?
#' Only runs if attribute = "orf_length"
#' @return data.frame with attribute changes
#' @export
#' @import stringr
#' @importFrom rtracklayer import
#' @importFrom stats aggregate
#' @family transcript isoform comparisons
#' @author Beth Signal
#' @examples
#' whippetFiles <- system.file("extdata","whippet/",
#' package = "GeneStructureTools")
#' wds <- readWhippetDataSet(whippetFiles)
#' wds <- filterWhippetEvents(wds)
#'
#' gtf <- rtracklayer::import(system.file("extdata","example_gtf.gtf",
#' package = "GeneStructureTools"))
#' exons <- gtf[gtf$type=="exon"]
#' transcripts <- gtf[gtf$type=="transcript"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' wds.exonSkip <- filterWhippetEvents(wds, eventTypes="CE",psiDelta = 0.2)
#' exons.exonSkip <- findExonContainingTranscripts(wds.exonSkip, exons,
#' variableWidth=0, findIntrons=FALSE, transcripts)
#' ExonSkippingTranscripts <- skipExonInTranscript(exons.exonSkip, exons, whippetDataSet=wds.exonSkip)
#'
#' orfsSkipped <- getOrfs(ExonSkippingTranscripts[ExonSkippingTranscripts$set=="skipped_exon"],
#' BSgenome = g)
#' orfsIncluded <- getOrfs(ExonSkippingTranscripts[ExonSkippingTranscripts$set=="included_exon"],
#' BSgenome = g)
#' attrChangeAltSpliced(orfsSkipped, orfsIncluded,attribute = "orf_length")
attrChangeAltSpliced <- function(orfsX,
                                 orfsY,
                                 attribute = "orf_length",
                                 compareBy = "gene",
                                 useMax = TRUE,
                                 compareUTR = FALSE){

    if(nrow(orfsX) > 0 & nrow(orfsY) > 0){

        if(useMax){
            aggFun <- max
        }else{
            aggFun <- min
        }

        # fix ids so spliced isoforms have same id
        if(all(grepl("[+]",c(orfsX$id, orfsY$id)))){
            asTypes <- unique(unlist(lapply(stringr::str_split(
                lapply(stringr::str_split(
                    c(orfsX$id, orfsY$id), "[+]"
                ), "[[",2), " "),
                "[[",1)))

            for(asT in asTypes){
                orfsX$id <- gsub(asT, "AS", orfsX$id)
                orfsY$id <- gsub(asT, "AS", orfsY$id)
            }
        }

        m <- match(c('id','gene_id', attribute), colnames(orfsX))

        orfsX.part <- orfsX[,m]

        attributeX <- aggregate(. ~ id+gene_id, orfsX.part, aggFun)

        hasASidX <- grep("AS", attributeX$id)
        attributeX$as_group <- attributeX$gene_id
        attributeX$transcript_id <- attributeX$id

        hasLeafIdX <- grep("dnre_", attributeX$id)
        attributeX$id[hasLeafIdX] <-
            gsub("dnre_","",attributeX$id[hasLeafIdX])
        attributeX$id[hasLeafIdX] <-
            stringr::str_sub(gsub("\\-[^]]\\:*", ":",
                                  paste0(attributeX$id[hasLeafIdX], ":")),
                             1, -2)
        attributeX$as_group[hasASidX] <-
            unlist(lapply(str_split(attributeX$id[hasASidX], " "), '[[', 2))

        attributeX$transcript_id[hasASidX] <-
            unlist(lapply(str_split(attributeX$id[hasASidX], "[+]"), '[[', 1))

        m <- match(c('id','gene_id', attribute), colnames(orfsY))
        orfsY.part <- orfsY[,m]
        attributeY <- aggregate(. ~ id+gene_id, orfsY.part, aggFun)

        hasASidY <- grep("AS", attributeY$id)
        attributeY$as_group <- attributeY$gene_id
        attributeY$transcript_id <- attributeY$id

        hasLeafIdY <- grep("upre_", attributeY$id)
        attributeY$id[hasLeafIdY] <-
            gsub("upre_","",attributeY$id[hasLeafIdY])
        attributeY$id[hasLeafIdY] <-
            stringr::str_sub(gsub("\\-[^]]\\:*", ":",
                                  paste0(attributeY$id[hasLeafIdY], ":")),
                             1, -2)
        attributeY$as_group[hasASidY] <-
            unlist(lapply(str_split(attributeY$id[hasASidY], " "), '[[', 2))

        attributeY$transcript_id[hasASidY] <-
            unlist(lapply(str_split(attributeY$id[hasASidY], "[+]"), '[[', 1))

        m <- match(attributeX$id, attributeY$id)
        if(any(is.na(m))){
            #hasASidX <- grep("[+]", attributeX$id)
            # if isoforms are not generated by leafcutter
            if(!(any(grepl("clu", attributeX$id)) |
                 any(grepl("clu", attributeY$id)))){
                if(length(hasASidX) > 0){
                    m2 <- match(attributeX$transcript_id[hasASidX],
                                attributeY$transcript_id)
                    attributeY2 <- attributeY[m2,]
                    attributeY2$id <- attributeX$id[hasASidX]
                    attributeY2$as_group <- attributeX$as_group[hasASidX]
                    attributeY <- rbind(attributeY, attributeY2)
                }
                #hasASidY <- grep("[+]", attributeY$id)
                if(length(hasASidY) > 0){
                    m2 <- match(attributeY$transcript_id[hasASidY],
                                attributeX$transcript_id)
                    attributeX2 <- attributeX[m2,]
                    attributeX2$id <- attributeY$id[hasASidY]
                    attributeX2$as_group <- attributeY$as_group[hasASidY]
                    attributeX <- rbind(attributeX, attributeX2)
                }
                if(length(hasASidY) == 0 & length(hasASidX) == 0){
                    attributeX$id <- attributeX$gene_id
                    attributeY$id <- attributeY$gene_id

                }else{
                    m2 <- match(attributeX$id, attributeY$id)
                    attributeX <- attributeX[which(!is.na(m2)),]
                    attributeY <- attributeY[m2[which(!is.na(m2))],]
                    m <- match(attributeX$id, attributeY$id)
                }
            }
        }

        colnames(attributeX)[3] <- "attr"
        colnames(attributeY)[3] <- "attr"

        if(compareBy == "transcript"){
            m <- match(attributeX$id, attributeY$id)
            attributeComparisons <-
                data.frame(id=attributeX$id,
                           attr_bygroup_x=attributeX$attr,
                           attr_bygroup_y=attributeY$attr[m])

            m <- match(attributeY$id, attributeX$id)
            attributeComparisonsY <-
                data.frame(id=attributeY$id,
                           attr_bygroup_x=attributeX$attr[m],
                           attr_bygroup_y=attributeY$attr)

            add <-
                which(!(attributeComparisonsY$id %in% attributeComparisons$id))

            if(length(add) > 0){
                attributeComparisons <-
                    rbind(attributeComparisons, attributeComparisonsY[add,])
            }
        }else if(compareBy=="gene"){
            attributeX2 <- aggregate(attr ~ as_group, attributeX, aggFun)
            attributeY2 <- aggregate(attr ~ as_group, attributeY, aggFun)

            m <- match(attributeX2$as_group, attributeY2$as_group)
            attributeComparisons <- data.frame(id=attributeX2[,1],
                                               attr_bygroup_x=attributeX2[,-1],
                                               attr_bygroup_y=attributeY2[m,-1])
            attributeComparisons <- attributeComparisons[which(!is.na(m)),]

        }
        colnames(attributeComparisons) <- gsub("attr",
                                               attribute,
                                               colnames(attributeComparisons))

        if(compareUTR == TRUE & attribute == "orf_length"){
            if(all(grepl("AS", orfsX$id))){
                id.x <- paste0(attributeComparisons$id,"_" ,
                               attributeComparisons$orf_length_bygroup_x)

                hasLeafIdX <- grep("dnre_", orfsX$id)
                orfsXid <- orfsX$id
                orfsXid[hasLeafIdX] <-
                    stringr::str_sub(gsub("\\-[^]]\\:*", ":",
                                          paste0(orfsXid[hasLeafIdX], ":")),
                                     1, -2)
                orfsXid[hasLeafIdX] <- gsub("dnre_","", orfsXid[hasLeafIdX])

                if(compareBy == "gene"){
                    orfsXid <- unlist(lapply(str_split(orfsXid, " "), "[[", 2))
                    m1 <- match(id.x, paste0(orfsXid,
                                             "_", orfsX$orf_length))
                }else{
                    m1 <- match(id.x, paste0(orfsXid, "_", orfsX$orf_length))
                }
            }else{
                matchToGene <- match(attributeComparisons$id,
                                     attributeX$as_group)
                id.x <- paste0(unlist(lapply(str_split(
                    attributeX$id[matchToGene], "[+]"),"[[",1)),"_" ,
                    attributeComparisons$orf_length_bygroup_x)
                m1 <- match(id.x, paste0((orfsX$gene_id), "_",
                                         orfsX$orf_length))
            }

            if(all(grepl("AS", orfsY$id))){
                id.y <- paste0(attributeComparisons$id,"_" ,
                               attributeComparisons$orf_length_bygroup_y)

                hasLeafIdY <- grep("upre_", orfsY$id)
                orfsYid <- orfsY$id
                orfsYid[hasLeafIdY] <-
                    stringr::str_sub(gsub("\\-[^]]\\:*", ":",
                                          paste0(orfsYid[hasLeafIdY], ":")),
                                     1, -2)
                orfsYid[hasLeafIdY] <- gsub("upre_","", orfsYid[hasLeafIdY])

                if(compareBy == "gene"){
                    orfsYid <- unlist(lapply(str_split(orfsYid, " "), "[[", 2))
                    m2 <- match(id.y, paste0(orfsYid,
                                             "_", orfsY$orf_length))
                }else{
                    m2 <- match(id.y, paste0(orfsYid, "_", orfsY$orf_length))
                }
            }else{
                matchToGene <- match(attributeComparisons$id,
                                     attributeY$as_group)
                id.y <- paste0(unlist(lapply(str_split(
                    attributeY$id[matchToGene], "[+]"),"[[",1)),"_" ,
                    attributeComparisons$orf_length_bygroup_y)
                m2 <- match(id.y, paste0((orfsY$gene_id), "_", orfsY$orf_length))
            }

            attributeComparisons$utr3_length_bygroup_x <- orfsX$utr3_length[m1]
            attributeComparisons$utr3_length_bygroup_y <- orfsY$utr3_length[m2]
            attributeComparisons$utr5_length_bygroup_x <-
                orfsX$start_site_nt[m1]
            attributeComparisons$utr5_length_bygroup_y <-
                orfsY$start_site_nt[m2]
        }

        return(attributeComparisons)
    }else{
        blank <- data.frame(id=character(0),
                            attr_bygroup_x=numeric(0),
                            attr_bygroup_y=numeric(0))
        colnames(blank) <- gsub("attr", attribute, colnames(blank))
        if(compareUTR == TRUE){
            blank$utr3_length_bygroup_x <- numeric(0)
            blank$utr3_length_bygroup_y <- numeric(0)
            blank$utr5_length_bygroup_x <- numeric(0)
            blank$utr5_length_bygroup_y <- numeric(0)
        }
        return(blank)
    }
}
#' calculate percentage of orfB contained in orfA
#' @param orfA character string of ORF amino acid sequence
#' @param orfB character string of ORF amino acid sequence
#' @param substitutionCost cost for substitutions in ORF sequences.
#' Set to 1 if substitutions should be weighted equally to insertions and deletions.
#' @return percentage of orfB contained in orfA
#' @export
#' @import stringdist
#' @import stringr
#' @importFrom utils adist
#' @family ORF annotation
#' @author Beth Signal
#' @examples
#' orfSimilarity("MFGLDIYAGTRSSFRQFSLT","MFGLDIYAGTRSSFRQFSLT")
#' orfSimilarity("MFGLDIYAGTRSSFRQFSLT","MFGLDIYAFRQFSLT")
#' orfSimilarity("MFGLDIYAFRQFSLT","MFGLDIYAGTRSSFRQFSLT")
#' orfSimilarity("MFGLDIYAGTRXXFRQFSLT","MFGLDIYAGTRSSFRQFSLT")
#' orfSimilarity("MFGLDIYAGTRXXFSLT","MFGLDIYAGTRSSFRQFSLT", 1)
orfSimilarity <- function(orfA, orfB, substitutionCost=100){
    if(is.na(orfA) | is.na(orfB)){
        return(NA)
    }else{
        if(nchar(orfA) > nchar(orfB)){
            lcs <- utils::adist(orfA, orfB,
                                costs=list(ins=100,
                                           del=1,
                                           sub=substitutionCost))[1,1]
        }else if(nchar(orfA) == nchar(orfB)){
            lcs <- utils::adist(orfA, orfB,
                                costs=list(ins=1,
                                           del=1,
                                           sub=substitutionCost))[1,1]
        }else{
            lcs <- utils::adist(orfA, orfB,
                                costs=list(ins=1,
                                           del=100,
                                           sub=substitutionCost))[1,1]
        }

        if(lcs <= nchar(orfA) + nchar(orfB)){
            return(((nchar(orfA) + nchar(orfB) - lcs)/2) /
                       max(nchar(orfA), nchar(orfB)))
        }else{
            return(0)
        }
    }
}

Try the GeneStructureTools package in your browser

Any scripts or data that you put into this service are public.

GeneStructureTools documentation built on Nov. 8, 2020, 6:04 p.m.