R/PLAYRDesign.R

#' @export
PLAYRDesign.run <- function()
{
      runApp(appDir = file.path(system.file(package = "PLAYRDesign"), "shinyGUI"), launch.browser=T)
}

#' @export
PLAYRDesign.filter_refseq_file <- function(in_name, out_name)
{
      db <- readDNAStringSet(in_name)
      sel <- grepl("NM_|NR_", names(db))
      db <- db[sel]
      writeXStringSet(db, file = out_name, format = "fasta", width = 80)
}

#' @export
PLAYRDesign.convert_est_to_RData <- function(in_name, out_name)
{
      tab <- read.table(in_name, header = T, sep = "\t", comment.char = "", stringsAsFactors = F)
      names(tab) <- gsub("X\\.", "", names(tab))
      ret <- GRanges(seqnames = Rle(tab$tName), ranges = IRanges(start = tab$tStart, end = tab$tEnd), 
                     strand = Rle(tab$strand))
      
      mcols(ret) <- tab[, c("blockSizes", "tStarts")]
      names(ret) <- tab$qName
      saveRDS(ret, out_name)
}


my_load <- function(f_name)
{
      con <- file(f_name, "rb")
      retval <- unserialize(con)
      close(con)
      return(retval)
}









sort_primer_pairs <- function(tab)
{
      limits <- ddply(tab, ~id, function(x) {return(data.frame(start = min(x$start), end = max(x$start + x$len)))})
      ir <- IRanges(limits$start, limits$end)
      limits <- data.frame(id = limits$id, pair_row_id = disjointBins(ir) - 1)
      res <- merge(tab, limits)
      return(res)
}

select_primer3_output_lines <- function(s)
{
      v <- sapply(s, function(x) {return(x[[1]])})
      v <- strsplit(v, "_")
      i <- sapply(v, function(x) {length(x) > 1 && x[[2]] %in% c("PAIR", "LEFT", "RIGHT") && grepl("^[[:digit:]]*$", x[3])})
      return(s[i])
}

revcomp <- function(s)
{
      x <- DNAStringSet(s)
      return(as.character(reverseComplement(x)))
      
}


write_selected_oligos <- function(tab, selected_oligos, playr_system, f_name, start_id, gene_name)
{
      playr.systems <- read.table(system.file("PLAYR_Systems.txt", package = "PLAYRDesign"), 
                                header = T, sep = "\t", quote = "", stringsAsFactors = F)
      playr.systems <- playr.systems[playr.systems$Name == playr_system,]
      
      ret <- NULL      
      id <- start_id
      for(i in selected_oligos)
      {
            temp <- strsplit(i, "_")[[1]]
            m <- NULL
            if(length(temp) == 1)
            {
                  m <- tab[tab$id == i,]
            }
            else
            {
                  m <- tab[tab$unique_id %in% temp, ]
                  if(m[1, "start"] < m[2, "start"])
                        v <- c("LEFT", "RIGHT")
                  else
                        v <- c("RIGHT", "LEFT")
                  m$type <- v
            }
            R <- m$type == "RIGHT"
            L <- m$type == "LEFT"
            
            ret <- rbind(ret, data.frame(Name = sprintf("%s_%d_%s", gene_name, id, playr_system), 
                                         Sequence = paste(revcomp(m[L, "sequence"]), playr.systems[playr.systems$Type == "PLAYR2", "Sequence"], sep = ""),  m[L, c("tm", "start", "len")]))
            id <- id + 1
            
            #When parsing primer3 we reverse complement the right primer. Therefore it needs
            #to be reverse complemented here too.
            ret <- rbind(ret, data.frame(Name = sprintf("%s_%d_%s", gene_name, id, playr_system), 
                                         Sequence = paste(revcomp(m[R, "sequence"]), playr.systems[playr.systems$Type == "PLAYR1", "Sequence"], sep = ""), m[R, c("tm", "start", "len")]))
            id <- id + 1      
      }
      print(ret)
      ret <- data.frame(ret)
      names(ret)[1:2] <- c("Name", "Sequence")
      write.table(ret, f_name, row.names = F, quote = F, sep = "\t")
}

write_selected_oligos_old <- function(tab, playr_system, f_name, start_id, gene_name)
{
      playr.systems <- read.table(system.file("PLAYR_Systems.txt", package = "PLAYRDesign"), 
                                header = T, sep = "\t", quote = "", stringsAsFactors = F)
      playr.systems <- playr.systems[playr.systems$Name == playr_system,]
      
      x <- DNAStringSet(tab[tab$type == "LEFT", "sequence"])
      x <- as.character(reverseComplement(x))
      tab[tab$type == "LEFT", "sequence"] <- x
      ret <- NULL      
      id <- start_id
      for(i in unique(tab$id))
      {
            m <- tab[tab$id == i,]
            ret <- rbind(ret, c(sprintf("%s_%d_%s", gene_name, id, playr_system), 
                                paste(m[m$type == "RIGHT", "sequence"], playr.systems[playr.systems$Type == "PLAYR1", "Sequence"], sep = "")))
            id <- id + 1
            ret <- rbind(ret, c(sprintf("%s_%d_%s", gene_name, id, playr_system), 
                                paste(m[m$type == "LEFT", "sequence"], playr.systems[playr.systems$Type == "PLAYR2", "Sequence"], sep = "")))
            id <- id + 1
      }
      ret <- data.frame(ret)
      names(ret) <- c("Name", "Sequence")
      write.table(ret, f_name, col.names = T, row.names = F, quote = F, sep = "\t")
}



parse_primer3_output <- function(f_name)
{
      v <- readLines(f_name)
      s <- strsplit(v, "=")
      s <- select_primer3_output_lines(s)
      s <- t(sapply(s, c, simplify = T))
      
      temp <- strsplit(s[,1], "_")
      
      ret <- sapply(temp, function(x) 
            {
                  pair <- x[[3]]
                  type <- x[[2]]
                  key <- ""
                  if(length(x) > 3)
                        key <- paste(x[4:length(x)], collapse = "_")
                  else
                        key <- "POS"
                  return(c(pair, type, key))
            
      })
      ret <- t(ret)
      ret <- data.frame(ret, s[,2], stringsAsFactors = F)
      names(ret) <- c("id", "type", "key", "val")
      tab_pairs <- ret[ret$type == "PAIR",]
      tab_primers <- ret[ret$type != "PAIR",]
      ret <- list(tab_primers = tab_primers, tab_pairs = tab_pairs)
      ret <- lapply(ret, function(x) {dcast(x, id+type~key, value.var = "val")})
      ret <- lapply(ret, data_frame_factor_to_char)
      
      temp <- ret$tab_primers
      v <- strsplit(temp$POS, ",")
      v <- t(sapply(v,c))
      colnames(v) <- c("START", "LEN")
      temp$POS <- NULL
      temp <- data.frame(temp, v, stringsAsFactors = F)
      ret$tab_primers <- temp
      
      ret <- lapply(ret, function(x)
            {
                  i <- !(names(x) %in% c("type", "SEQUENCE"))
                  x[i] <- lapply(x[i], as.numeric)
                  x <- data.frame(x)
                  names(x) <- tolower(names(x))
                  return(x)
            })
      ret$tab_primers$start <- ret$tab_primers$start + 1
      #Primer3 reports the right primer as the reverse complement, so the first base is actually the last on the mRNA
      ret$tab_primers <- reverse_right_primers(ret$tab_primers)
      
      ret$tab_primers <- sort_primer_pairs(ret$tab_primers)
      ret$tab_primers <- cbind(ret$tab_primers, unique_id = 1:nrow(ret$tab_primers))
      return(ret)
}

reverse_right_primers <- function(tab)
{
      x <- tab$type == "RIGHT"
      tab[x, "sequence"] <- revcomp(tab[x, "sequence"])
      tab[x, "start"] <- tab[x, "start"] - tab[x, "len"] + 1
      
      return(tab)
}


get_sequence_characteristics <- function(f_name)
{
      s <- readDNAStringSet(f_name)
      s <- as.character(s, use.names = F)
      s <- strsplit(s, "")[[1]]
      gc <- rollapply(s, get_gc, width = 20)
      tm <- rollapply(s, get_tm, width = 20)
      return(data.frame(pos = 1:length(tm), gc, tm))
}

get_gc <- function(s)
{
      tot <- sum(s %in% c("G", "C"))
      return(tot / length(s))
}

get_tm <- function(s)
{
      tot <- sum(s %in% c("G", "C"))
      return(64.9 + (41 *(tot - 16.4)) / length(s))
      
}


      
      
run_primer3 <- function(f_name, n, len, tm, gc, product_size, playrdesign_opt)
{
      opt <- list()
      v <- c("MIN", "OPT", "MAX")
      s <- paste("PRIMER_", v, "_SIZE=", len, sep = "")
      s <- c(s, paste("PRIMER_", v, "_TM=", tm, sep = ""))
      s <- c(s, sprintf("PRIMER_MIN_GC=%f", gc[1]), sprintf("PRIMER_MAX_GC=%f", gc[2]))
      #s <- c(s, sprintf("PRIMER_OPT_GC_PERCENT=%f", gc[2]))
      s <- c(s, sprintf("PRIMER_PRODUCT_SIZE_RANGE=%s", paste(product_size, collapse = "-")))
      
      seq <- readDNAStringSet(f_name)
      seq <- as.character(seq, use.names = F)
      s <- c(s, sprintf("SEQUENCE_TEMPLATE=%s", seq))
      s <- c(s, sprintf("PRIMER_NUM_RETURN=%d", n))
      s <- c(s, sprintf("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=%s", playrdesign_opt$PRIMER3_CONFIG))
      
      template <- list.files(path = file.path(system.file(package = "PLAYRDesign")), pattern = "primer3_settings_template.txt", full.names = T)
      template <- readLines(template)
      primer3_input_fname <- paste(f_name, "PLAYRDesign_primer3_input.txt", sep = ".")
      primer3_output_fname <- paste(f_name, "PLAYRDesign_primer3_output.txt", sep = ".")
      cat(s, template, file = primer3_input_fname, sep = "\n")      
      system(sprintf("%s -output=%s %s", playrdesign_opt$PRIMER3_EXEC, primer3_output_fname, primer3_input_fname))
      
      return(parse_primer3_output(primer3_output_fname))
}


refseq_to_symbol <- function(a)
{
      a <- gsub("\\.[0-9]*$", "", a)
      tt.a <- AnnotationFuncs::translate(a, from = org.Hs.egREFSEQ2EG, to = org.Hs.egSYMBOL)
      excl <- a[!(a %in% names(tt.a))]
      if(length(excl) > 0)
      {
            print(sprintf("Can't convert %s", paste(excl, collapse = ",")))
            temp <- as.list(excl)
            names(excl) <- excl
            tt.a <- c(tt.a, excl)
      }
      return(unlist(tt.a[a]))
      
}

parse_blast_result_txt <- function(f_name, filter_same_gi)
{
      #This assumes a single query sequence
      tab <- read.table(f_name, header = F, sep = "\t", quote = "", stringsAsFactors = F)
      names(tab) <- c("queryId", "subjectId", "percIdentity", "alnLength", "mismatchCount", 
                      "gapOpenCount", "queryStart", "queryEnd", "subjectStart", "subjectEnd", "eVal", "bitScore")
      tab <- tab[(tab$queryId != tab$subjectId),]
      if(filter_same_gi)
      {
            a <- sapply(strsplit(tab$queryId, "\\|"), function(x) {x[[4]]})
            b <- sapply(strsplit(tab$subjectId, "\\|"), function(x) {x[[4]]})
            a <- refseq_to_symbol(a)
            b <- refseq_to_symbol(b)
            tab <- tab[a != b,]
            
            
      }
      strand <- rep("+", nrow(tab))
      
      print(tab)
      #strand[tab$subjectEnd < tab$subjectStart] <- "-"
      res <- GRanges(seqnames = Rle(tab$queryId), strand = Rle(strand), 
                     ranges = IRanges(start = tab$queryStart, end = tab$queryEnd), percIdentity = tab$percIdentity, eVal = tab$eVal, bitScore = tab$bitScore,
                     gapOpenCount = tab$gapOpenCount, subjectId = tab$subjectId)
      print(res)
      return(res)
}

get_seq_from_ranges <- function(gr)
{
      gen <- BSgenome.Hsapiens.UCSC.hg19
      
      s <- getSeq(gen, names = gr)
      
      return(unlist(s))
}


get_exons_for_transcript <- function(id, txdb_file)
{
      txdb <- loadDb(txdb_file)
      #Remove weird chromosomes
      tab <- select(txdb, keys = id, keytype="TXNAME", columns = columns(txdb))
      tab <- tab[grep("_", tab$EXONCHROM, invert = T),]
      u <- unique(tab$EXONCHROM)
      if(length(u) > 1)
      {
            print("Transcript maps to multiple chromosomes, only using first one")
            tab <- tab[tab$EXONCHROM == u[1],]
      }
      gr <- GRanges(seqnames = Rle(tab$EXONCHROM), ranges = IRanges(start = tab$EXONSTART, end = tab$EXONEND), 
                    strand = Rle(tab$EXONSTRAND))
      print(gr)
      return(gr)
}
      

aligned_sequence_to_index <- function(s)
{
      temp <- start(s) - 1
      s <- as.character(s)
      s <- strsplit(s, "")[[1]]
      
      ret <- sapply(s, function(x)
      {
            if(x != "-")
            {
                  temp <<- temp + 1
                  return(temp)
            }
            else
                  return(NA)
      })
      return(ret)
}


project_data_on_seq <- function(query, subject, v.subject)
{
      aln <- pairwiseAlignment(query, subject, type = "global")
      query.idx <- aligned_sequence_to_index(pattern(aln))
      subject.idx <- aligned_sequence_to_index(subject(aln))
      
      subject.idx <- subject.idx[!is.na(query.idx)]
      query.idx <- query.idx[!is.na(query.idx)]
      
      ret <- numeric(length(query))
      ret[query.idx] <- v.subject[subject.idx]
      
      return(ret)
}


get_refseq_id_from_fasta <- function(f_name)
{
      s <- names(readDNAStringSet(f_name))
      s <- strsplit(s, "\\|")[[1]][4]
      s <- strsplit(s, "\\.")[[1]][1]
      return(s)
}


get_seq_from_file <- function(f_name)
{
      return(unlist(readDNAStringSet(f_name)))
}

      

expand_est_alignments <- function(gr.est)
{
      l <- split(gr.est, names(gr.est))
      ret <- lapply(l, function(x)
      {
            start <- strsplit(mcols(x)$tStarts, ",")[[1]]
            size <- strsplit(mcols(x)$blockSizes, ",")[[1]]
            start <- as.numeric(start)
            size <- as.numeric(size)
            return(GRanges(seqnames = seqnames(x), ranges = IRanges(start = start, width = size), strand = strand(x)))
      })
      ret <- GRangesList(ret)
      print(ret)
      return(ret[names(gr.est)])
}


get_exons_skips <- function(gr.gene, gr.est)
{
      sel.est <- subsetByOverlaps(gr.est, gr.gene)
      sel.est.exp <- expand_est_alignments(sel.est)
      ret <-  numeric(length(gr.gene))
      names(ret) <- as.character(1:length(gr.gene))
      
      k <- table(queryHits(findOverlaps(gr.gene, sel.est)))
      ret[names(k)] <- k
      
      k <- table(queryHits(findOverlaps(gr.gene, sel.est.exp)))
      ret[names(k)] <- ret[names(k)] - k
      print("Exons sizes")
      print(width(gr.gene))
      print("Exons skips")
      print(ret)
      v <- width(gr.gene)
      v <- as.vector(Rle(ret, v))
      return(v)      
}


run_blast_analysis_for_seq <- function(f_name, db, filter_same_gi, playrdesign_opt)
{
      seq_length <- nchar(readDNAStringSet(f_name))
      blast.f_name <- sprintf("%s_%s.blast_out.txt", f_name, db)
      print("Running BLAST")
      system(sprintf("%s -db %s -query %s -task blastn -outfmt \"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore\" -out %s",
                     playrdesign_opt$BLASTN_EXEC, file.path(playrdesign_opt$BLASTN_DB, db), f_name, blast.f_name))
      print("Done")
      res <- parse_blast_result_txt(blast.f_name, filter_same_gi)
      dj <- disjoin_reduce_blast_ranges(res)
      df <- expand_ranges_to_dataframe(dj, seq_length)
      return(df)      
}

disjoin_reduce_blast_ranges <- function(gr)
{
      dj <- disjoin(gr)
      ov <- as.data.frame(findOverlaps(gr, dj))
      mc <- as.data.frame(mcols(gr))
      mc <- mc[, sapply(mc, is.numeric)]
      
      res <- ddply(ov, ~subjectHits, function(x, mc) {return(colMeans(mc[x$queryHits,]))}, mc = mc)
      mcols(dj) <- res[, names(res) != "subjectHits"]
      return(dj)
}
      
expand_ranges_to_dataframe <- function(gr, seq_length)
{
      mc <- as.data.frame(mcols(gr))
      max.end <- max(end(ranges(gr)))
      df <- data.frame(pos = 1:seq_length)
      
      for(i in 1:ncol(mc))
      {
            v <- Rle(0, seq_length)
            for(j in 1:length(ranges(gr)))
            {
                  r <- ranges(gr)[j]
                  v[r] <- mc[j, i]
            }
            df <- cbind(df, as.vector(v))
      }
      names(df) <- c("pos", names(mc))
      return(df)
}
      
      
                  
                  
                  
data_frame_factor_to_char <- function(df)
{
      i <- sapply(df, is.factor)
      df[i] <- lapply(df[i], as.character)
      return(df)
}
      
 
nolanlab/PLAYRDesign documentation built on May 23, 2019, 9:32 p.m.