R/bobR.R

Defines functions angle colonyCount gapPositions fineSegment unGap maxGroupScore sumGroupScores maxInContig readYKBLAST seqPile

Documented in colonyCount fineSegment gapPositions maxGroupScore maxInContig readYKBLAST seqPile sumGroupScores unGap

# First R Package bobhelpR
#
# Just a bunch of helper functions
# but in particular functions to deal with Y1000 BLAST data
#
# ----------------------------------------------------------
#
#                   seqPile function
#
# makes a pileup vector from a vector of sequence starts and a vector of sequence ends
#  as in DNA or protein sequences
#
# Input
#   a vector of start numbers
#   a vector of end numbers
#
# Returns a vector of the collapsed sequence runs for compiling a histogram
#

# returning as a vector of numbers

#' generate a vector of numbers based on start and end points of sequences
#'
#' @param start_vector vector of start numbers for generation of sequences
#' @param end_vector vector of end numbers for generation of sequences
#' @export

seqPile <- function(start_vector,end_vector) {
  listPile <- purrr::map2(start_vector,end_vector,seq) %>%
    purrr::reduce(c)
}

#
# ----------------------------------------------------------
#
#                readYKBLAST function
#
# reads a full BLAST report from the Y1000 BLAST server and parses into a dataframe
#
# Input
#   dataDir defaults to 'data/'
#   filename defaults to 'sequenceserver-full_tsv_report.tsv'

#' read BLAST results file from the Y1000 BLAST server
#'
#' @param file path to BLAST results text file
#' @param skip number of lines to skip to get to header
#' @param prefix leading prefix text to remove from header line
#' @export

readYKBLAST <- function(file, skip = 3, prefix = "# Fields: ") {

    header <- read_lines(file, #pluck header line out of BLAST results file
                             skip = skip, n_max = 1) %>%
    str_remove(prefix) %>% # and clean up the header names
    str_split(", ") %>%
    unlist() %>%
    str_remove_all("\\.") %>%
    str_replace_all(" ","_")

  # BLAST data file

  BLAST_data <- read_delim(file,
                           "\t", escape_double = FALSE, col_names = FALSE,
                           comment = "#", trim_ws = TRUE, na = c("", "NA", "N/A")) %>%
    set_names(header) %>%
    # select(-subject_tax_ids:-subject_super_kingdoms,   # remove columns with no data
    #        -subject_ids, -subject_gis,-subject_accs) %>% # remove seemingly duplicate columns
    separate(`query/sbjct_frames`, c("query_frame","subject_frame"), sep = '/') %>%
    mutate(s_mid = s_start + round((s_end - s_start)/2,digits = 0))
  return(BLAST_data)
}


#
# ----------------------------------------------------------
#
#                      alignmentMidpoints
#
# returns the midpoint position of the alignment wrt the subject template
#

# maxPerContigMidpoint function
#
# calculates the position on a template of the maximum alignment score from a group of contigs and scores
# useful for results from genome-wide alignment searches where there may be multiple "hits" on a contig
#
# INPUT
#   for vectors for variables:
#      contig_names
#      align_start
#      align_end
#      score
#
# OUTPUT
#   df of contig names and the midpoint wrt contig of the max BLAST hit

#' Find the maximum alignment score on each subject DNA contig contained in the BLAST data
#'
#' @param contig_names vector of contig names taken from BLAST data
#' @param align_mid vector of alignment midpoints from BLAST data
#' @param score vector of BLAST scores from BLAST data
#' @export

maxInContig <- function(contig_names, align_mid, score) {

  # capture calling variable name for contig_names
  #x=last(substitute(contig_names))
  x = as.character(substitute(contig_names))
  y = last(unlist(str_split(x,"\\$")))

  #message(paste0("input contig names are ",x))

  max_positions <- data.frame(contig_names,align_mid, score) %>%
    group_by(contig_names) %>%
    filter(score == max(score)) %>%
    select(contig_names,max_loc = align_mid) %>%
    # restore original column name from calling variable
    #rename(!!x := contig_names)
    rename(!!y := contig_names)
}


#
# ----------------------------------------------------------
#
#               alignmentGroup function
#
# INPUT
#   A Y1000 BLAST alignment file
# OUTPUT
#   A BLAST alignment file filtered to include max scoring hit on each DNA subject
#   plus lower scoring alignments that are within query DNA length of the high scoring alignment

#' group fragmented alignments based on proximity to highest score on each subject template
#'
#' @param BLASTdf dataframe of BLAST data from Y1000 project
#' @param query_length_DNA DNA length of the query sequence
#' @export

alignmentGroups <- function (BLASTdf, query_length_DNA) {

  maxPositions <- maxInContig (BLASTdf$subject_id,BLASTdf$s_mid,BLASTdf$score)

  BLASTdf %>% left_join(maxPositions) %>%
    mutate (dif_to_max = abs(s_mid - max_loc)) %>%
    filter(dif_to_max < query_length_DNA)
}

#
# ----------------------------------------------------------
#
#                  scoreSUM function
#
# INPUT
#   A list of DNA subjects - contig, scaffold, chromosome, etc
# OUTPUT
#   DF with summation of grouped BLAST alignment scores

#' Sum scores of grouped alignments
#'
#' @param BLASTdf dataframe containing BLAST results from Y1000 server
#' @export

sumGroupScores <- function(BLASTdf) {
  score_sums <- BLASTdf %>%
    group_by(subject_id) %>%
    summarise(
      score_sum = sum(score),
      n = n()
    )
}

#
# ----------------------------------------------------------
#
#                   maxGroupScores

#' find maximum score of alignment groups for each species
#'
#' @param BLASTdf data frame of Y1000 BLAST server information
#' @export

maxGroupScore <- function(BLASTdf) {
  bySpeciesMax <- sumGroupScores(BLASTdf) %>%
    mutate(copy_sub = subject_id) %>%
    separate(copy_sub,c("species",NA),sep=".fas") %>%
    group_by(species) %>%
    filter(score_sum == max(score_sum))
}


#
# ----------------------------------------------------------
#
#                       unGap
#
# INPUT
#   seq1
#   seq2
#
# OUTPUT
#   seq2 with insertions wrt seq1 removed

#' remove sequence insertions from sequence 2 based on gaps in sequence 1
#'
#' @param seq1 string containing a reference sequence
#' @param seq2 string containing a sequence with insertions to remove
#' @export

unGap <- function(seq1, seq2) {

  #validate that seq1 has gaps, otherwise return seq2
  if (str_detect(seq1, "-", negate = T)) {
    return (seq2)
  } else {
    char_seq1 <- str_split(seq1,"")[[1]]
    char_seq2 <- str_split(seq2,"")[[1]]
    return (paste0(char_seq2[char_seq1 != "-"], collapse=""))
  }

}

#
# ----------------------------------------------------------
#
#                     fineSegment
#
# INPUT
#   query sequence match start
#   query sequence from match
#   subject sequence from match
#
# Output
#   A vector of positions conserved wrt the query sequence
#

#' Return number segments of aligned sequences taking gaps into account
#'
#' @param start integer start position of the alignment with respect to the query strand
#' @param query string containing query strand sequence from alignment
#' @param subject string containing subject strand sequence from alignment
#' @export

fineSegment <- function(start,query,subject) {
  new.string <- unGap(query,subject)
    # gaps in query now removed from subject
  segments <- as.data.frame(str_locate_all(new.string,'[A-Z]{1,}'))
  pad.segments <- data.frame(
    start = segments$start+start -1,
    end = segments$end + start -1
  )
  segmentList <- seqPile(pad.segments$start, pad.segments$end)

  return(segmentList)
}

#
# ----------------------------------------------------------
#
#                      gapPositions
#
# INPUT
#   sequence with dash character as gap
#
# OUTPUT
#   positions in seqeunce that are opened for gap from - gap opens after numbered position
#
# note that this really is broken. If a gap of greater than 1 is made, it ignores anything less that that gap number and does not decrement
# the sequence appropriately
# may be able to fix with addition of negative look ahead and behind in the regex
# for instance this removes only the specified number of repeated 'p's in a string
#  str_remove_all(fruits, "(?<!p)p{3}(?!p)")
# this one removes a range
# str_remove_all(fruits, "(?<!p)p{1,3}(?!p)")

#' find all positions of gaps in a sequence
#'
#' returns a vector of positions in the sequence string representing sequence position after which a gap of \code{gapSize} or greater occur
#' numbers returned are for a completely ungapped sequence
#'
#' @param sequence a sequence string with gaps denoted by '-' from a sequence alignment of BLAST search
#' @param gapSize integer defining the number of gaps >= gapSize to consider when finding positions
#' @importFrom stringr str_locate_all str_remove_all
#' @export

gapPositions <- function(sequence, gapSize = 1) {
  if(gapSize >1) {
    underPattern <- paste0('(?<!-)-{1,',gapSize-1,'}(?!-)')
    #message(underPattern)
    sequence <- str_remove_all(sequence, underPattern)
  }
  pattern <- paste0('(?<!-)-{',gapSize,',}(?!-)')
  positionDF <- as.data.frame(str_locate_all(sequence,pattern)[[1]]) %>%
    mutate(len = end - start + 1,
           cum = cumsum(len),
           recount = end - cum)
  return(positionDF$recount)
}

# gapPositions <- function(sequence, gapSize = 1) {
#   pattern <- paste0('-{',gapSize,',}')
#   positionDF <- as.data.frame(str_locate_all(sequence,pattern)[[1]]) %>%
#     mutate(len = end - start + 1,
#            cum = cumsum(len),
#            recount = end - cum)
#   return(positionDF$recount)
# }

#
# ----------------------------------------------------------
#
#                  getYeastUniProtInfo
#
# INPUT
#   URL - defaults to 'https://www.uniprot.org/docs/yeast.txt'
#   path - defaults to "uniprot.txt"
#   skip - defaults to 58
#   widths - defaults to c(75, 20, 11, 12, 11, 5, 4, 3)
#   names - defaults to c("gene", "ORF", "swiss-prot-AC","swiss-prot-name","SGD-rec", "size-AA", "3D", "chromosome")
#
# OUTPUT
#   side effect of writing the uniprot.txt file into the working directory

#' gets yeast UniProt info from UniProt URL and saves to local file
#'
#' This function simply loads a list of the yeast genes in the Swiss UNIPROT database
#' and saves it as a file
#'
#' @param URL The web location of the file.
#' Defaults to \code{'https://www.uniprot.org/docs/yeast.txt'}
#' @param path The directory path to store the file
#' Defaults to \code{"uniprot.txt"} in working directory.
#' @importFrom RCurl getURL
#' @export

getYeastUniProtInfo <- function (URL='https://www.uniprot.org/docs/yeast.txt',path="uniprot.txt") {
  getURL(URL) %>% readr::write_file(path)
}


#
# ----------------------------------------------------------
#
#                 loads Yeast UniProt info from local file
#
# INPUT
#   path - defaults to "uniprot.txt"
#   skip - defaults to 58
#   colwidths - defaults to c(75, 20, 11, 12, 11, 5, 4, 3)
#   names - defaults to c("gene", "ORF", "swiss-prot-AC","swiss-prot-name","SGD-rec", "size-AA", "3D", "chromosome")
#
# OUTPUT
#   df containing file info

#' loads Yeast UniProt info from local file
#'
#' Function reads file \code{uniprot.txt} from current directory which contains information
#' on yeast proteins in the Swiss UniProt database. \code{uniprot.txt} is fetched from the
#' Swiss uniprot site with command \code{getYeastUniProtInfo}
#'
#' @param path The directory path to read the file
#' Defaults to \code{"uniprot.txt"} in working directory but can be supplied in function call
#' @param skip - defaults to 58
#' @param max - defaults to 6726
#' @param colwidths - defaults to \code{c(75, 20, 11, 12, 11, 5, 4, 3)}
#' @param names - defaults to \code{c("gene", "ORF", "swiss-prot-AC","swiss-prot-name","SGD-rec", "size-AA", "3D", "chromosome")}
#' @export


loadYeastUniProtInfo <- function (path="uniprot.txt", skip = 58, n_max = 6726, colwidths = c(75, 20, 11, 12, 11, 5, 4, 3),
                                  colnames = c("gene", "ORF", "swiss-prot-AC","swiss-prot-name","SGD-rec", "size-AA", "3D", "chromosome")) {
  read_fwf("uniprot.txt", skip = skip, n_max = n_max, fwf_widths(colwidths, colnames))

}

#
# ----------------------------------------------------------
#
#                 colonyCount
#
# INPUT
#   img - a single plate image
#   labelText - optional
#
# SIDE EFFECT
#   display image with text
#   display color labels for colonies
#
# OUTPUT
#   number of found objects

#' Identifies colonies on a scanned plate and returns number of colonies
#'
#' @param image A single plate image
#' @param labelText Text to display over image
#' @param maxArea Maximum size cutoff to filter out image artefacts
#' @param minArea Minimum size cutoff to filter non-colony objects
#' @param eccentricity Cutoff for colony objects
#' @export

colonyCount <- function(img,labelText="", maxArea = 800, minArea = 25, eccentricity = 0.6) {
  thr <- img < EBImage::otsu(img)  # automatic threshold level detection
  wat <- EBImage::watershed(EBImage::distmap(thr), tolerance = 1, ext = 1) # defaults


  # Statistics for all detected objects
  obj <- screenmill:::object_features(wat)

  # Review distribution of objects to select filtering parameters
  plot(density(obj$area), xlim = c(0, 1000))
  plot(density(na.omit(obj$eccen)))

  # "Colonies"
  colonies <- obj %>%
    filter(
      area < maxArea,
      area > minArea,
      eccen < eccentricity
    )

  # Unlabel non-colony objects
  result <- as.matrix(wat)
  result[which(!result %in% colonies$obj)] <- 0L

  EBImage::display(img, method = "raster")
  text(x = round(nrow(img),0), y = round(ncol(img),0),
       label=labelText,
       col="firebrick", cex=2)
  EBImage::display(colorLabels(result), method = "raster")
  return(nrow(colonies))
}

#' Computes the angle between three points
#'
#' \code{angle} computes the angle between three points
#' @param A Vector containing the xy-cooydinates of point A
#' @param B Vector containing the xy-cooydinates of point B. This point acts as the vertex of angle ABC
#' @param C Vector containing the xy-cooydinates of point C
#' @return Angle between the three points in radians
#' @export

angle<-function(A, B, C, degrees=FALSE){
  vector1 = B - A
  vector2 = C - B
  # atan2 arguments are (y,x)
  ang <- atan2(vector1[2],vector1[1]) - atan2(vector2[2],vector2[1])
  if (degrees==TRUE){
    ang <- ang * 180 / pi
  }
  return(ang)
}
RobertJDReid/bobhelpR documentation built on July 7, 2023, 9:02 a.m.