R/raceseq.R

#' Title
#'
#' @param dataset 
#'
#' @return
#' @export
#'
#' @examples
process_data <- function(dataset) {
  processed_dataset <- dataset 
  processed_dataset$tailed <- processed_dataset$tail_length > 0  #mark tailed reads
  processed_dataset$mapped <- processed_dataset$mapping_position != -1  #mark mapped reads
  processed_dataset_mapped <- processed_dataset %>% filter(mapped != 0)  #discard unmapped reads
  
  # fix of flase_no_tail - should be assigned no tail cause CTGAC was found in the
  # clipped fragment - to be fixed in python script
  #tails_data_mapped$tail_type <- as.character(tails_data_mapped$tail_type)
  #tails_data_mapped[tails_data_mapped$tail_type == "false_no_tail_no_CTGAC", ]$tail_type <- "no_tail"
  
  processed_dataset_mapped$ref_name_R5 <- as.character(processed_dataset_mapped$ref_name_R5)
  processed_dataset_mapped$ref_name_R3 <- as.character(processed_dataset_mapped$ref_name_R3)
  # tails_data_mapped_same_ref<-tails_data_mapped[tails_data_mapped$ref_name_R5==tails_data_mapped$ref_name_R3,]
  
  # mark uridylated reads
  processed_dataset_mapped$uridylated2 <- FALSE
  processed_dataset_mapped[processed_dataset_mapped$Utail_length > 0, ]$uridylated2 = TRUE
  
  #convert T to U in terminal nucleotides (for seqlogo)
  processed_dataset_mapped$terminal_nucleotides<-as.character(processed_dataset_mapped$terminal_nucleotides)
  processed_dataset_mapped$terminal_nucleotides<-gsub("T","U",processed_dataset_mapped$terminal_nucleotides)
  
  #summary(as.factor(processed_dataset_mapped$CTGAC_R5))
  
 # colnames(processed_dataset_mapped)
  #head(processed_dataset_mapped[,c(1,2,3,12,24,25,26,27,33,34,37,38)])
  
  # in further analyses use only those read which got CTGAC delimiter identified in
  # the clipped fragment
  # for reporter analyses all mapped reads got CTGAC_R5 variable = 1 (because of short reads) so they will be included in the analysis
  processed_dataset_mapped_true <- processed_dataset_mapped[processed_dataset_mapped$CTGAC_R5 > 0, ]
  processed_dataset_mapped_true$ref_name = processed_dataset_mapped_true$ref_name_R5  #use ref_name_R5 as ref_name
  
  #head(processed_dataset_mapped_true[,c(1,2,3,12,24,25,26,27,33,34,37,38)])
  
  #head(tails_data_mapped[tails_data_mapped$tail_source=='tailseq_clip_clip_R3_shorter_than_tailseq',c(1,2,26,27,40)],30)
  
  processed_dataset_mapped_true$tail_type = as.character(processed_dataset_mapped_true$tail_type)
  
  #head(processed_dataset_mapped_true[grep("A_mix",processed_dataset_mapped_true$tail_type_mixed),])
  
  #treat all mixed (heterogenous) as true tails
  #tails_data_mapped_true[grep("AU_mixed",tails_data_mapped_true$tail_type_mixed),]$tail_type = 'AU'
  #tails_data_mapped_true[grep("^A_mixed",tails_data_mapped_true$tail_type_mixed),]$tail_type = 'A_only'
  #tails_data_mapped_true[grep("^U_mixed",tails_data_mapped_true$tail_type_mixed),]$tail_type = 'U_only'
  processed_dataset_mapped_true[processed_dataset_mapped_true$tail_type=='AU',]$uridylated2 = TRUE
  processed_dataset_mapped_true[processed_dataset_mapped_true$tail_type=='U_only',]$uridylated2 = TRUE
  #if clipping was shorter than tailseq - treat as other, as it is probably stretch of As in the reference seq
  #tails_data_mapped_true[tails_data_mapped_true$tail_source=='tailseq_clip_clip_R3_shorter_than_tailseq',]$tail_type = 'no_tail'
  #tails_data_mapped_true[tails_data_mapped_true$tail_source=='tailseq_clip_clip_R3_shorter_than_tailseq',]$uridylated = FALSE
  
  
  # remove heterogenous tails from analysis
  processed_dataset_mapped_true_no_hetero = processed_dataset_mapped_true[-grep("hetero", processed_dataset_mapped_true$tail_type),
                                                            ]
  
  #head(processed_dataset_mapped_true_no_hetero,20)
  # remove other type tails (for which we can suspect they are not tails but rather origin from improper mapping/repeatmasker) from the analysis
  processed_dataset_mapped_true_no_hetero_no_other = processed_dataset_mapped_true_no_hetero[-grep("other",
                                                                                                   processed_dataset_mapped_true_no_hetero$tail_type), ]
  
  
  # treat all AG,UG or UA tails as other_no_tail
  
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$tail_type ==
                                              "AG", ]$tail_type <- "other_no_tail"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$tail_type ==
                                              "UG", ]$tail_type <- "other_no_tail"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$tail_type ==
                                              "UA", ]$tail_type <- "other_no_tail"
  processed_dataset_mapped_true_no_hetero_no_other <- processed_dataset_mapped_true_no_hetero_no_other[-grep("other",
                                                                                                             processed_dataset_mapped_true_no_hetero_no_other$tail_type), ] #remove all other from analysis
  
  
  # create classes for A-tail lengths (0,1,2-5,6-10,11-20,21-30,30+)
  processed_dataset_mapped_true_no_hetero_no_other$A_length = ""
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length ==
                                              0, ]$A_length = "0"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length ==
                                              1, ]$A_length = "1"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length %in%
                                              seq(2, 5, 1), ]$A_length = "2-5"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length %in%
                                              seq(6, 10, 1), ]$A_length = "6-10"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length %in%
                                              seq(11, 20, 1), ]$A_length = "11-20"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length %in%
                                              seq(21, 30, 1), ]$A_length = "21-30"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Atail_length >
                                              30, ]$A_length = "30+"
  processed_dataset_mapped_true_no_hetero_no_other$A_length <- factor(processed_dataset_mapped_true_no_hetero_no_other$A_length,
                                                               levels = c("0", "1", "2-5", "6-10", "11-20", "21-30", "30+"))
  
  # create classes for U-tail lengths (0,1,2,3-5,6-10,10+)
  processed_dataset_mapped_true_no_hetero_no_other$U_length = ""
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Utail_length ==
                                              0, ]$U_length = "0"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Utail_length ==
                                              1, ]$U_length = "1"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Utail_length ==
                                              2, ]$U_length = "2"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Utail_length %in%
                                              seq(3, 5, 1), ]$U_length = "3-5"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Utail_length %in%
                                              seq(6, 10, 1), ]$U_length = "6-10"
  processed_dataset_mapped_true_no_hetero_no_other[processed_dataset_mapped_true_no_hetero_no_other$Utail_length >
                                              10, ]$U_length = "10+"
  processed_dataset_mapped_true_no_hetero_no_other$U_length <- factor(processed_dataset_mapped_true_no_hetero_no_other$U_length,
                                                               levels = c("10+", "6-10", "3-5", "2", "1", "0"))
  
  
  # modify levels of tail_types to have U_only,A-only,AU or no_tail
  processed_dataset_mapped_true_no_hetero_no_other_tails <- processed_dataset_mapped_true_no_hetero_no_other
  processed_dataset_mapped_true_no_hetero_no_other_tails$tail_type <- as.character(processed_dataset_mapped_true_no_hetero_no_other_tails$tail_type)
  processed_dataset_mapped_true_no_hetero_no_other_tails$tail_type <- factor(processed_dataset_mapped_true_no_hetero_no_other_tails$tail_type,
                                                                      levels = c("U_only", "AU", "no_tail", "A_only"))
  
  
  
  
  all_data <- processed_dataset_mapped_true_no_hetero_no_other_tails %>% select(V1,tail_sequence,tail_type,Atail,Atail_length,Utail_length,tail_length,tail_source,transcript,cell_line,localization,condition,replicate,primer_name,project_name,mapping_position,exp_type,CTGAC_R5,terminal_nucleotides,uridylated,uridylated2,tailed,mapped) %>% dplyr::group_by(project_name,condition,replicate,transcript,primer_name,cell_line,uridylated)
  all_data <- all_data %>% filter(tail_length <= 64)
  return(all_data)
}


#' Title
#'
#' @param dataset 
#' @param transcript2 
#' @param exp_type2 
#' @param mapping_position_min 
#' @param mapping_position_max 
#' @param project 
#' @param conditions 
#' @param primers 
#' @param cell_lines 
#' @param localizations 
#' @param persons 
#'
#' @return
#' @export
#'
#' @examples
filter_data <- function(dataset,transcript2,exp_type2,mapping_position_min=NA,mapping_position_max=NA,project=NA,conditions=NA,primers=NA,cell_lines=NA,localizations=NA,persons=NA) 
  {
  #get filtered data from input dataset
  #first, filter by transcript and experiment type (OVR,KD,LEAP)
  transcript2 <- as.vector(transcript2)
  exp_type2 <- as.vector(exp_type2)
  project <- as.vector(project)
  cell_lines <- as.vector(cell_lines)
  primers <- as.vector(primers)
  conditions <- as.vector(conditions)
  localizations <- as.vector(localizations)
  persons <- as.vector(persons)
  test_trans <- dataset %>% dplyr::filter(transcript %in% transcript2,exp_type %in% exp_type2) 
  #if project is specified - used for filtering
  if(length(project)>0 & !is.na(project))
  {
        test_trans <- test_trans %>% dplyr::filter(project_name %in% project)
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% dplyr::filter(condition %in% conditions)
  }
  #if primers are specified - use for filtering
  if(length(primers)>0 & !is.na(primers)) {
    test_trans <- test_trans %>% dplyr::filter(primer_name %in% primers)
  }
  #if cell_lines are specified - use for filtering
  if(length(cell_lines)>0 & !is.na(cell_lines)) {
    test_trans <- test_trans %>% dplyr::filter(cell_line %in% cell_lines)
  }
  #if localization are specified - use for filtering
  if(length(localizations)>0 & !is.na(localizations)) {
    test_trans <- test_trans %>% dplyr::filter(localization %in% localizations)
  }
  #if localization are specified - use for filtering
  if(length(persons)>0 & !is.na(persons)) {
    test_trans <- test_trans %>% dplyr::filter(peron %in% persons)
  }
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position>mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position<mapping_position_max)
  }
  #finally - drop unused factor levels
  test_trans <- test_trans %>% droplevels()
  return(test_trans)
}


#' Title
#'
#' @param test_trans 
#' @param test 
#' @param facet_by 
#' @param values 
#' @param grouping_var 
#' @param p.adjust.method 
#'
#' @return
#' @export
#'
#' @examples
calculate_stats <- function(test_trans,test = 'Dunn',facet_by = NA,values=NA,grouping_var=NA,p.adjust.method = "BH")
{

  if (is.na(facet_by)) {
    if(test=='Dunn') 
      {
        stats <- kwManyOneDunnTest(test_trans[[values]],test_trans[[grouping_var]],p.adjust.method = p.adjust.method)
    } else if (test=='Tukey') {
        stats <- tukeyTest(test_trans[[values]],test_trans[[grouping_var]],p.adjust.method = p.adjust.method)
    }
    else {
      stop("Wrong test type provided as an argument")
    }
    
  } else 
  {
    test_trans <- test_trans %>% group_by(.dots = c(facet_by))
    if(test=='Dunn') 
      {
      stats_multiple = test_trans %>% dplyr::do(stats = kwManyOneDunnTest(.[[values]],.[[grouping_var]]))
    } else if(test=='Tukey') 
    {
      stats_multiple = test_trans %>% dplyr::do(stats = tukeyTest(.[[values]],.[[grouping_var]]))
    }
    else {
      stop("Wrong test type provided as an argument")
    }
    vars_test = stats_multiple %>% group_by_if(is.factor) %>% group_vars()
    stats_multiple  <- stats_multiple %>% ungroup() %>% mutate(name = paste(!!!rlang::syms(vars_test),sep="_")) 
    names(stats_multiple$stats) <- stats_multiple$name
    stats <- stats_multiple$stats
  }
  return(stats)
}


#' Analyze uridylation
#'
#' @param dataset               - dataset to analyze
#' @param transcript2           - transcript to analyze
#' @param exp_type2             - experiment type (OVR,KD,LEAP,NT)
#' @param mapping_position_min  - min position of mapping in the reference
#' @param mapping_position_max  - max position of mapping in the reference
#' @param project               - project (project_name) to include in the analysis (character of character vector)
#' @param facet_projects        - make facets based on projects (T/F)
#' @param conditions            - filter for selected conditions (character vector)
#' @param include_jitter        - include jitter dots on bar plot (T/F)
#' @param localizations 
#' @param persons 
#' @param primers 
#' @param cell_lines 
#'
#' @return                      - list containing calculated uridylation data (calculated output), Dunn test results and lot 
#' @export
#'
#' @examples
#' analyze_uridylation(all_data,"ACTB","OVR",project=c("Tailseq_4","Tailseq_5"),facet_projects==TRUE,conditions=c("CNTRL","TUT4WT","TUT7WT","MOV10"))
analyze_uridylation <- function(dataset,transcript2,exp_type2,mapping_position_min=NA,mapping_position_max=NA,project=NA,facet_projects=FALSE,conditions=NA,include_jitter=TRUE,localizations=NA,persons=NA,primers=NA,cell_lines=NA) {
  
  
  output = list() #list for storing output
  test_trans <- filter_data(dataset,transcript2 = transcript2,exp_type2 = exp_type2,conditions = conditions,project = project,mapping_position_max = mapping_position_max,mapping_position_min = mapping_position_min,cell_lines = cell_lines,primers = primers,persons = persons, localizations = localizations)
  #print(mapping_position_min)
 # print(mapping_position_max)
  #print(transcript2)
  #print(exp_type2)
  #get filtered data from input dataset
  #first, filter by transcript and experiment type (OVR,KD,LEAP)
  plot_title <- paste(exp_type2,transcript2,"uridylation")
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    plot_title <- paste(plot_title,"min map pos: ",mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    plot_title <- paste(plot_title,"max map pos: ",mapping_position_max)
  }
  
  test_trans <- test_trans %>% dplyr::group_by(condition,replicate,project_name,uridylated2)
  #print(test_trans)
  #summarize uridylation data using dplyr
  if (facet_projects == FALSE) {
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  else {
    #if using facets - group by projects also
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2,project_name) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  #print(test_trans)
  #leave only uridylation values (exclude uridylation == FALSE)
  test_trans <- test_trans %>% dplyr::filter(uridylated2==TRUE) 
  #store calculated values
  output$calculated_values = test_trans
  #select data for plot
  #test_trans %>% ungroup() %>% select(replicate,freq_urid,project_name,condition) 
  #create plot
  plot_out <- test_trans %>% ggplot(aes(x=condition)) + geom_bar(stat="identity",position="dodge",aes(y=mean_freq_urid))
  plot_out <- plot_out  + geom_errorbar(aes(ymin =  mean_freq_urid - sd_urid, ymax = mean_freq_urid + sd_urid),colour = "black", width = 0.1, position = position_dodge(0.9)) 
  plot_out <- plot_out + ggtitle(plot_title)
  plot_out <- plot_out + xlab("condition") + ylab("fraction of transcripts")
  if (include_jitter==TRUE) {
    plot_out <- plot_out + geom_jitter(aes(y=freq_urid))
  }
  
  #create facets 
  if (facet_projects==TRUE) {
    plot_out <- plot_out + facet_grid (project_name ~ .)
  }
  #perform dunn test (one to many, first condition is used as control)
  output$dunn_test <- list()
  #if using facets - perform dunn test on each project independently
  if (facet_projects==TRUE) {
    dunn_test_pval2<-c() #vector for storing dunn test pvalues 
    for (proj in levels(test_trans$project_name)) {
      #iterate tgrough projects
      test_proj <- test_trans %>% dplyr::filter(project_name==proj) #filter data for single project
      if(nrow(test_proj)>0) { 
        #if there are any data after filtering, calculate Dunn's test
        #print(proj)
        #calculate Dunn using kwManyOneDunnTest from PCMCRPlus package
        dunn_test <- kwManyOneDunnTest(test_proj$freq_urid,test_proj$condition,p.adjust.method = "BH")
       # print(dunn_test)
        #store Dunn test results in a list
        dunn_output<-dunn_test$p.value
        colnames(dunn_output)<-c("p.value (Dunn test)")
        output$dunn_test[[proj]] <- dunn_output
        dunn_test_pval <- as.data.frame(dunn_test$p.value) # get pvalues
        #dunn_test_pval2 <- c(0,dunn_test_pval[,1]) 
        dunn_test_pval2<-c(dunn_test_pval2,dunn_test_pval[,1]) #store pvalues
      }
    }
  } else #if no facets, calculate Dunn for all projects together
  {
    dunn_test <- kwManyOneDunnTest(test_trans$freq_urid,test_trans$condition,p.adjust.method = "BH")
  #  print(dunn_test)
    dunn_output<-dunn_test$p.value
    colnames(dunn_output)<-c("p.value (Dunn test)")
    output$dunn_test <- dunn_output

    dunn_test_pval <- as.data.frame(dunn_test$p.value)
    dunn_test_pval2 <- dunn_test_pval[,1] #store pvalues
  }
  
  #get plot data to build info about significance
  pg <- ggplot_build(plot_out)
  #get max position of error bars from plot data (pg$data[2]), group by PANEL (facets) and group (condition)
  plot_data<-as.data.frame(pg$data[2]) %>% dplyr::group_by(group,PANEL) %>% summarise(max_pos = max(ymax)) %>% ungroup() %>% mutate(max_pos2=max(max_pos)) %>% mutate(y=max_pos2+0.05 + max_pos2*(0.25*group)) %>% dplyr::filter(group!=1) %>% arrange(PANEL,group)
#  print(plot_data)
  #create_annotation
  if (facet_projects==TRUE) 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(project_name,condition) %>% summarise(replicate=mean(replicate)) %>% arrange(project_name,condition) %>% slice(-1)
  } else 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(condition) %>%summarise(replicate=mean(replicate)) %>% slice(-1)
  }
  #calculate ylim
  ylim_value = max(plot_data$y) +0.15 * max(plot_data$y)
  plot_out = plot_out + expand_limits(y=c(0,ylim_value))
  #get position of significance marks from plot data
  annotation$y=plot_data$y
  #store pvalues from Dunn's test
  annotation$pvalue=dunn_test_pval2
  #store start position for significance bars - always first condition - control
  annotation$start=test_trans$condition[1]
  #create significance asterisks:
  annotation$sig=''
  if(any(annotation$pvalue<=0.05)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "*"
  }
  if(any(annotation$pvalue<=0.01)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "**"
  }
  if(any(annotation$pvalue<=0.001)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "***"
  }
  #exclude from annotation all conditions without signficance
  annotation <- annotation[annotation$sig!='',]
 # print(annotation)
  #if there atre any significant differences
  if (nrow(annotation)>0) {
    plot_out <- plot_out + geom_signif(data=annotation,aes(xmin=start, xmax=condition, annotations=sig, y_position=y),textsize = 10, vjust = 0.6,manual=TRUE)
  }
  
  #print(plot_out) #print plot
  output$plot <- plot_out #store plot in output
  return(output) #return output
  
}



analyze_uridylation_test <- function(dataset,transcript2,exp_type2,mapping_position_min=NA,mapping_position_max=NA,project=NA,facet_by=NA,conditions=NA,include_jitter=TRUE,localizations=NA,persons=NA,primers=NA,cell_lines=NA)
  {
  
  facet_by <- as.vector(facet_by)
  if (!is.na(facet_by) & length(facet_by)==1) {
    facet_by<-c(facet_by,".")
  }
  output = list() #list for storing output
  test_trans <- filter_data(dataset,transcript2 = transcript2,exp_type2 = exp_type2,conditions = conditions,project = project,mapping_position_max = mapping_position_max,mapping_position_min = mapping_position_min,cell_lines = cell_lines,primers = primers,persons = persons, localizations = localizations)
  #print(mapping_position_min)
  # print(mapping_position_max)
  #print(transcript2)
  #print(exp_type2)
  #get filtered data from input dataset
  #first, filter by transcript and experiment type (OVR,KD,LEAP)
  plot_title <- paste(exp_type2,transcript2,"uridylation")
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    plot_title <- paste(plot_title,"min map pos: ",mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    plot_title <- paste(plot_title,"max map pos: ",mapping_position_max)
  }
  
  test_trans <- test_trans %>% dplyr::group_by(condition,replicate,uridylated2)
  #print(test_trans)
  #summarize uridylation data using dplyr
  if (is.na(facet_by)) {
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  else {
    test_trans <- test_trans %>% group_by(.dots = c("condition","replicate","project_name","uridylated2",facet_by[facet_by!="."]))
    #if using facets - group by projects also
    #test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2,project_name) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(.dots = c("condition","replicate",facet_by[facet_by!="."])) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(.dots = c("condition","uridylated2",facet_by[facet_by!="."])) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  #print(test_trans)
  #leave only uridylation values (exclude uridylation == FALSE)
  test_trans <- test_trans %>% dplyr::filter(uridylated2==TRUE) 
  #store calculated values
  output$calculated_values = test_trans
  #select data for plot
  #test_trans %>% ungroup() %>% select(replicate,freq_urid,project_name,condition) 
  #create plot
  plot_out <- test_trans %>% ggplot(aes(x=condition)) + geom_bar(stat="identity",position="dodge",aes(y=mean_freq_urid))
  plot_out <- plot_out  + geom_errorbar(aes(ymin =  mean_freq_urid - sd_urid, ymax = mean_freq_urid + sd_urid),colour = "black", width = 0.1, position = position_dodge(0.9)) 
  plot_out <- plot_out + ggtitle(plot_title)
  plot_out <- plot_out + xlab("condition") + ylab("fraction of transcripts")
  if (include_jitter==TRUE) {
    plot_out <- plot_out + geom_jitter(aes(y=freq_urid))
  }
  
  #create facets 
  if (!is.na(facet_by)) {
    plot_out <- plot_out + facet_grid (reformulate(facet_by[2:length(facet_by)],facet_by))
  }
  #perform dunn test (one to many, first condition is used as control) 
  #other option - Tukey's test (ANOVA)
  #single function - irrespective of facets
  output$dunn_test<-calculate_stats(test_trans,test="Dunn",facet_by = facet_by,values = "freq_urid",grouping_var = "condition")
  
  
  #get plot data to build info about significance
  pg <- ggplot_build(plot_out)
  #get max position of error bars from plot data (pg$data[2]), group by PANEL (facets) and group (condition)
  plot_data<-as.data.frame(pg$data[2]) %>% dplyr::group_by(group,PANEL) %>% summarise(max_pos = max(ymax)) %>% ungroup() %>% mutate(max_pos2=max(max_pos)) %>% mutate(y=max_pos2+0.05 + max_pos2*(0.25*group)) %>% dplyr::filter(group!=1) %>% arrange(PANEL,group)
  #  print(plot_data)
  #create_annotation
  if (!is.na(facet_by) & facet_by[1]=='project_name') 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(project_name,condition) %>% summarise(replicate=mean(replicate)) %>% arrange(project_name,condition) %>% slice(-1)
  } else 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(condition) %>%summarise(replicate=mean(replicate)) %>% slice(-1)
  }
  #calculate ylim
  ylim_value = max(plot_data$y) +0.15 * max(plot_data$y)
  plot_out = plot_out + expand_limits(y=c(0,ylim_value))
  #get position of significance marks from plot data
  annotation$y=plot_data$y
  #store pvalues from Dunn's test
  annotation$pvalue=dunn_test_pval2
  #store start position for significance bars - always first condition - control
  annotation$start=test_trans$condition[1]
  #create significance asterisks:
  annotation$sig=''
  if(any(annotation$pvalue<=0.05)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "*"
  }
  if(any(annotation$pvalue<=0.01)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "**"
  }
  if(any(annotation$pvalue<=0.001)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "***"
  }
  #exclude from annotation all conditions without signficance
  annotation <- annotation[annotation$sig!='',]
  # print(annotation)
  #if there atre any significant differences
  if (nrow(annotation)>0) {
    plot_out <- plot_out + geom_signif(data=annotation,aes(xmin=start, xmax=condition, annotations=sig, y_position=y),textsize = 10, vjust = 0.6,manual=TRUE)
  }
  
  #print(plot_out) #print plot
  output$plot <- plot_out #store plot in output
  return(output) #return output
  
}



#' Analyze uridylation using Tukey test
#'
#' @param dataset               - dataset to analyze
#' @param transcript2           - transcript to analyze
#' @param exp_type2             - experiment type (OVR,KD,LEAP,NT)
#' @param mapping_position_min  - min position of mapping in the reference
#' @param mapping_position_max  - max position of mapping in the reference
#' @param project               - project (project_name) to include in the analysis (character of character vector)
#' @param facet_projects        - make facets based on projects (T/F)
#' @param conditions            - filter for selected conditions (character vector)
#' @param include_jitter        - include jitter dots on bar plot (T/F)
#'
#' @return                      - list containing calculated uridylation data (calculated output), Dunn test results and lot 
#' @export
#'
#' @examples
#' analyze_uridylation(all_data,"ACTB","OVR",project=c("Tailseq_4","Tailseq_5"),facet_projects==TRUE,conditions=c("CNTRL","TUT4WT","TUT7WT","MOV10"))
analyze_uridylation_Tukey <- function(dataset,transcript2,exp_type2,mapping_position_min=NA,mapping_position_max=NA,project=NA,facet_projects=FALSE,conditions=NA,include_jitter=TRUE) {
  
  
  output = list() #list for storing output
  #print(mapping_position_min)
  # print(mapping_position_max)
  #print(transcript2)
  #print(exp_type2)
  #get filtered data from input dataset
  #first, filter by transcript and experiment type (OVR,KD,LEAP)
  test_trans <- dataset %>% dplyr::filter(transcript==transcript2,exp_type==exp_type2) 
  plot_title <- paste(exp_type2,transcript2,"uridylation")
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position>mapping_position_min)
    plot_title <- paste(plot_title,"min map pos: ",mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position<mapping_position_max)
    plot_title <- paste(plot_title,"max map pos: ",mapping_position_max)
  }
  #if project is specified - used for filtering
  if(!is.na(project)) {
    if (length(project==1)) {
      test_trans <- test_trans %>% dplyr::filter(project_name==project)
    } else {
      test_trans <- test_trans %>% dplyr::filter(project_name %in% project)
    }
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% dplyr::filter(condition %in% conditions)
  }
  #print(test_trans)
  #group data by co
  test_trans <- test_trans %>% dplyr::group_by(condition,replicate,project_name,uridylated2)
  #print(test_trans)
  #summarize uridylation data using dplyr
  if (facet_projects == FALSE) {
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  else {
    #if using facets - group by projects also
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2,project_name) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  #print(test_trans)
  #leave only uridylation values (exclude uridylation == FALSE)
  test_trans <- test_trans %>% dplyr::filter(uridylated2==TRUE) 
  #store calculated values
  output$calculated_values = test_trans
  #select data for plot
  #test_trans %>% ungroup() %>% select(replicate,freq_urid,project_name,condition) 
  #create plot
  plot_out <- test_trans %>% ggplot(aes(x=condition)) + geom_bar(stat="identity",position="dodge",aes(y=mean_freq_urid))
  plot_out <- plot_out  + geom_errorbar(aes(ymin =  mean_freq_urid - sd_urid, ymax = mean_freq_urid + sd_urid),colour = "black", width = 0.1, position = position_dodge(0.9)) 
  plot_out <- plot_out + ggtitle(plot_title)
  plot_out <- plot_out + xlab("condition") + ylab("fraction of transcripts")
  if (include_jitter==TRUE) {
    plot_out <- plot_out + geom_jitter(aes(y=freq_urid))
  }
  
  #create facets 
  if (facet_projects==TRUE) {
    plot_out <- plot_out + facet_grid (project_name ~ .)
  }
  #perform dunn test (one to many, first condition is used as control)
  output$tukey_test <- list()
  #if using facets - perform dunn test on each project independently
  if (facet_projects==TRUE) {
    tukey_test_pval2<-c() #vector for storing dunn test pvalues 
    for (proj in levels(test_trans$project_name)) {
      #iterate tgrough projects
      test_proj <- test_trans %>% dplyr::filter(project_name==proj) #filter data for single project
      if(nrow(test_proj)>0) { 
        #if there are any data after filtering, calculate Dunn's test
        #print(proj)
        #calculate Dunn using kwManyOneDunnTest from PCMCRPlus package
        tukey_test <- tukeyTest(test_proj$freq_urid,test_proj$condition,p.adjust.method = "BH")
        # print(dunn_test)
        #store Dunn test results in a list
        tukey_output<-tukey_test$p.value
        
        output$tukey_test[[proj]] <- tukey_output
        dunn_test_pval <- as.data.frame(dunn_test$p.value) # get pvalues
        #dunn_test_pval2 <- c(0,dunn_test_pval[,1]) 
        tukey_test_pval2<-c(tukey_test_pval2,as.vector(tukey_test$p.value[,1])) #store pvalues
      }
    }
  } else #if no facets, calculate Dunn for all projects together
  {
    tukey_test <- tukeyTest(test_trans$freq_urid,test_trans$condition,p.adjust.method = "BH")
    #  print(dunn_test)
    tukey_output<-tukey_test$p.value
    #colnames(tukey_output)<-c("p.value (Tukey test)")
    output$tukey_test <- tukey_output
    
    tukey_test_pval2 <- as.vector(tukey_test$p.value[,1])

  }
  
  #get plot data to build info about significance
  pg <- ggplot_build(plot_out)
  #get max position of error bars from plot data (pg$data[2]), group by PANEL (facets) and group (condition)
  plot_data<-as.data.frame(pg$data[2]) %>% dplyr::group_by(group,PANEL) %>% summarise(max_pos = max(ymax)) %>% ungroup() %>% mutate(max_pos2=max(max_pos)) %>% mutate(y=max_pos2+0.05 + max_pos2*(0.25*group)) %>% dplyr::filter(group!=1) %>% arrange(PANEL,group)
  #  print(plot_data)
  #create_annotation
  if (facet_projects==TRUE) 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(project_name,condition) %>% summarise(replicate=mean(replicate)) %>% arrange(project_name,condition) %>% slice(-1)
  } else 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(condition) %>%summarise(replicate=mean(replicate)) %>% slice(-1)
  }
  #calculate ylim
  ylim_value = max(plot_data$y) +0.15 * max(plot_data$y)
  plot_out = plot_out + expand_limits(y=c(0,ylim_value))
  #get position of significance marks from plot data
  annotation$y=plot_data$y
  #store pvalues from Dunn's test
  annotation$pvalue=tukey_test_pval2
  #store start position for significance bars - always first condition - control
  annotation$start=test_trans$condition[1]
  #create significance asterisks:
  annotation$sig=''
  if(any(annotation$pvalue<=0.05)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "*"
  }
  if(any(annotation$pvalue<=0.01)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "**"
  }
  if(any(annotation$pvalue<=0.001)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "***"
  }
  #exclude from annotation all conditions without signficance
  annotation <- annotation[annotation$sig!='',]
  # print(annotation)
  #if there atre any significant differences
  if (nrow(annotation)>0) {
    plot_out <- plot_out + geom_signif(data=annotation,aes(xmin=start, xmax=condition, annotations=sig, y_position=y),textsize = 10, vjust = 0.6,manual=TRUE)
  }
  
  #print(plot_out) #print plot
  output$plot <- plot_out #store plot in output
  return(output) #return output
  
}


#' Plot mapping positions
#'
#' @param dataset                - input dataset 
#' @param trans                  - transcript to analyze
#' @param exp_type2              - experiment type (OVR,KD,LEAP,NT)
#' @param conditions             - conditions to include in the analysis
#' @param mapping_position_min   - minimal mapping position to include on plot
#' @param mapping_position_max   - maximum mapping position to include on plot
#' @param facet_projects         - show facets  on projects
#' @param facet_replicates       - show facets on replicates 
#' @param project 
#'
#' @return                       - plot with mapping positions
#' @export
#'
#' @examples
plot_mapping_positions <- function(dataset,trans,exp_type2,conditions = NA,mapping_position_min = NA,mapping_position_max = NA,project = NA, facet_projects = FALSE,facet_replicates = FALSE) {
  #filter input dataset to include only required transcript and experiment type
  test_trans <- dataset %>% dplyr::filter(transcript==trans)
  test_trans <- test_trans %>% dplyr::filter(exp_type==exp_type2)
  #create plot title
  plot_title <- paste(exp_type2,trans,sep=",")
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position>mapping_position_min)
    plot_title <- paste(plot_title,"min map pos: ",mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position<mapping_position_max)
    plot_title <- paste(plot_title,"max map pos: ",mapping_position_max)
  }
  #if project is specified - used for filtering
  
  if (length(project)>0 & !is.na(project)) {
    test_trans <- test_trans %>% dplyr::filter(project_name %in% project)
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% dplyr::filter(condition %in% conditions)
    plot_title <- paste(plot_title,paste(conditions,collapse=", "),sep="; ")
  }
  test_trans$condition <- as.character(test_trans$condition)
  test_trans$condition <- as.factor(test_trans$condition)
  
  #create plot
  plot_out <- ggplot(test_trans,aes(x=mapping_position,group=condition,colour=condition)) 
  plot_out <- plot_out + stat_bin(binwidth=1,aes(x=mapping_position,y=..ncount..),geom="line") 
  plot_out <- plot_out +  ggtitle(plot_title)
  plot_out <- plot_out + xlab("mapping position")
  plot_out <- plot_out + ylab("count")
  #create facets (if specified)
  if (facet_projects==TRUE) {
    plot_out <- plot_out + facet_grid (project_name ~ .)
  }
  #if not faceting by projects, facet by replicates (if specified)
  else if (facet_replicates==TRUE) {
    plot_out <- plot_out + facet_grid (replicate ~ .)
  }
  #print(plot_out)
  return(plot_out)
}



#' Plot tail length disribution
#'
#' @param dataset               - input dataset
#' @param trans                 - transcript to analyze
#' @param exp_type2             - experiment type (OVR,KD,LEAP,NT)
#' @param conditions            - conditions to include (vector)
#' @param mapping_position_min  - minimal mapping position to include
#' @param mapping_position_max  - maximal mapping position to include
#' @param facet_conditions      - facet by conditions
#' @param facet_replicates      - facet by replicates
#' @param max_tail_length       - max tail length to include
#' @param min_tail_length       - min tail length to include
#' @param facet_tail_types      - facet by tail types
#' @param AUtail                - specify A or U tail part only ('Atail' or 'Utail', other values ignored)
#'
#' @return                      - plot with tail lengths distribution
#' @export
#'
#' @examples
plot_tail_length_distribution <- function(dataset,trans,exp_type2,conditions = NA,mapping_position_min = NA,mapping_position_max = NA,facet_conditions = FALSE,facet_replicates = FALSE,max_tail_length = NA, min_tail_length = NA, facet_tail_types = FALSE,AUtail = NA,binsize=1) {
  test_trans <- dataset %>% filter(transcript==trans)
  test_trans <- test_trans %>% filter(exp_type==exp_type2)
  
  plot_title <- trans
  if (!is.na(AUtail)) {
    if (AUtail == 'Atail') {
      test_trans <- test_trans %>% mutate(tail_length = Atail_length)
      plot_title <- paste(plot_title,"A tail length only",sep=",")
    } else if (AUtail =='Utail') {
      test_trans <- test_trans %>% mutate(tail_length = Utail_length)
      plot_title <- paste(plot_title,"U tail length only",sep=",")
    }
  }
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% filter(mapping_position>mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% filter(mapping_position<mapping_position_max)
  }
  if (!is.na(min_tail_length)) {
    test_trans <- test_trans %>% filter(tail_length>min_tail_length)
  }
  if (!is.na(max_tail_length)) {
    test_trans <- test_trans %>% filter(tail_length<max_tail_length)
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% filter(condition %in% conditions)
  }

  test_trans$condition <- as.character(test_trans$condition)
  test_trans$condition <- as.factor(test_trans$condition)
  test_trans$tail_type <- as.character(test_trans$tail_type)
  test_trans$tail_type <- as.factor(test_trans$tail_type)
  

  
  plot_out <- ggplot(test_trans,aes(x=tail_length,group=tail_type,colour=tail_type)) 
  plot_out <- plot_out + stat_bin(binwidth=binsize,aes(x=tail_length,y=..ncount..,colour=tail_type),geom="line") 
  plot_out <- plot_out +  ggtitle(plot_title)
  #create facets 
  if (facet_conditions==TRUE) {
    plot_out <- plot_out + facet_grid (condition ~ .)
  }
  else if (facet_replicates==TRUE) {
    plot_out <- plot_out + facet_grid (replicate ~ .)
  }
  else if (facet_tail_types==TRUE) {
    plot_out <- plot_out + facet_grid (tail_type ~ .)
  }
 # print(plot_out)
  return(plot_out)
}


#' Plot tail length disribution 2
#'
#' @param dataset               - input dataset
#' @param trans                 - transcript to analyze
#' @param exp_type2             - experiment type (OVR,KD,LEAP,NT)
#' @param conditions            - conditions to include (vector)
#' @param mapping_position_min  - minimal mapping position to include
#' @param mapping_position_max  - maximal mapping position to include
#' @param facet_conditions      - facet by conditions
#' @param facet_replicates      - facet by replicates
#' @param max_tail_length       - max tail length to include
#' @param min_tail_length       - min tail length to include
#' @param facet_tail_types      - facet by tail types
#' @param AUtail                - specify A or U tail part only ('Atail' or 'Utail', other values ignored)
#'
#' @return                      - plot with tail lengths distribution
#' @export
#'
#' @examples
plot_tail_length_distribution2 <- function(dataset,trans,exp_type2,conditions = NA,mapping_position_min = NA,mapping_position_max = NA,facet_conditions = FALSE,facet_replicates = FALSE,max_tail_length = NA, min_tail_length = NA, facet_tail_types = FALSE,AUtail = NA,binsize=1) {
  test_trans <- dataset %>% filter(transcript==trans)
  test_trans <- test_trans %>% filter(exp_type==exp_type2)
  
  plot_title <- trans
  if (!is.na(AUtail)) {
    if (AUtail == 'Atail') {
      test_trans <- test_trans %>% mutate(tail_length = Atail_length)
      plot_title <- paste(plot_title,"A tail length only",sep=",")
    } else if (AUtail =='Utail') {
      test_trans <- test_trans %>% mutate(tail_length = Utail_length)
      plot_title <- paste(plot_title,"U tail length only",sep=",")
    }
  }
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% filter(mapping_position>mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% filter(mapping_position<mapping_position_max)
  }
  if (!is.na(min_tail_length)) {
    test_trans <- test_trans %>% filter(tail_length>min_tail_length)
  }
  if (!is.na(max_tail_length)) {
    test_trans <- test_trans %>% filter(tail_length<max_tail_length)
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% filter(condition %in% conditions,tail_length>0)
  }
  
  test_trans$condition <- as.character(test_trans$condition)
  test_trans$condition <- as.factor(test_trans$condition)
  test_trans$tail_type <- as.character(test_trans$tail_type)
  test_trans$tail_type <- as.factor(test_trans$tail_type)
  
  
  
  plot_out <- ggplot(test_trans,aes(x=tail_length,group=condition,colour=condition)) 
  plot_out <- plot_out + stat_bin(binwidth=binsize,aes(x=tail_length,y=..count..,colour=condition),geom="line") 
  plot_out <- plot_out +  ggtitle(plot_title)
  #create facets 
  if (facet_conditions==TRUE) {
    plot_out <- plot_out + facet_grid (condition ~ .)
  }
  else if (facet_replicates==TRUE) {
    plot_out <- plot_out + facet_grid (replicate ~ .)
  }
  else if (facet_tail_types==TRUE) {
    plot_out <- plot_out + facet_grid (tail_type ~ .)
  }
  # print(plot_out)
  return(plot_out)
}



#' Plot tail length disribution 2
#'
#' @param dataset               - input dataset
#' @param trans                 - transcript to analyze
#' @param exp_type2             - experiment type (OVR,KD,LEAP,NT)
#' @param conditions            - conditions to include (vector)
#' @param mapping_position_min  - minimal mapping position to include
#' @param mapping_position_max  - maximal mapping position to include
#' @param AUtail                - specify A or U tail part only ('Atail' or 'Utail', other values ignored)
#' @param facet_projects 
#' @param uridylated_only 
#' @param include_jitter 
#' @param project 
#'
#' @return                      - plot with tail lengths distribution
#' @export
#'
#' @examples
plot_mean_tail_length <- function(dataset,trans,exp_type2,conditions = NA,mapping_position_min = NA,mapping_position_max = NA,facet_projects = FALSE,AUtail = NA,uridylated_only = FALSE,include_jitter = TRUE,project = NA) {
  test_trans <- dataset %>% filter(transcript==trans)
  test_trans <- test_trans %>% filter(exp_type==exp_type2)
  
  plot_title <- trans
  if (!is.na(AUtail)) {
    if (AUtail == 'Atail') {
      test_trans <- test_trans %>% mutate(tail_length = Atail_length)
      plot_title <- paste(plot_title,"A tail length only",sep=",")
    } else if (AUtail =='Utail') {
      test_trans <- test_trans %>% mutate(tail_length = Utail_length)
      plot_title <- paste(plot_title,"U tail length only",sep=",")
    }
  }
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% filter(mapping_position>mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% filter(mapping_position<mapping_position_max)
  }
  #if project is specified - used for filtering
  if(!is.na(project)) {
    if (length(project==1)) {
      test_trans <- test_trans %>% dplyr::filter(project_name==project)
    } else {
      test_trans <- test_trans %>% dplyr::filter(project_name %in% project)
    }
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% filter(condition %in% conditions,tail_length>0)
  }
  
  
  if(uridylated_only) {
    test_trans <- test_trans %>% filter(uridylated2==TRUE)
    
  }
  test_trans$condition <- as.character(test_trans$condition)
  test_trans$condition <- as.factor(test_trans$condition)
  test_trans$tail_type <- as.character(test_trans$tail_type)
  test_trans$tail_type <- as.factor(test_trans$tail_type)
  
  
  test_trans <- test_trans %>% dplyr::group_by(condition,replicate,project_name)
  #print(test_trans)
  #summarize uridylation data using dplyr
  if (facet_projects == FALSE) {
    test_trans <- test_trans %>% dplyr::summarize(mean_tail_length_rep=mean(tail_length)) %>% ungroup() %>% dplyr::group_by(condition) %>% dplyr::mutate(mean_tail_length = mean(mean_tail_length_rep),sd_tail_length = sd(mean_tail_length_rep)) 
  }
  else {
    #if using facets - group by projects also
    test_trans <- test_trans %>% dplyr::summarize(mean_tail_length_rep=mean(tail_length)) %>% ungroup() %>% dplyr::group_by(condition) %>% dplyr::mutate(mean_tail_length = mean(mean_tail_length_rep),sd_tail_length = sd(mean_tail_length_rep)) 
  }
  
  
  plot_out <- test_trans %>% ggplot(aes(x=condition)) + geom_bar(stat="identity",position="dodge",aes(y=mean_tail_length))
  plot_out <- plot_out  + geom_errorbar(aes(ymin =  mean_tail_length - sd_tail_length, ymax = mean_tail_length + sd_tail_length),colour = "black", width = 0.1, position = position_dodge(0.9)) 
  plot_out <- plot_out + ggtitle(plot_title)
  plot_out <- plot_out + xlab("condition") + ylab("fraction of transcripts")
  if (include_jitter==TRUE) {
    plot_out <- plot_out + geom_jitter(aes(y=mean_tail_length_rep))
  }
  #create facets 
  if (facet_projects==TRUE) {
    plot_out <- plot_out + facet_grid (project_name ~ .)
  }
  
  # print(plot_out)
  return(plot_out)
}



#' analyze uridylation with more faceting options
#'
#' @param dataset 
#' @param transcript2 
#' @param exp_type2 
#' @param mapping_position_min 
#' @param mapping_position_max 
#' @param project 
#' @param facet_projects 
#' @param conditions 
#' @param include_jitter 
#' @param primer_name 
#' @param cell_line 
#'
#' @return
#' @export
#'
#' @examples
analyze_uridylation_fac <- function(dataset,transcript2,exp_type2,mapping_position_min=NA,mapping_position_max=NA,project=NA,facet_projects=FALSE,conditions=NA,include_jitter=TRUE,primer_name=NA,cell_line=NA,primers=NA,cell_lines=NA) {
  
  
  output = list() #list for storing output
  #print(mapping_position_min)
  # print(mapping_position_max)
  #print(transcript2)
  #print(exp_type2)
  #get filtered data from input dataset
  #first, filter by transcript and experiment type (OVR,KD,LEAP)
  test_trans <- dataset %>% dplyr::filter(transcript==transcript2,exp_type==exp_type2) 
  plot_title <- paste(exp_type2,transcript2,"uridylation")
  #if min and max mapping positions are specified - used for filtering
  if (!is.na(mapping_position_min)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position>mapping_position_min)
    plot_title <- paste(plot_title,"min map pos: ",mapping_position_min)
  }
  if (!is.na(mapping_position_max)) {
    test_trans <- test_trans %>% dplyr::filter(mapping_position<mapping_position_max)
    plot_title <- paste(plot_title,"max map pos: ",mapping_position_max)
  }
  #if project is specified - used for filtering
  if(!is.na(project)) {
    if (length(project==1)) {
      test_trans <- test_trans %>% dplyr::filter(project_name==project)
    } else {
      test_trans <- test_trans %>% dplyr::filter(project_name %in% project)
    }
  }
  #if conditions are specified - use for filtering
  if(length(conditions)>0 & !is.na(conditions)) {
    test_trans <- test_trans %>% dplyr::filter(condition %in% conditions)
  }
  if(length(primers)>0 & !is.na(primers)) {
    test_trans <- test_trans %>% dplyr::filter(primer_name %in% primers)
  }
  if(length(cell_lines)>0 & !is.na(cell_lines)) {
    test_trans <- test_trans %>% dplyr::filter(cell_line %in% cell_lines)
  }
  #print(test_trans)
  #group data by co
  test_trans <- test_trans %>% dplyr::group_by(condition,replicate,project_name,uridylated2)
  #print(test_trans)
  #summarize uridylation data using dplyr
  if (facet_projects == TRUE) {
     test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2,project_name) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  } else 
  {
    test_trans <- test_trans %>% dplyr::summarize(n_urid=n()) %>% ungroup() %>% dplyr::group_by(condition,replicate,project_name) %>% dplyr::mutate(freq_urid = n_urid/sum(n_urid)) %>% dplyr::group_by(condition,uridylated2) %>% dplyr::mutate(mean_freq_urid = mean(freq_urid), sd_urid = sd(freq_urid))
  }
  #print(test_trans)
  #leave only uridylation values (exclude uridylation == FALSE)
  test_trans <- test_trans %>% dplyr::filter(uridylated2==TRUE) 
  #store calculated values
  output$calculated_values = test_trans
  #select data for plot
  #test_trans %>% ungroup() %>% select(replicate,freq_urid,project_name,condition) 
  #create plot
  plot_out <- test_trans %>% ggplot(aes(x=condition)) + geom_bar(stat="identity",position="dodge",aes(y=mean_freq_urid))
  plot_out <- plot_out  + geom_errorbar(aes(ymin =  mean_freq_urid - sd_urid, ymax = mean_freq_urid + sd_urid),colour = "black", width = 0.1, position = position_dodge(0.9)) 
  plot_out <- plot_out + ggtitle(plot_title)
  plot_out <- plot_out + xlab("condition") + ylab("fraction of transcripts")
  if (include_jitter==TRUE) {
    plot_out <- plot_out + geom_jitter(aes(y=freq_urid))
  }
  
  #create facets 
  if (facet_projects==TRUE) {
    plot_out <- plot_out + facet_grid (project_name ~ .)
  }
  #perform dunn test (one to many, first condition is used as control)
  output$dunn_test <- list()
  #if using facets - perform dunn test on each project independently
  if (facet_projects==TRUE) {
    dunn_test_pval2<-c() #vector for storing dunn test pvalues 
    for (proj in levels(test_trans$project_name)) {
      #iterate tgrough projects
      test_proj <- test_trans %>% dplyr::filter(project_name==proj) #filter data for single project
      if(nrow(test_proj)>0) { 
        #if there are any data after filtering, calculate Dunn's test
        #print(proj)
        #calculate Dunn using kwManyOneDunnTest from PCMCRPlus package
        dunn_test <- kwManyOneDunnTest(test_proj$freq_urid,test_proj$condition,p.adjust.method = "BH")
        # print(dunn_test)
        #store Dunn test results in a list
        dunn_output<-dunn_test$p.value
        colnames(dunn_output)<-c("p.value (Dunn test)")
        output$dunn_test[[proj]] <- dunn_output
        dunn_test_pval <- as.data.frame(dunn_test$p.value) # get pvalues
        #dunn_test_pval2 <- c(0,dunn_test_pval[,1]) 
        dunn_test_pval2<-c(dunn_test_pval2,dunn_test_pval[,1]) #store pvalues
      }
    }
  } else #if no facets, calculate Dunn for all projects together
  {
    dunn_test <- kwManyOneDunnTest(test_trans$freq_urid,test_trans$condition,p.adjust.method = "BH")
    #  print(dunn_test)
    dunn_output<-dunn_test$p.value
    colnames(dunn_output)<-c("p.value (Dunn test)")
    output$dunn_test <- dunn_output
    
    dunn_test_pval <- as.data.frame(dunn_test$p.value)
    dunn_test_pval2 <- dunn_test_pval[,1] #store pvalues
  }
  
  #get plot data to build info about significance
  pg <- ggplot_build(plot_out)
  #get max position of error bars from plot data (pg$data[2]), group by PANEL (facets) and group (condition)
  plot_data<-as.data.frame(pg$data[2]) %>% dplyr::group_by(group,PANEL) %>% summarise(max_pos = max(ymax)) %>% ungroup() %>% mutate(max_pos2=max(max_pos)) %>% mutate(y=max_pos2+0.05 + max_pos2*(0.25*group)) %>% dplyr::filter(group!=1) %>% arrange(PANEL,group)
  #  print(plot_data)
  #create_annotation
  if (facet_projects==TRUE) 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(project_name,condition) %>% summarise(replicate=mean(replicate)) %>% arrange(project_name,condition) %>% slice(-1)
  } else 
  {
    annotation <- test_trans %>% dplyr::filter(uridylated2==TRUE) %>% dplyr::group_by(condition) %>%summarise(replicate=mean(replicate)) %>% slice(-1)
  }
  #calculate ylim
  ylim_value = max(plot_data$y) +0.15 * max(plot_data$y)
  plot_out = plot_out + expand_limits(y=c(0,ylim_value))
  #get position of significance marks from plot data
  annotation$y=plot_data$y
  #store pvalues from Dunn's test
  annotation$pvalue=dunn_test_pval2
  #store start position for significance bars - always first condition - control
  annotation$start=test_trans$condition[1]
  #create significance asterisks:
  annotation$sig=''
  if(any(annotation$pvalue<=0.05)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "*"
  }
  if(any(annotation$pvalue<=0.01)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "**"
  }
  if(any(annotation$pvalue<=0.001)) {
    annotation[annotation$pvalue<=0.05,]$sig <- "***"
  }
  #exclude from annotation all conditions without signficance
  annotation <- annotation[annotation$sig!='',]
  # print(annotation)
  #if there atre any significant differences
  if (nrow(annotation)>0) {
    plot_out <- plot_out + geom_signif(data=annotation,aes(xmin=start, xmax=condition, annotations=sig, y_position=y),textsize = 10, vjust = 0.6,manual=TRUE)
  }
  
  #print(plot_out) #print plot
  output$plot <- plot_out #store plot in output
  return(output) #return output
  
}





#analyze_uridylation()
plot_tail_length_distribution(all_data,"LEAP_AU","LEAP",conditions=c("19A","19A3U"),max_tail_length = 30, min_tail_length=10,facet_conditions = T,AUtail = 'Ataila')

plot_tail_length_distribution(all_data,"LEAP_AU","LEAP",conditions=c("26A","26A2U","26A4U","26A6U","26A14U"),max_tail_length = 42, min_tail_length=20,facet_conditions = T,AUtail = 'Ataila',binsize = 2
                            )


plot_data <- LEAP_data %>% filter(Utail_length>0,condition %in% cond,(tail_type=='AU' & condition!='26A') | (tail_type=='A_only' & condition=='26A')) %>% group_by(condition,Utail_length) %>% count() %>% group_by(condition) %>% mutate(all_reads = sum(n)) %>% mutate(fraction=n/all_reads)  %>% mutate(fraction2=fraction/max(fraction))
smaegol/raceseq_scripts documentation built on May 28, 2019, 5:40 p.m.