# R/prep_info_functions.R

# Defines functions general_info

# Documented in general_info

######
#' Provides summary table for all data sets.
#'
#' Returns a data frame summarizing general information about the data sets.
#' For every protein state the table lists: time points, number of replicates,
#' number of peptides, % coverage, average peptide length and redundancy.
#'
#' @param filepath filepath to the input file. Input file is All_results table from HDX_Examiner, where all the fields are marked for export.
#' @return Data frame with one row per protein state and the columns
#' "Protein.State", "Timepoints", "# replicates", "# peptides",
#' "peptide coverage %", "<Peptide Length>" and "<Redundancy>". The table is
#' assembled from a character matrix, so all values are stored as text.
#' Coverage, length and redundancy are NA for a state in which no peptide is
#' shared by all merged replicates and time points.
#' @examples
#' file_nm<-system.file("extdata", "All_results_table.csv", package = "HDXBoxeR")
#' a<- general_info(file_nm)
#' @export
general_info <- function(filepath) {

  states <- arguments_call1(filepath)
  times <- arguments_call2(filepath, states)
  replicates <- arguments_call3(filepath, states, times)

  a <- arg_df(filepath)
  rownames(a) <- seq_len(nrow(a)) ## name rows

  ## columns that identify a peptide; every merge below joins on these
  key_cols <- c("Protein.State", "Start", "End", "Sequence", "Search.RT",
                "Charge")
  merge_on_keys <- function(frames) {
    Reduce(function(x, y) merge(x, y, by = key_cols), frames)
  }

  ## one character vector of 7 entries per protein state
  rows <- list()

  for (state in states) {
    state_df <- a[which(a$Protein.State == state), ]
    state_times <- unique(state_df$Deut.Time)

    timepoints <- paste(as.vector(state_times), collapse = " ")
    n_repl <- length(unique(state_df$Experiment)) / length(state_times)

    ## wide-format table per time point: replicates are merged side by side
    time_frames <- list()
    for (time in state_times) {
      ## rows of this state measured at this time point
      time_df <- state_df[which(state_df$Deut.Time == time), ]

      rep_frames <- list()
      nb <- 0
      for (exp in unique(time_df$Experiment)[1:replicates]) {
        nb <- nb + 1
        rep_df <- time_df[which(time_df$Experiment == exp), ]
        n_tmp <- names(rep_df)
        ## tag the non-key columns with time point and replicate number so
        ## that they stay distinct after merging
        colnames(rep_df) <- c(
          n_tmp[1],
          paste0("t", time, n_tmp[2], "_", nb),
          paste0("t", time, n_tmp[3], "_", nb),
          n_tmp[4:8],
          paste0("t", time, "_", n_tmp[9], "_", nb),
          paste0("t", time, "_", n_tmp[10], "_", nb)
        )
        rep_frames[[nb]] <- rep_df
      }
      ## merge all replicates of this time point once, after collecting them
      time_frames[[length(time_frames) + 1]] <- merge_on_keys(rep_frames)
    }

    ## merge all time points; only peptides present in every table remain
    state_wide <- merge_on_keys(time_frames)
    n_pept <- if (is.null(state_wide)) 0L else nrow(state_wide)

    if (n_pept == 0) {
      ## nothing left to compute coverage on; report missing values instead
      ## of failing inside min()/max() or the `:` operator
      stats <- c(NA, NA, NA)
    } else {
      ## after the merge the key columns come first: column 2 is Start,
      ## column 3 is End
      cov_all <- coverage_residue(state_wide, start_col = 2, end_col = 3)
      cov_span <- cov_all[min(state_wide$Start):max(state_wide$End)]
      pct_cov <- round(
        (length(cov_span) - length(which(cov_span == 0))) /
          length(cov_span) * 100,
        digits = 2
      )

      ## NOTE(review): End - Start is one less than the inclusive residue
      ## count used as the denominator of the redundancy below; confirm
      ## which definition of peptide length is intended.
      avg_len <- round(mean(state_wide$End - state_wide$Start), digits = 2)

      pep_start <- state_wide[[2]]
      pep_end <- state_wide[[3]]
      ## per peptide: mean of the coverage values over its residues
      redund <- vapply(
        seq_len(n_pept),
        function(i) {
          sum(cov_all[pep_start[i]:pep_end[i]]) /
            (pep_end[i] - pep_start[i] + 1)
        },
        numeric(1)
      )
      stats <- c(pct_cov, avg_len, round(mean(redund), digits = 2))
    }

    rows[[length(rows) + 1]] <- c(state, timepoints, n_repl, n_pept, stats)
  }

  ## fixed column count keeps the table well-formed regardless of how many
  ## states were processed
  cells <- as.character(unlist(rows))
  sum1 <- data.frame(matrix(cells, ncol = 7, byrow = TRUE))
  names(sum1) <- c("Protein.State", "Timepoints", "# replicates", "# peptides", "peptide coverage %", "<Peptide Length>", "<Redundancy>")
  sum1
}
# mkajano/HDXBoxeR documentation built on Aug. 15, 2024, 9:12 a.m.