R/read.pdb.R
In protein8k: Perform Analysis and Create Visualizations of Proteins

Documented in read.pdb

#'read.pdb
#'
#'@description Read in a Protein Data Bank file
#'
#'@usage read.pdb(fileName, createAsS4 = TRUE)
#'
#'@param fileName character string for location and name of file to be read.
#'
#'@param createAsS4 Logical indicating whether to create the new protein object
#'  as S4 or not. Defaults to TRUE if not specified. This argument is optional.
#'
#'@details Reads a Protein Data Bank file (PDB) from the given location. The
#'  function then parses the file and creates a new object of the Protein class.
#'  This object can be either defined as an S3 or S4 object if different
#'  capabilities are required.
#'
#'@returns A new protein object as either an S3 or S4 object.
#'
#'@returns In general terms, the new object will be a list of two, a data frame
#'  containing the atomic record, and a list of header elements.
#'
#'@format A Protein object. List comprised of several sublists and dataframes
#'\itemize{
#'  \item{header: }{List of 2, Header Line and Title \itemize{
#'    \item{header_line: }{List of 3, Classification, depDate, and idCode \itemize{
#'      \item{classifiation: }{Classification of the Protein in the PDB}
#'      \item{depDat: }{Date the PDB was deposited or created}
#'      \item{idCode: }{4 digit identifier for the PDB. Always unique.}
#'      }}
#'    \item{title: }{The title of the PDB.}
#'    }}
#'  \item{structure: }{Dataframe of 16 variables \enumerate{
#'    \item{record_type:}{Type of record in this section. Generally ATOM or HETATM}
#'    \item{serial_num: }{The serial number for the position of the atom in the sequence}
#'    \item{atom_name: }{A name to identify the atom in a structure}
#'    \item{alt_location_id: }{}
#'    \item{residue_name: }{3 character identifier for a residue}
#'    \item{chain_id: }{}
#'    \item{residue_seq_num: }{Number representing where in the sequence a residue is. }
#'    \item{insert_residue_code: }{}
#'    \item{x_ortho_coord: }{X coordinate in Ångstroms on an orthogonal plane}
#'    \item{y_ortho_coord: }{Y coordinate in Ångstroms on an orthogonal plane}
#'    \item{z_ortho_coord: }{Z coordinate in Ångstroms on an orthogonal plane}
#'    \item{occupancy: }{}
#'    \item{temp_factor: }{The amount of overall error in the measurement of an atom.}
#'    \item{segment_id: }{}
#'    \item{element_symbol: }{Periodic symbol representing an atom.}
#'    \item{charge: }{Charge of the given atom. Can be +, -, or none at all}
#'  }}
#'}
#'
#'@export read.pdb
#'@import methods

# Read a raw pdb file into a protein class representation
#TODO: Add more fault checking.
read.pdb <- function(fileName, createAsS4 = TRUE) {
  #Read file, seperate by line
  #fileName = "1aieH" #Only for development purposes
  file <- scan(file = fileName, what = "character", sep = "\n")

  #TODO: Ensure read file matches PDB format, if not throw error

  # Cleaning the PDB file
  # What data do we want to grab?
  #   Header contains unstructured meta data
  #   Body of data contains atom type and position data

  # Seperate the header and body ###############################################
  # Find first index of each tag in file body
  #bodyTagFirstIndices <- c(which(startsWith(file, "ATOM"))[1],
  #                         which(startsWith(file, "TER"))[1],
  #                         which(startsWith(file, "HETATM"))[1])

  #Find the first index among all tags
  #beginBodyIndex <- min(bodyTagFirstIndices)

  #Find END tag
  #endTag <- which(startsWith(file, "END"))

  #Grab each section of the file for individual parsing
  #File Body
  #coord_sec <- file[beginBodyIndex:(endTag - 1)] #Commented out to test below

  #Pull just records for the Title Section
  title_sec <- file[c(which(startsWith(file, "HEADER")),
                      which(startsWith(file, "OBSLTE")),
                      which(startsWith(file, "TITLE ")),
                      which(startsWith(file, "SPLIT")),
                      which(startsWith(file, "CAVEAT")),
                      which(startsWith(file, "COMPND")),
                      which(startsWith(file, "SOURCE")),
                      which(startsWith(file, "KEYWDS")),
                      which(startsWith(file, "EXPDTA")),
                      which(startsWith(file, "NUMMDL")),
                      which(startsWith(file, "MDLTYP")),
                      which(startsWith(file, "AUTHOR")),
                      which(startsWith(file, "REVDAT")),
                      which(startsWith(file, "SPRSDE")),
                      which(startsWith(file, "REMARK")))]

  #Pull just those records that contain Coordinate Section records
  coord_sec <- file[c(which(startsWith(file, "ATOM  ")),
                      which(startsWith(file, "HETATM")),
                      which(startsWith(file, "ANISOU")),
                      which(startsWith(file, "TER   ")))]

  # Parse each part of the file ################################################
  # Parse File Header
  pdb_header <- pdb.parseHeader(title_sec)

  # Parse file body
  protein_structure <- pdb.parseCoordinates(coord_sec)

  # Return all associated pieces as single new protein object

  if(createAsS4 == TRUE) {
    #S4 object creation
    protein <- new("Protein", header = pdb_header,
                   structure = protein_structure)
  } else {
    #S3 object creation
    protein <- Protein(pdb_header, protein_structure)
  }

  return(protein)
}

#Parsing functions #############################################################

# Creates a new data frame representing molecule sequence, position, and names,
#   along with other necessary data.
# TODO: Add documentation regarding columns and return data
pdb.parseCoordinates <- function(fileBody) {
  # This block works for ATOM, HETATM, and TER. TER does not break the code, but
  #   does introduce NA's in most columns since it does not fill all fields
  #Subset for each column, convert to correct type if necessary
  record_type <- substr(fileBody, 1, 6)

  #Parse ATOM and HETATM Records like so
  if(record_type == "ATOM  " || record_type == "HETATM") {
    serial_num <- as.integer(substr(fileBody, 7, 11))
    atom_name <- substr(fileBody, 13, 16)
    alt_location_id <- substr(fileBody, 17, 17)
    residue_name <- substr(fileBody, 18, 20)
    chain_id <- substr(fileBody, 22, 22)
    residue_seq_num <- as.integer(substr(fileBody, 23, 26))
    insert_residue_code <- substr(fileBody, 27, 27)
    x_ortho_coord <- as.numeric(substr(fileBody, 31, 38))
    y_ortho_coord <- as.numeric(substr(fileBody, 39, 46))
    z_ortho_coord <- as.numeric(substr(fileBody, 47, 54))
    occupancy <- as.numeric(substr(fileBody, 55, 60))
    temp_factor <- as.numeric(substr(fileBody, 61, 66))
    segment_id <- substr(fileBody, 73, 76)
    element_symbol <- substr(fileBody, 77, 78)
    charge <- as.factor(substr(fileBody, 79, 80))

    protein_structure <- data.frame(record_type, serial_num, atom_name, residue_name,
                                    alt_location_id, chain_id, residue_seq_num,
                                    insert_residue_code, x_ortho_coord, y_ortho_coord,
                                    z_ortho_coord, occupancy, temp_factor, segment_id,
                                    element_symbol, charge)
  }

  #Parse ANISOU records separately
  if(record_type == "ANISOU") {
    serial_num <- as.integer(substr(fileBody, 7, 11))
    atom_name <- substr(fileBody, 13, 16)
    alt_location_id <- substr(fileBody, 17, 17)
    residue_name <- substr(fileBody, 18, 20)
    chain_id <- substr(fileBody, 22, 22)
    residue_seq_num <- as.integer(substr(fileBody, 23, 26))
    insert_residue_code <- substr(fileBody, 27, 27)
    U1_1 <- as.numeric(substr(fileBody, 29, 35))
    U2_2 <- as.numeric(substr(fileBody, 36, 42))
    U3_3 <- as.numeric(substr(fileBody, 43, 49))
    U1_2 <- as.numeric(substr(fileBody, 50, 56))
    U1_3 <- as.numeric(substr(fileBody, 57, 63))
    U2_3 <- as.numeric(substr(fileBody, 64, 70))
    element_symbol <- substr(fileBody, 77, 78)
    charge <- as.factor(substr(fileBody, 79, 80))

    anisou_temp_factors <- data.frame(record_type, serial_num, atom_name,
                                      alt_location_id, residue_name, chain_id,
                                      residue_seq_num, insert_residue_code, U1_1,
                                      U2_2, U3_3, U1_2, U1_3, U2_3, element_symbol,
                                      charge)

    #Join the coordinate data with ANISOU Records
    dplyr::left_join(protein_structure, anisou_temp_factors, by = serial_num)
  }

  attr(protein_structure, "Protein_Structure")

  return(protein_structure)
}

# TODO: Fill with code
# Separate file header into parts and store as list maybe?
pdb.parseHeader <- function(fileHeader) {
  #Header Line #################################################################
  header_line_raw <- fileHeader[which(startsWith(fileHeader, "HEADER"))]
  #Separate header line components
  #   classification field
  classification_hdr <- substr(header_line_raw, 11, 50)
  #   Deposition date
  date_hdr_string <- substr(header_line_raw, 51, 59)
  date_hdr <- as.Date(date_hdr_string, "%d-%b-%y") #Format is dd-MMM-yy
  #   ID Code
  id_code <- substr(header_line_raw, 63, 66)
  # Combine header line as list
  header_line <- list(classification_hdr, date_hdr, id_code)
  names(header_line) <- c("classification", "depDate", "idCode")

  #OBSLTE RECORDS ##############################################################
  # TODO: Need to complete
  #TITLE Records ###############################################################
  title <- substr(fileHeader[which(startsWith(fileHeader, "TITLE"))], 11, 80)
  title <- gsub("\n", "", title)

  #SPLIT Records ###############################################################
  #TODO: Need to complete

  #CAVEAT Records ##############################################################
  #TODO: Need to complete

  #COMPND Records ##############################################################
  #TODO: Need to complete

  #SOURCE Records ##############################################################
  #TODO: Need to complete

  #KEYWDS Records ##############################################################
  #TODO: Need to complete

  #EXPDTA Records ##############################################################
  #TODO: Need to complete

  #NUMMDL Records ##############################################################
  #TODO: Need to complete

  #MDLTYP Records ##############################################################
  #TODO: Need to complete

  #AUTHOR Records ##############################################################
  #TODO: Need to complete

  #REVDAT Records ##############################################################
  #TODO: Need to complete

  #SPRSDE Records ##############################################################
  #TODO: Need to complete

  #JRNL Records ################################################################
  #TODO: Need to complete

  #REMARK Records ##############################################################
  #TODO: Need to complete


  #Create the list of header components ########################################
  header <- list(header_line, title)
  names(header) <- c("header_line", "title")

  return(header)
}