# R/getFundsDescription.R

#' Get funds description scraped from manapensija.lv
#'
#' Get fund name, pillar and type (active/balanced/conservative). But unlike
#' \code{\link{getFundsSummary}}, the information comes from the web-page and
#' not from the data itself.
#'
#' The 2nd pillar statistics page is downloaded and parsed on every call.
#' Rows are ordered by the one- or two-digit number found at the end of each
#' fund's link; that number is used for sorting only and is not returned.
#'
#' @export
#' @import dplyr XML
#'
#' @return Data frame with one row per fund and the columns \code{name}
#'   (factor), \code{pillar} (numeric, always 2) and \code{type} (factor).
#'   If the page contains no fund tables, a zero-row data frame with the same
#'   columns is returned.
getFundsDescription <- function() {
  # Text of a node as exactly one string (NA when the node yields none), so
  # that the vapply() calls below always get one value per node.
  nodeText <- function(node) {
    value <- XML::xmlValue(node)
    if (length(value) == 0L) NA_character_ else value[[1L]]
  }

  # href of the first <a> directly inside a table cell, NA when there is none.
  # Asking for "href" by name avoids depending on the attribute order and on
  # how sapply() would simplify the result of xmlAttrs().
  cellHref <- function(cell) {
    links <- XML::getNodeSet(cell, "a")
    if (length(links) == 0L) {
      return(NA_character_)
    }
    XML::xmlGetAttr(links[[1L]], "href", default = NA_character_)
  }

  # Trailing one- or two-digit number of each URL as integer, NA if absent.
  trailingIndex <- function(urls) {
    indices <- rep(NA_integer_, length(urls))
    matches <- regexpr("[0-9][0-9]?$", urls)
    found <- !is.na(matches) & matches > -1L
    # regmatches() returns only the matched elements, in order
    indices[found] <- as.integer(regmatches(urls, matches))
    indices
  }

  # download html data
  pillar2Url <-
    "http://www.manapensija.lv/en/2nd-pension-pillar/statistics/"
  docPillar2 <- XML::htmlTreeParse(pillar2Url, useInternalNodes = TRUE)
  rootNode <- XML::xmlRoot(docPillar2)

  # get type of funds (active/balanced/conservative)
  fundTypes <- vapply(
    XML::getNodeSet(
      rootNode,
      "//tbody[@class='tablesorter-infoOnly']//div[@class='chapter']"
    ),
    nodeText,
    character(1),
    USE.NAMES = FALSE
  )

  # get nodes that contain funds of each type; the i-th node is paired with
  # the i-th type heading
  fundTypeNodes <- XML::getNodeSet(rootNode, "//tbody[@class='data']")

  # build one data frame per fund type
  fundsByType <- lapply(seq_along(fundTypeNodes), function(fundTypeIndex) {
    # one cell per fund: name and link are read from the same cell so they
    # cannot get out of step with each other
    planCells <- XML::getNodeSet(
      fundTypeNodes[[fundTypeIndex]],
      "tr/td[@class='plan']"
    )
    nFunds <- length(planCells)
    fundUrls <- vapply(planCells, cellHref, character(1), USE.NAMES = FALSE)

    data.frame(
      index = trailingIndex(fundUrls),
      name = vapply(planCells, nodeText, character(1), USE.NAMES = FALSE),
      pillar = rep(2, nFunds),
      # NA when there are more fund tables than type headings
      type = rep(fundTypes[fundTypeIndex], nFunds),
      stringsAsFactors = FALSE
    )
  })

  # Zero-row template with the final column names: rbind() ignores it when
  # real rows exist and returns it unchanged when nothing was found.
  emptyFunds <- data.frame(
    index = integer(),
    name = character(),
    pillar = numeric(),
    type = character(),
    stringsAsFactors = FALSE
  )
  fundData <- do.call(rbind, c(list(emptyFunds), fundsByType))

  # change some columns to factors
  fundData$name <- as.factor(fundData$name)
  fundData$type <- as.factor(fundData$type)

  # sort on fund index, then drop the index from the result
  fundData <- dplyr::arrange(fundData, index)

  dplyr::select(fundData, name, pillar, type)
}
# nickto/PensionFundsLv documentation built on May 23, 2019, 5:08 p.m.