hdd_walkthrough.R
In hdd: Easy Manipulation of Out of Memory Data Sets

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
	echo = FALSE,
	eval = TRUE,
	collapse = TRUE,
	comment = "#>"
)

## -----------------------------------------------------------------------------
# We conditionnaly run the code pieces
if(!file.exists("C:/Users/laurent.berge/DATA/MAG/Authors.txt")){
	message("This vignette takes the example of a data set of over 13GB on disk, and thus cannot be run non-locally.")
	knitr::opts_chunk$set(eval = FALSE)
}

## -----------------------------------------------------------------------------
#  suppressPackageStartupMessages(library(data.table))
#  library(hdd)

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  library(hdd)
#  peek("_path/authors.txt")

## ---- results='asis'----------------------------------------------------------
#  tab = head(peek("C:/Users/laurent.berge/DATA/MAG/Authors.txt", view = FALSE))
#  if("pdf_document" %in% rmarkdown::all_output_formats(knitr::current_input())){
#    # tab = as.data.table(lapply(tab, function(x) fplot:::truncate_string(iconv(x, to="ASCII", sub = "£"), method = "trimMid", trunc = 16)))
#    tab = as.data.table(lapply(tab, iconv, to="ASCII", sub = "£"))
#  }
#  knitr::kable(tab)
#  

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  col_names = c("AuthorId", "Rank", "NormalizedName", "DisplayName",
#  			  "LastKnownAffiliationId",
#  			  "PaperCount", "CitationCount", "CreatedDate")
#  txt2hdd("_path/authors.txt", # The text file
#  		# dirDest: The destination of the HDD data => must be a directory
#  		dirDest = "_path/hdd_authors",
#  		chunkMB = 500, col_names = col_names)

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  authors = hdd("_path/hdd_authors")
#  summary(authors)

## -----------------------------------------------------------------------------
#  authors = hdd("C:/Users/laurent.berge/DATA/MAG/HDD/authors")
#  summary(authors)

## ---- echo = TRUE-------------------------------------------------------------
#  head(authors)

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  
#  fun_ascii = function(x){
#  	# selection of the first 3 columns
#  	res = x[, 1:3]
#  	# selection of only ascii names
#  	res[!is.na(iconv(NormalizedName, to = "ASCII"))]
#  }
#  
#  col_names = c("AuthorId", "Rank", "NormalizedName", "DisplayName",
#  			  "LastKnownAffiliationId",
#  			  "PaperCount", "CitationCount", "CreatedDate")
#  txt2hdd("_path/authors.txt", dirDest = "_path/hdd_authors_ascii",
#  		chunkMB = 500, col_names = col_names,
#  		preprocessfun = fun_ascii)

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  authors_ascii = hdd("_path/hdd_authors_ascii")
#  head(authors_ascii)

## -----------------------------------------------------------------------------
#  authors_ascii = authors[1:50, 1:3]
#  authors_ascii = authors_ascii[!is.na(iconv(NormalizedName, to = "ASCII"))]
#  head(authors_ascii)

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  names_einstein = authors[grepl("\\beinstein\\b", NormalizedName),
#  						 NormalizedName]
#  length(names_einstein)
#  head(names_einstein)

## -----------------------------------------------------------------------------
#  load("C:/Users/laurent.berge/Google Drive/R_packages/hdd/_DATA/names_einstein.RData")
#  length(names_einstein)
#  head(names_einstein)

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  authors[!is.na(iconv(NormalizedName, to = "ASCII")), 1:3,
#  		newfile = "_path/hdd_authors_ascii"]

## ---- echo = TRUE-------------------------------------------------------------
#  names_first = authors[1, NormalizedName, file = 1:.N]
#  head(names_first)

## ---- echo = TRUE-------------------------------------------------------------
#  names_last = authors[.N, NormalizedName, file = 1:.N]
#  head(names_last)

## ---- echo = TRUE, error = TRUE-----------------------------------------------
#  author_id = authors$AuthorId

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  # to read the full data set into memory:
#  base_authors = readfst("_path/hdd_authors")
#  # Alternative way
#  authors_hdd = hdd("_path/hdd_authors")
#  base_authors = authors_hdd[]

## ---- echo = TRUE, eval = FALSE-----------------------------------------------
#  # x: the original data set
#  # y: the data set you want to merge to x
#  cartesian_merge = function(x){
#  	merge(x, y, allow.cartesian = TRUE)
#  }
#  
#  hdd_slice(x, fun = cartesian_merge,
#  		  dir = "_path/result_merge", chunkMB = 100)
#