# scripts/fixes/view_upload_bz2_fix_2012_07_18.R

library(wpd)
library(dplyr)
library(urltools)
library(utf8)
library(DBI)
library(RPostgreSQL)
library(data.table)

# ---- script setup ----------------------------------------------------------

# Running line total; re-computed inside the chunk loop further down.
sum_counter <- 0

# The single bz2 page count dump this fix script works on. It is indexed
# as `page_title_files_bz2[i]` by the code below.
page_title_files_bz2 <- "/data/wpd/2012//pagecounts-2012-07-18.bz2"

# Wall-clock time at which the whole script started.
start_time_global <- Sys.time()

# Announce the start of the run on the console.
cat(
  "--- START --- ",
  as.character(start_time_global),
  " --- \n",
  sep = " "
)




# Index of the dump file to process (the script handles exactly one file).
i <- 1

  # Start of processing for this file; used by the progress report below.
  start_time <- Sys.time()

  # Extract the YYYY-MM-DD date embedded in the dump file name
  # ("...-2012-07-18.bz2" -> "2012-07-18").
  #
  # The pattern is anchored at both ends and the dot of ".bz2" is escaped.
  # A substitution that does not match returns its input unchanged, so an
  # unexpected file name would otherwise put the whole path into `date`,
  # which is pasted into the DELETE statements and passed to the upload
  # below. Fail early instead.
  date_pattern <- "^.*-(\\d{4}-\\d{2}-\\d{2})\\.bz2$"

  if (!grepl(pattern = date_pattern, x = page_title_files_bz2[i])) {
    stop(
      "cannot extract a date from file name: ", page_title_files_bz2[i],
      call. = FALSE
    )
  }

  date <-
    sub(
      pattern     = date_pattern,
      replacement = "\\1",
      x           = page_title_files_bz2[i]
    )

  # Remove rows already stored for `date` so the upload below does not add
  # to existing data for that day.

  # 1) single statement against the traffic table
  traffic_delete_sql <-
    paste0(
      "delete from page_views_traffic",
      " where traffic_date = '", date,"'"
    )

  wpd_get_query(
    traffic_delete_sql,
    con = wpd_connect("pm")
  )

  # 2) one statement per element of `wpd_languages` (paste0 is vectorised),
  #    each targeting the table "page_views_<element>"
  #    (presumably language codes provided by the wpd package - confirm)
  language_delete_sql <-
    paste0(
      "delete from page_views_", wpd_languages,
      " where page_view_date = '", date, "'"
    )

  wpd_get_queries(
    queries = language_delete_sql,
    con     = wpd_connect("pm")
  )



  # Stream the bz2 dump in chunks of `n_lines` lines and upload the page
  # view counts of every chunk at or beyond the resume threshold.

  # open file connection
  bz_con <-
    bzfile(
      description = page_title_files_bz2[i],
      open        = "rb"
    )

  # set initial loop values
  counter <- 0
  n_lines <- 100000

  # Chunks are skipped while the running line total is below this value.
  # NOTE(review): presumably the point an earlier run had reached - confirm.
  # The cleanup step above deletes all rows for `date`, yet lines below this
  # threshold are not uploaded again here; verify this is intended.
  skip_below_line <- 58800000

  # tryCatch(..., finally = ) guarantees that the file connection is closed,
  # also when parsing or uploading a chunk fails; errors still propagate.
  tryCatch(
    {
      repeat {

        counter     <- counter + 1
        lines       <- readLines(con = bz_con, n = n_lines)

        # Line total assuming full chunks, so it can overstate the true
        # number of lines read once the last (partial) chunk is reached.
        sum_counter <- counter * n_lines

        # readLines() returns a zero-length vector at end of file: stop
        # here instead of handing an empty chunk to the parser/uploader.
        if (length(lines) == 0) {
          break
        }

        # chunk lies below the resume threshold: read on without uploading
        if (sum_counter < skip_below_line) {
          next
        }

        # Each list element is used below as a data frame with the columns
        # page_name, page_view_count and lang (presumably one data frame
        # per language, since only lang[1] is read - confirm).
        lines_list <- wpd_dump_lines_to_df_list(lines)

        res <-
          lapply(
            X   = lines_list,
            FUN =
              function(df) {
                wpd_upload_pageview_counts(
                  page_name       = utf8_encode(df$page_name),
                  page_view_count = df$page_view_count,
                  page_view_date  = date,
                  page_language   = df$lang[1]
                )
              }
          )

        # report on progress: file, lines so far, seconds per million
        # lines, and total minutes elapsed since `start_time`
        elapsed_secs <- difftime(Sys.time(), start_time, units = "secs")
        elapsed_mins <- difftime(Sys.time(), start_time, units = "mins")

        cat(
          "\n - ", page_title_files_bz2[i], "-",
          format(sum_counter, big.mark = ",", scientific = FALSE),
          " ~",
          "| ",
          round((elapsed_secs / sum_counter) * 1000000, 1), "sec/Mio",
          "| \u2211",
          round(elapsed_mins, 1), "min"
        )
      }
    },
    finally = close(bz_con)
  )
# petermeissner/wikipediadumbs documentation built on Nov. 5, 2019, 12:19 a.m.