# R/subset_geogs.R

#' @title Data subsetting by geography
#' @description Restricts \code{dat} to a single geography, keeps the rows of
#'   artists that fall into one of three revenue tiers, aggregates revenue by
#'   date / album / title / format / revenue type and casts the revenue types
#'   into columns. Physical tiers are built from \code{art_list$physical},
#'   digital and streaming tiers from \code{art_list$all_digi}.
#' @param dat An overall dataset from which to subset. The code reads the
#'   columns \code{geography}, \code{artistid}, \code{date2}, \code{dt_scale},
#'   \code{albumid}, \code{title}, \code{format2}, \code{yr_since_rel},
#'   \code{revtype}, \code{rev} and (for digital output) \code{units}.
#' @param art_list A data frame of artist categories with columns
#'   \code{artistid}, \code{geography}, \code{physical} and \code{all_digi}.
#' @param geography A scalar. Specifies which geography you would like
#'   subsetting for. \code{"iTunes_EUR"} skips the physical output and
#'   \code{"CMG"} skips the digital / streaming output.
#' @param tier_breaks A numeric vector of length 3, in decreasing order. The
#'   aggregate revenue breaks for tiers: tier 1 is \code{>= tier_breaks[1]},
#'   tier 2 is \code{[tier_breaks[2], tier_breaks[1])} and tier 3 is
#'   \code{[tier_breaks[3], tier_breaks[2])}. Defaults to
#'   \code{c(20000, 5000, 1000)}.
#' @return For \code{"CMG"} a single data frame (physical). For
#'   \code{"iTunes_EUR"} a \code{list} with elements \code{digital} and
#'   \code{stream}. Otherwise a \code{list} with elements \code{physical},
#'   \code{digital} and \code{stream}.
#' @export
subset_geogs <- function(dat, art_list, geography,
                         tier_breaks = c(20000, 5000, 1000)) {
  # Fail loudly instead of require()'s silent FALSE, and do not attach dplyr
  # to the caller's search path.
  if (!requireNamespace("dplyr", quietly = TRUE)) {
    stop("Package 'dplyr' is required by subset_geogs().", call. = FALSE)
  }
  if (length(geography) != 1L || is.na(geography)) {
    stop("`geography` must be a single non-missing value.", call. = FALSE)
  }

  # subset data and artist lists to specific geography
  # NOTE(review): artist id 770 is always excluded; reason not visible here.
  dat2 <- dat[dat$geography == geography & dat$artistid != 770, ]
  art_list <- art_list[art_list$geography == geography, ]

  # Read ids by column name so the lookup works for base data frames as well
  # as tibbles (positional `[, 1]` drops to a vector on a data.frame).
  artist_ids <- art_list[["artistid"]]

  # Artist ids per tier (1 = highest revenue) for one revenue column.
  tier_ids <- function(revenue) {
    list(
      artist_ids[revenue >= tier_breaks[1]],
      artist_ids[revenue >= tier_breaks[2] & revenue < tier_breaks[1]],
      artist_ids[revenue >= tier_breaks[3] & revenue < tier_breaks[2]]
    )
  }

  # Aggregate one tier's rows and spread revenue types into columns.
  aggregate_tier <- function(ids, with_units) {
    tier_rows <- dat2[dat2$artistid %in% ids, ]
    grouped <- dplyr::group_by(
      tier_rows, date2, dt_scale, albumid, title, format2,
      yr_since_rel, revtype
    )
    totals <- if (with_units) {
      dplyr::summarise(
        grouped,
        rev = sum(rev, na.rm = TRUE),
        units = sum(units, na.rm = TRUE)
      )
    } else {
      dplyr::summarise(grouped, rev = sum(rev, na.rm = TRUE))
    }
    # `dcast` is left unqualified exactly as before; it must be provided by
    # the package imports (presumably reshape2 -- TODO confirm).
    dcast(totals, ... ~ revtype, value.var = "rev")
  }

  # iTunes Europe gets no physical output; CMG gets no digital / streaming.
  has_physical <- geography != "iTunes_EUR"
  has_digital <- geography != "CMG"

  physical <- NULL
  if (has_physical) {
    tiers <- lapply(
      tier_ids(art_list$physical), aggregate_tier, with_units = FALSE
    )
    tiers <- lapply(tiers, function(x) x[!is.na(x$physical), ])
    physical <- do.call("rbind", tiers)
  }

  digital <- NULL
  stream <- NULL
  if (has_digital) {
    tiers <- lapply(
      tier_ids(art_list$all_digi), aggregate_tier, with_units = TRUE
    )
    # Streaming and digital are two views of the same cast tables.
    stream <- do.call(
      "rbind", lapply(tiers, function(x) x[!is.na(x$streaming), ])
    )
    digital <- do.call(
      "rbind", lapply(tiers, function(x) x[!is.na(x$digital), ])
    )
  }

  # return
  if (!has_digital) {
    return(physical)
  }
  if (!has_physical) {
    return(list(digital = digital, stream = stream))
  }
  list(physical = physical, digital = digital, stream = stream)
}

#' @title Tiered data split by revenue type
#' @description Restricts \code{dat} to a single geography, labels every row
#'   with the physical and digital revenue tier of its artist and splits the
#'   result by revenue type. Unlike \code{subset_geogs}, no aggregation is
#'   done and rows of untiered artists are kept (with \code{NA} tiers).
#' @param dat An overall dataset from which to subset. The code reads the
#'   columns \code{geography}, \code{artistid}, \code{title}, \code{format2}
#'   and \code{revtype}.
#' @param art_list A data frame of artist categories with columns
#'   \code{artistid}, \code{geography}, \code{physical} and \code{all_digi}.
#' @param geography A scalar. Specifies which geography you would like
#'   subsetting for.
#' @param tier_breaks A numeric vector of length 3, in decreasing order. The
#'   aggregate revenue breaks for tiers: tier 1 is \code{>= tier_breaks[1]},
#'   tier 2 is \code{[tier_breaks[2], tier_breaks[1])} and tier 3 is
#'   \code{[tier_breaks[3], tier_breaks[2])}. Defaults to
#'   \code{c(20000, 5000, 1000)}.
#' @return A \code{list} of data frames, one for each revenue type / stream
#'   found in \code{dat$revtype}. Each carries the added columns
#'   \code{p_tier}, \code{d_tier} (1, 2, 3 or \code{NA}) and
#'   \code{title_format2}.
#' @export
distinct_geogs <- function(dat, art_list, geography,
                           tier_breaks = c(20000, 5000, 1000)) {
  # subset data and artist lists to specific geography
  # NOTE(review): artist id 770 is always excluded; reason not visible here.
  dat2 <- dat[dat$geography == geography & dat$artistid != 770, ]
  art_list <- art_list[art_list$geography == geography, ]

  # Read ids by column name so the lookup works for base data frames as well
  # as tibbles (positional `[, 1]` drops to a vector on a data.frame).
  artist_ids <- art_list[["artistid"]]

  # Tier (1 = highest revenue, NA = below the lowest break) of every row of
  # `dat2`, judged on one revenue column of `art_list`.
  tier_of <- function(revenue) {
    tiers <- list(
      artist_ids[revenue >= tier_breaks[1]],
      artist_ids[revenue >= tier_breaks[2] & revenue < tier_breaks[1]],
      artist_ids[revenue >= tier_breaks[3] & revenue < tier_breaks[2]]
    )
    out <- rep(NA_real_, nrow(dat2))
    # Assign lowest tier first so a higher tier wins on any overlap.
    for (k in rev(seq_along(tiers))) {
      out[dat2$artistid %in% tiers[[k]]] <- k
    }
    out
  }

  # create tiers in data
  dat2$p_tier <- tier_of(art_list$physical)
  dat2$d_tier <- tier_of(art_list$all_digi)

  # title_format2 and return
  dat2$title_format2 <- interaction(dat2$title, dat2$format2)

  split(dat2, factor(dat2$revtype))
}
# alexWhitworth/concord documentation built on May 11, 2019, 11:25 p.m.