R/firm-clustering.R

Defines functions: grouping.makefiner, grouping.append, grouping.computeobj, grouping.classify.once, grouping.classify, grouping.getMeasures, kmeansW.repeat, grouping.infos

#' Extract summary information by cluster (wages, demographics, firm sizes, productivity)
#' @export
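#' @examples
#' \dontrun{
#' # a minimal usage sketch (an assumption, not taken from the original docs):
#' # cdata must be a data.table with the columns referenced in the body, i.e.
#' # fid, clus, aret, birthyear, educ, female, lw and prod
#' rr = grouping.infos(cdata)
#' rr[clus==0]  # the clus==0 row holds the statistics pooled over all firms
#' }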
grouping.infos <- function(cdata) {

  # worker age measured in the first sample year
  cdata[, age:= min(aret) - birthyear]

  # per-cluster statistics, computed in the first sample year for firms with non-missing ids
  rr = cdata[(fid!="") & (aret==min(aret)),{
    r = list()
    r$ni = .N
    r$nj = length(unique(fid))

    r$educ1 = .SD[educ==1,.N]
    r$educ2 = .SD[educ==2,.N]
    r$educ3 = .SD[educ==3,.N]

    r$worker_0_30   = .SD[  (age <= 30) ,.N]
    r$worker_30_50  = .SD[  (age > 30) & (age <=50) ,.N]
    r$worker_50_100 = .SD[  (age >50) ,.N]

    r$female = .SD[female=="female",.N]
    r$cohort    = .SD[, mean(birthyear)]

    r$mw     = mean(lw)
    r$wsd    = var(lw)
    r$bwfsd  = .SD[,rep(mean(lw),.N),fid][,var(V1)]
    r$bwcsd  = 0

    r$bwfq50 = .SD[, rep(quantile(lw,0.5),.N), fid][,var(V1)]
    r$bwcq50 = 0
    r$bwfq10 = .SD[, rep(quantile(lw,0.1),.N), fid][,var(V1)]
    r$bwcq10 = 0
    r$bwfq90 = .SD[, rep(quantile(lw,0.9),.N), fid][,var(V1)]
    r$bwcq90 = 0

    r$bwfvar = .SD[, rep(var(lw),.N), fid][,var(V1,na.rm=T)]
    r$bwcvar = 0

    r$pm      = .SD[,list(prod=prod[1]),fid][,mean(log(prod),na.rm=T)]
    r$psd     = .SD[,list(prod=prod[1]),fid][,var(log(prod),na.rm=T)]

    r$sizem    = .SD[,.N,fid][,mean(N)]
    r$sizem2   = .SD[,rep(.N,.N),fid][,mean(V1)]
    r$sizemed  = .SD[,.N,fid][,median(N)+0.0]
    r$sizemed2 = .SD[,rep(.N,.N),fid][,median(V1)+0.0]

    r
  }, clus]

  # the same statistics pooled over all firms, stored below with clus==0
  rr2 = cdata[(fid!="") & (aret==min(aret)),{
    r = list()
    r$ni = .N
    r$nj = length(unique(fid))

    r$educ1 = .SD[educ==1,.N]
    r$educ2 = .SD[educ==2,.N]
    r$educ3 = .SD[educ==3,.N]

    r$worker_0_30   = .SD[  (age <= 30) ,.N]
    r$worker_30_50  = .SD[  (age > 30) & (age <=50) ,.N]
    r$worker_50_100 = .SD[  (age >50) ,.N]

    r$female = .SD[female=="female",.N]
    r$cohort    = .SD[, mean(birthyear)]

    r$mw      = mean(lw)
    r$wsd     = var(lw)
    r$bwfsd   = .SD[,rep(mean(lw),.N),fid][, var(V1)]
    r$bwcsd   = .SD[,rep(mean(lw),.N),clus][,var(V1)]

    r$bwfq50 = .SD[, rep(quantile(lw,0.5),.N), fid][,var(V1)]
    r$bwcq50 = .SD[, rep(quantile(lw,0.5),.N), clus][,var(V1)]
    r$bwfq10 = .SD[, rep(quantile(lw,0.1),.N), fid][,var(V1)]
    r$bwcq10 = .SD[, rep(quantile(lw,0.1),.N), clus][,var(V1)]
    r$bwfq90 = .SD[, rep(quantile(lw,0.9),.N), fid][,var(V1)]
    r$bwcq90 = .SD[, rep(quantile(lw,0.9),.N), clus][,var(V1)]

    r$bwfvar = .SD[, rep(var(lw),.N), fid][,var(V1,na.rm=T)]
    r$bwcvar = .SD[, rep(var(lw),.N), clus][,var(V1,na.rm=T)]

    r$pm      = .SD[,list(prod=prod[1]),fid][,mean(log(prod),na.rm=T)]
    r$psd     = .SD[,list(prod=prod[1]),fid][,var(log(prod),na.rm=T)] # var, consistent with the per-cluster block above

    r$sizem    = .SD[,.N,fid][,mean(N)]
    r$sizem2   = .SD[,rep(.N,.N),fid][,mean(V1)]
    r$sizemed  = .SD[,.N,fid][,median(N)+0.0]
    r$sizemed2 = .SD[,rep(.N,.N),fid][,median(V1)+0.0]

    r
  }]

  rr2$clus=0
  rr = rbind(rr,rr2)
  setkey(rr,clus)

  return(rr)
}

#' Internal helper that runs weighted k-means in batches of starting values and keeps the best solution
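#' @examples
#' \dontrun{
#' # a minimal sketch on synthetic data; assumes kmeansW (weighted k-means,
#' # e.g. from the FactoClass package) is available, as used in the body
#' set.seed(1)
#' x   = matrix(rnorm(200), ncol = 2)   # 100 observations, 2 measures
#' w   = runif(nrow(x))                 # one weight per observation
#' res = kmeansW.repeat(x, centers = 3, weight = w, nstart = 100, step = 20)
#' res$best$cluster                     # assignment from the best of 5 batches
#' res$all                              # within sum of squares per batch
#' }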
kmeansW.repeat <- function(x, centers, weight = rep(1, nrow(x)), iter.max = 100, nstart = 1,step=20) {

  flog.info("running weigthed kmeans step=%i total=%i",step, nstart)
  flog.info("nobs=%i nmeasures=%i", dim(x)[1], dim(x)[2] )

  best = Inf
  best_clus = NA
  allres = data.frame()

  # run round(nstart/step) batches of kmeansW, keeping at least one batch
  for (i in seq_len(max(1, round(nstart/step)))) {

    clus = kmeansW(x,centers,weight,iter.max,nstart=step)
    tot = sum(clus$withinss)
    allres = rbind(allres,data.frame(i=i,tot=tot,nstart=step))
    if (tot < best) {
      best_clus = clus
      best = tot
      flog.info("[%2i%%] tot=%f best=%f <<<<", round(100*i*step/nstart), tot,  best)
    } else {
      flog.info("[%2i%%] tot=%f best=%f", round(100*i*step/nstart), tot,  best)
    }
  }
  return(list(best=best_clus,all=allres))
}

#' Extract the measurement matrix to be given to the
#' classification algorithm
#'
#' @param sim a list with data.tables sdata and jdata
#' @param y_var name of the dependent variable column in sdata (default "y1")
#' @param measure type of measure: "ecdf", "quant", "ecdf_true", "meanvar", "meanvar_true", "frequency", or "user_*" together with user_fn
#' @param Nw number of points at which the wage distribution is evaluated
#' @param model model object, used only by the "*_true" measures
#' @param exclude_movers if TRUE, workers appearing in jdata are excluded
#' @param user_fn function returning columns m and value, used when measure starts with "user_"
#' @export
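#' @examples
#' \dontrun{
#' # a minimal sketch (assumptions: sim is a list with data.tables sdata and
#' # jdata; sdata has a firm id f1, a worker id wid and a wage column y1)
#' ms = grouping.getMeasures(sim, y_var = "y1", measure = "ecdf", Nw = 40)
#' dim(ms$M)   # one row per firm, Nw columns of scaled empirical cdf values
#' head(ms$W)  # firm weights (number of workers), aligned with the rows of M
#' }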
grouping.getMeasures <- function(sim,y_var="y1",measure="ecdf",Nw=40,model=NA,exclude_movers=FALSE,user_fn=NA) {

  sdata = sim$sdata
  if (exclude_movers) {
    wids=sim$jdata[,wid]
    sdata = sdata[!(wid %in% wids)]
  }

  # extract dep variable
  sdata <- sdata[,ydep := get(y_var)]

  N = sdata[,length(unique(f1))]
  qs = seq(1/(Nw+1),Nw/(Nw+1),l=Nw)
  flog.info("processing %i firms",N)

  # fix the wage evaluation points: quantiles of the pooled wage distribution
  wv    = as.numeric(sdata[,quantile(ydep,probs = qs)])
  # counts the number of observations weakly below each evaluation point
  t.cum = function(Y,W) {
    nj = length(W)
    R = rep(0,nj)
    for (j in 1:nj) {R[j] = sum(Y<=W[j])}
    return(R)
  }
  # squared deviations between the empirical indicators and a reference cdf E
  t.discrepancy = function(Y,W,E) {
    nj = length(W)
    R = rep(0,nj)
    for (j in 1:nj) {R[j] = sum(  ( (Y<=W[j]) - E[j])^2) }
    return(R)
  }

  rd=0
  # from here we have to be careful with the ordering of the firms
  # in the weighting matrix and in the measurement matrix

  flog.info("computing measures...")
  # for each firm, compute the chosen measure
  if (str_detect(measure,"user_")) {
    # user_fn must return a list/data.table with columns m and value
    rs = sdata[,user_fn(ydep),f1]
    M      = acast(rs,f1~m,value.var = "value")/sqrt(Nw)
  } else if (measure == "quant") {
    rs     = sdata[,list(cdf = quantile(ydep,probs=qs), s = 1:Nw,N=.N),f1]
    M      = acast(rs,f1~s,value.var = "cdf")/sqrt(Nw)
  } else if (measure == "ecdf") {
    # for each firm get the quantiles
    rs     = sdata[,list(cdf = as.numeric(t.cum(ydep,wv)/.N), s = 1:Nw),f1]
    M      = acast(rs,f1~s,value.var = "cdf")/sqrt(Nw)
    #rd     = sdata[, W[f1]*sum(t.discrepancy(ydep,wv,sqrt(Nw)*M[f1,]))/(.N^2),f1][,sum(V1)]/(N*Nw)
  } else if (measure == "ecdf_true") {
    # for each firm get the quantiles
    rs     = sdata[,list(cdf = pnorm(wv,mean= am[1] + psi[1], sd= sqrt( (model$eta_sd_m0 + model$eta_sd_m1*am[1])^2 + model$eps_sd^2)), s = 1:Nw),f1]
    M      = acast(rs,f1~s,value.var = "cdf")/sqrt(Nw)
    rd     = 0
  } else if (measure == "meanvar") {
    # use means instead
    rs_mean = sdata[,list(mean(ydep),var(ydep)),f1]
    M      = rs_mean[,list(V1,V2)]
  } else if (measure == "meanvar_true") {
    # use means instead
    rs_mean = sdata[,list( model$v_m0 + model$v_m1*am[1] + psi[1] ,(model$eta_sd_m0 + model$eta_sd_m1*am[1]))^2 + model$eps_sd^2 ,f1]
    M       = rs_mean[,list(V1,V2)]
  } else if (measure == "frequency") {
    # use means instead
    if (length(unique(sdata$ydep)) > 100) {
      stop("dependent does seem to have discrete support")
    }
    M       = acast(sdata, f1 ~ ydep, fun.aggregate = length, value.var="ydep" )
    M / rowSums(M)
    rs = data.frame()
  } else {
    stop("unkown measure")
  }

  flog.info("computing weights...")
  # extract weights, in the same order as the rows of M
  fids = rownames(M)
  sdata.size = sdata[,.N,f1]
  sdata.size[,f1s:=paste(f1)]
  setkey(sdata.size,f1s)
  W = sdata.size[fids,N]  # firm size (number of workers), aligned with fids

  # rescale the weights
  # W      = W/sqrt(sum(W^2))

  tsize = sum(W^2)/sum(W)
  if (is.na(rd)) rd=0;

  sdata[,ydep:=NULL]

  return(list(M=M,W=W,discrepency=rd,measure=measure,Nw=Nw,tsize=tsize,N=N,fids=rownames(M),rs=rs,wage.quant=wv))
}

#' Clusters firms based on their cross-sectional wage distributions
#'
#' @param measures measurement object created by grouping.getMeasures
#' @param ksupp vector of numbers of groups to try
#' @param nstart (default: 1000) total number of starting values
#' @param iter.max (default: 200) maximum number of iterations per repetition
#' @param stop if TRUE, bisect over ksupp for the smallest k whose objective falls below cval times the discrepancy
#' @param verbose verbosity level
#' @param cval multiplier on the discrepancy used in the stopping rule
#' @export
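#' @examples
#' \dontrun{
#' # a sketch of a typical call; ms comes from grouping.getMeasures above
#' ms  = grouping.getMeasures(sim, y_var = "y1", measure = "ecdf", Nw = 40)
#' grp = grouping.classify(ms, ksupp = 2:20, nstart = 100, iter.max = 100)
#' grp$best_k   # selected number of groups
#' }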
grouping.classify <- function(measures,ksupp = ceiling( (1:(60)^(1/1.3))^1.3),nstart=1000,iter.max=200,stop=FALSE,verbose=1,cval=1) {

  N = measures$N
  flog.info("clustering T=%f, Nw=%i , measure=%s",measures$tsize,measures$Nw,measures$measure);
  ksupp = setdiff(ksupp,1)

  rk = data.frame()
  rk_all = list()
  # when stop==FALSE we evaluate every value in ksupp
  if (stop==FALSE) {
    for (k in ksupp) {
      if (k>nrow(measures$M)) next;  # skip k when there are fewer firms than centers
      if (k<2) next;
      kres = kmeansW(measures$M,centers = k,weight=measures$W,nstart = nstart,iter.max = iter.max)
      rk=rbind(rk,data.frame(k=k,Q=sum(kres$withinss/N)))
      flog.info("k=%i WSS=%f nstart=%i nfirms=%i",k,sum(kres$withinss),nstart,N)
      rk_all[[k]] = kres$cluster
    }

  # otherwise, search more efficiently by bisecting over [min(ksupp), max(ksupp)]
  } else {
    k_max = pmin(max(ksupp),nrow(measures$M)-1)
    k_min = pmax(min(ksupp),2)

    #evaluate k_max
    kres = kmeansW(measures$M,centers = k_max,weight=measures$W,nstart = nstart,iter.max = iter.max)
    flog.info("k=%i WSS=%f nstart=%i nfrims=%i",k,sum(kres$withinss),nstart,N)
    rk=rbind(rk,data.frame(k=k_max,Q=sum(kres$withinss/N)))
    kres_max = kres; rk_all[[k_max]] = kres$cluster
    #evaluate K_min
    kres = kmeansW(measures$M,centers = k_min,weight=measures$W,nstart = nstart,iter.max = iter.max)
    flog.info("k=%i WSS=%f nstart=%i nfrims=%i",k,sum(kres$withinss),nstart,N)
    rk=rbind(rk,data.frame(k=k_min,Q=sum(kres$withinss/N)))
    kres_min = kres; rk_all[[k_min]] = kres$cluster

    if ((sum(kres_min$withinss/N) > cval*measures$discrepency) &
        (sum(kres_max$withinss/N) < cval*measures$discrepency)) {
      for (i in k_min:k_max) {
        k_mid = floor( (k_max + k_min)/2)

        if ( k_mid == k_min) {
          break;
        } else {
          kres = kmeansW(measures$M,centers = k_mid,weight=measures$W,nstart = nstart,iter.max = iter.max)
          flog.info("k=%i WSS=%f nstart=%i nfrims=%i",k,sum(kres$withinss),nstart,N)
          rk=rbind(rk,data.frame(k=k_mid,Q=sum(kres$withinss/N)))
          rk_all[[k_mid]] = kres$cluster

          # check which interval to pick
          if (sum(kres$withinss/N) < cval*measures$discrepency) {
            k_max = k_mid
            kres_max = kres
          } else{
            k_min = k_mid
            kres_min = kres
          }
        }
      } # endfor
    }
  }

  rk$cval    = cval
  rk$tsize   = measures$tsize
  rk$vh      = measures$discrepency
  rk$Nw      = ncol(measures$M)
  rk$measure = measures$measure

  # pick the smallest k whose objective falls below cval times the discrepancy
  if (any(rk$Q<cval*measures$discrepency)) {
    best_k = min(subset(rk,Q<cval*measures$discrepency)$k)
  } else {
    best_k = max(ksupp)
  }
  return(list(summary=rk,all=rk_all,best_k=best_k,best_cluster=rk_all[[best_k]],discrepency=measures$discrepency,measures=measures))
}

#' Clusters firms into a fixed number of groups based on their cross-sectional wage distributions
#'
#' @param measures measurement object created by grouping.getMeasures
#' @param k number of groups
#' @param nstart (default: 1000) total number of starting values
#' @param iter.max (default: 200) maximum number of iterations per repetition
#' @param step number of starting values per batch in kmeansW.repeat
#' @export
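#' @examples
#' \dontrun{
#' # a typical end-to-end sketch: measures -> weighted k-means -> append
#' ms  = grouping.getMeasures(sim, y_var = "y1", measure = "ecdf", Nw = 40)
#' grp = grouping.classify.once(ms, k = 10, nstart = 1000, step = 50)
#' sim = grouping.append(sim, grp$best_cluster)
#' }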
grouping.classify.once <- function(measures,k=10,nstart=1000,iter.max=200,step=20) {

  N = measures$N
  flog.info("clustering T=%f, Nw=%i , measure=%s",measures$tsize,measures$Nw,measures$measure);
  kres = kmeansW.repeat(measures$M,centers = k,weight=measures$W,nstart = nstart,iter.max = iter.max,step=step)$best
  rk = data.frame(k=k,Q=sum(kres$withinss/N))
  flog.info("k=%i WSS=%f nstart=%i nfrims=%i",k,sum(kres$withinss),nstart,N)

  rk$cval    = 0
  rk$tsize   = measures$tsize
  rk$vh      = measures$discrepency
  rk$Nw      = ncol(measures$M)
  rk$measure = measures$measure

  return(list(summary=rk,cluster=kres$cluster,best_cluster=kres$cluster,kres=kres,measures=measures))
}

#' Compute the objective function of the clustering
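#'
#' @examples
#' \dontrun{
#' # a minimal sketch; only meaningful for measures whose rs table carries
#' # cdf and s columns (e.g. measure "ecdf"), and assumes a weighted-mean
#' # function wt.mean is available to the package
#' grp = grouping.classify.once(ms, k = 10)
#' grouping.computeobj(grp)   # weighted within-group sum of squares
#' }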
grouping.computeobj <- function(grps) {

  # we collect the info
  rs = grps$measures$rs

  # we attach the weights
  W        = grps$measures$W
  names(W) = grps$measures$fids
  W = as.list(W)
  rs[, weight:= W[[f1]],f1]

  # attach the groups
  G        = grps$cluster
  names(G) = grps$measures$fids
  G = as.list(G)
  rs[, j1 := G[[f1]],f1]

  # compute centroids
  rs[, center := wt.mean(cdf,weight), list(j1,s)]

  # compute weighted sum of squares
  rs[, .N*wt.mean((cdf - center  )^2,weight)]
}


#' Append result of a grouping to a data-set
#'
#' @param sim a data set with jdata and sdata
#' @param clus a clustering result from the classify function
#' @param drop if TRUE, observations whose firms are missing from the clustering are dropped
#' @param sort if TRUE, clusters are re-ordered using cluster.order
#'
#' @export
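#' @examples
#' \dontrun{
#' # a minimal sketch: best_cluster is a named vector mapping firm ids (the
#' # names) to group numbers, as produced by the classify functions above
#' grp = grouping.classify.once(ms, k = 10)
#' sim = grouping.append(sim, grp$best_cluster, drop = TRUE)
#' sim$sdata[, .N, j1]   # workers per assigned group
#' }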
grouping.append <- function(sim,clus,drop=F,sort=T){

  # remove cluster if any
  sim$jdata[, j1:=0]
  sim$jdata[, j2:=0]
  sim$sdata[, j1:=0]
  sim$sdata[, j2:=0]

  # create a temporary table to link
  dd = data.table(f = names(clus), j = as.integer(clus))

  # linking
  setkey(dd,f)
  setkey(sim$sdata,f1)
  sim$sdata[,j1 := dd[sim$sdata,j]]
  setkey(sim$jdata,f1)
  sim$jdata[,j1 := dd[sim$jdata,j]]
  setkey(sim$jdata,f2)
  sim$jdata[,j2 := dd[sim$jdata,j]]
  sim$sdata[,j2:=j1]

  # dropping if asked
  if (drop==T) {
    nvals = sim$sdata[,sum(is.na(j1*j2))] + sim$jdata[,sum(is.na(j1*j2))]
    if (nvals>0) {
      flog.warn(" %i observations are missing clusters",nvals)
    }
    sim$jdata <- sim$jdata[j1!=0][j2!=0][!is.na(j1)][!is.na(j2)]
    sim$sdata <- sim$sdata[j1!=0][!is.na(j1)][!is.na(j2)]
  }

  # sort if asked
  if (sort==T) {
    sim <- cluster.order(sim)
  }

  return(sim)
}

#' Gives their own cluster to firms with many movers
#'
#' @param clus a clustering result from the classify function
#' @param sim a data set with jdata and sdata
#' @param nm minimum number of movers for a firm to receive its own cluster (default 30)
#'
#' @export
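#' @examples
#' \dontrun{
#' # a minimal sketch: firms with 30+ movers get their own cluster, then the
#' # refined assignment is appended back onto the data
#' clus2 = grouping.makefiner(grp$best_cluster, sim, nm = 30)
#' sim   = grouping.append(sim, clus2)
#' }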
grouping.makefiner <- function(clus,sim,nm=30){

  # we find firms with at least nm movers
  large.firms = sim$jdata[,list(f1=c(f1,f2))][,.N,f1][N>=nm,f1]
  cmax = max(clus)  # current highest cluster index

  flog.info("found %i firms to give their own cluster",length(large.firms))

  if (length(large.firms)==0) return(clus);

  for (i in 1:length(large.firms)) {
    I = which(names(clus)==large.firms[[i]])
    clus[[I]] = cmax+i
  }

  return(clus)
}