R/dummies.R

Defines functions dummy_column make_dummies get_dummies

Documented in get_dummies

dummy_column<-function(df,column,coding='2015',ids='stidstd'){
  #Para la codificación en 2015 se utiliza una codificacion dummy estandar agregando
  #variables  dummy para valores perdidos (NA) y omitidos (99), omitiendo la columna
  #correspondiente a la opción 1
  # NA        00...01 <- n+1 variables dummies
  # opcion 1  00...00
  # opcion 2  10...00
  #
  # opcion n  0...100
  # 99        00...10
  if (coding=='2015'){
    dfo<-df%>%
      mutate_at(column,~paste(column,eval(as.symbol(column)),sep='_dummy'))%>%
      mutate(val = 1)%>%
      tidyr::spread(key=column,value = val,fill=0)
  }
  #Para 2012, 2009 y 2006 se generan dos variables dummies
  # value ->    value - mean    0
  #  NA o 99 -> 0               1
  else{
    tempcol<-df[[column]]
    tempcol[tempcol>=99]<-NA
    meanc = mean(tempcol,na.rm = TRUE)
    df[[column]]<-tempcol
    normcolumn<-paste(column,'dummynorm',sep='_')
    nacolumn <-paste(column,'dummyna',sep='_')
    dfo<-df%>%
      mutate(!!normcolumn :=ifelse(is.na(eval(as.symbol(column))),
                                   0,
                                   eval(as.symbol(column))-meanc))%>%
      mutate(!!nacolumn := ifelse(is.na(eval(as.symbol(column))),1,0))
  }
  return(dfo%>%select(ids,contains('dummy')))
}

#Funcion que calcula las variables dummies de la lista vdums y las codifica
#de acuerdo a coding que puede ser de acuerdo a 2015 o 2012
make_dummies <- function(df,vdums,coding='2015') {
  purrr::map(vdums, dummy_column, df = df,coding = coding) %>%
    purrr::reduce(inner_join,by = 'stidstd')  %>%
    select(stidstd,contains('dummy'))%>%
    select(-contains('_dummy1'))
}

#' @title get_dummies
#' @description Esta es una funcion de PFSkit
#' @details Esta funcion es parte del paquete PFS-kit
#' @examples
#' -------
#' @export
# library(readxl)
# library(tidyverse)
# library(dplyr)
#Función que codifica las respuestas del cuestionario contexual
get_dummies<-function(dfQ,dfC){
  not_reached_DR<-rowSums(dfC%>%select(contains('PR'))>3,na.rm = T)/length(colnames(dfC%>%select(contains('PR'))))
  not_reached_DS<-rowSums(dfC%>%select(contains('PS'))>3,na.rm = T)/length(colnames(dfC%>%select(contains('PS'))))
  not_reached_DM<-rowSums(dfC%>%select(contains('PM'))>3,na.rm = T)/length(colnames(dfC%>%select(contains('PM'))))
  not_reached <- as.data.frame(cbind('stidstd'=dfC$idUser_int,
                                     not_reached_DR,
                                     not_reached_DS,
                                     not_reached_DM))
  schooltams<-dfC%>%group_by(knotionCode)%>%summarise(tam=n())%>%ungroup()
  #view(schooltams)
  schoolmax<-schooltams%>%filter(tam==max(schooltams$tam))
  #Seleccionamos todas las variables categoricas del cuestionario de PFS
  PISAQ <- read.csv("data/PISA_QUESTIONARIE.csv")%>%filter(atype=='RADIO')
  #Descartamos las preguntas referentes al estado y pais de nacimiento
  PISACAT <- PISAQ%>%filter(options>0,options<33)
  #Seleccionamos los items de 2015 que se codifican de acuerdo a la información
  #compartida en el Anexo B del reporte tecnico de pisa de 2015
  PISACAT2015 <- PISACAT%>%filter(year==15)
  #Seleccionamos los items de 2012, 2009 y 2006 que se codifican de acuerdo al
  #reporte tecnico de 2012
  PISACAT2012 <- PISACAT%>%filter(year<15)
  #Seleccionamos las respuestas de los items correspondientes a 2015
  dataQ2015<-select(dfQ,c('stidstd',as.character(PISACAT2015[['inputid']])))
  #Seleccionamos los items correspondientes a 2012, 2009 y 2006
  dataQOthers<-select(dfQ,c('stidstd',as.character(PISACAT2012[['inputid']])))
  #Codificamos los items de 2015
  ST022Q01_15v<-dataQ2015[['ST022Q01_15']]
  ST022Q01_15v[ST022Q01_15v==1]<-156
  ST022Q01_15v[ST022Q01_15v==3]<-474
  ST022Q01_15v[ST022Q01_15v==5]<-451
  ST022Q01_15v[ST022Q01_15v==6]<-852
  dataQ2015[['ST022Q01_15']]<-ST022Q01_15v
  dummies2015<-make_dummies(dataQ2015,as.character(PISACAT2015[['inputid']]))
  #Codificamos los demás items
  dummiesOther<-make_dummies(dataQOthers,as.character(PISACAT2012[['inputid']]),'2012')
  dummiesContext<-inner_join(dummies2015,dummiesOther,by='stidstd')
  AGE<-get_AGE(dfQ)%>%select(stidstd,AGE)%>%filter(!is.na(AGE))
  HISEI<-get_HISEI()%>%select(stidstd,HISEI)%>%filter(!is.na(HISEI))
  #AGE_2<-dfQ%>%select(stidstd,ST126Q01_15,ST021Q01_15)
  #limpiamos los datos
  #Edad al entrar a la primaria
  ST126Q01_15v<-dfQ[['ST126Q01_15']]
  ST126Q01_15v[ST126Q01_15v >= 99] <- 99
  ST126Q01_15v[ST126Q01_15v <= 3] <- 1
  ST126Q01_15v[as.logical((ST126Q01_15v >=9)*(ST126Q01_15v< 99))] <- 9
  dfQ[['ST126Q01_15']]<-ST126Q01_15v
  #Si tú NO naciste en México, ¿qué edad tenías cuando llegaste a México?
  ST021Q01_15v<-dfQ[['ST021Q01_15']]
  ST021Q01_15v<-ST021Q01_15v+1
  ST021Q01_15v[ST021Q01_15v>=99]<-99
  dfQ[['ST021Q01_15']]<-ST021Q01_15v

  AGE_2<-dummy_column(dfQ,'ST126Q01_15')%>%select(-ST126Q01_15_dummy1)
  AGE_3<-dummy_column(dfQ,'ST021Q01_15')%>%select(-ST021Q01_15_dummy1)
  dummiespc<-AGE_3%>%inner_join(AGE_2)%>%
    inner_join(AGE)%>%inner_join(dummiesContext)
  pcv<-prcomp(dummiespc%>%select(-stidstd))
  explainvar<-min(which(cumsum(pcv$sdev**2/sum(pcv$sdev**2)) >= 0.95))
  pcvv<-pcv$x[,1:explainvar]
  namesc<-paste('pcvv',1:explainvar,sep="")
  #print(pcvv)
  #print(namesc)
  colnames(pcvv)<-namesc
  #print(cumsum(pcv$sdev**2/sum(pcv$sdev**2)))
  dfpcv<-as.data.frame(cbind('stidstd'=dummiespc$stidstd,pcvv))
  # print(explainvar)
  bookletsvars<-dummy_column(dfC,'idBooklet','2015','idUser_int')%>%
    rename('stidstd'=idUser_int)
  referencebooklet<-bookletsvars%>%filter(idBooklet_dummyPISA7==1)
  ind<-match(referencebooklet$stidstd,bookletsvars$stidstd)
  bookletsvars[ind,-1] <- -1
  bookletsvars<-bookletsvars%>%select(-idBooklet_dummyPISA7)

  schoolvars<-dummy_column(dfC,'knotionCode','2015','idUser_int')%>%
    rename('stidstd'=idUser_int)
  varname<-paste('knotionCode_dummy',schoolmax$knotionCode,sep = "")
  referenceschool<-schoolvars%>%filter(eval(as.symbol(varname))==1)
  indsc<-match(referenceschool$stidstd,schoolvars$stidstd)
  schoolvars[indsc,-1] <- -1
  schoolvars<-schoolvars%>%select(-c(varname))
  gendervars<-dummy_column(dfC,'gender','2015','idUser_int')%>%
    rename('stidstd'=idUser_int)

  # print(nrow(gendervars))
  # print(nrow(bookletsvars))
  # print(nrow(HISEI))
  # print(nrow(not_reached))
  # print(nrow(dfpcv))
  # print(nrow(schoolvars))
  regresors<-gendervars%>%inner_join(bookletsvars)%>%
    inner_join(HISEI)%>%
    inner_join(not_reached)%>%
    inner_join(dfpcv)%>%
    inner_join(schoolvars)
  return(regresors)
}

# dataQ2015<-get_dummies(input_pfs_context_mex_2018,
#                        input_pfs_cognitive_mex_2018)
knotion/PFSkit documentation built on Feb. 12, 2020, 12:16 p.m.