inst/parsers/wrds-Open-source-cross-sectional-data.R

## largely pulled from https://github.com/OpenSourceAP/CrossSection
## From the paper: Chen and Zimmermann (2020), "Open source cross-sectional asset pricing"
# !!! Be sure to properly cite and credit in whatever is integrated in the package !!!

library(RPostgres)
if (!exists("wrds")) {
  wrds <- dbConnect(Postgres(),
                    host='wrds-pgdata.wharton.upenn.edu',
                    port=9737,
                    dbname='wrds',
                    user=rstudioapi::askForPassword("Database username"),
                    password=rstudioapi::askForPassword("Database password"),
                    sslmode='require')
}



## Packages and settings --------------------------------------------------
options(stringsAsFactors = FALSE)

library(stringi)
library(magrittr)

numRowsToPull = -1  # Set to -1 to get all data, set to positive value for testing

## Compustat
 # Lookup codes here:
 # https://wrds-web.wharton.upenn.edu/wrds/support/Data/_001Manuals%20and%20Overviews/_001Compustat/_001North%20America%20-%20Global%20-%20Bank/_000dataguide/index.cfm
# Compustat annual --------------------------------------------------------

# Variables to download
varsCompustatA = c("aco", "act", "ajex", "am", "ao", "ap", "at", "capx", "ceq", "che", "cogs", "csho", "cshrc", "dcpstk", "dcvt", "dlc", "dlcch", "dltis",
                   "dltr", "dltt", "dm", "dp", "drc", "drlt", "dv", "dvc", "dvp", "dvpa", "dvpd", "dvpsx_c", "dvt", "ebit", "ebitda", "emp", "epspi", "epspx",
                   "fatb", "fatl", "ffo", "fincf", "fopt", "gdwl", "gdwlia", "gdwlip", "gwo", "ib", "ibc", "intan", "invt",
                   "ivao", "ivncf", "ivst", "lco", "lct", "lo", "lt", "mib", "msa", "ni", "nopi", "oancf", "ob", "oancf", "oiadp", "oibdp", "pi", "ppegt",
                   "ppent", "prcc_c", "prcc_f", "prstkc", "prstkcc", "pstk", "pstkl", "pstkrv",  "re", "rect", "recta", "revt", "sale", "scstkc", "seq", "spi",
                   "sstk", "tstkp", "txdi", "txditc", "txfo", "txfed", "txp", "txt", "xacc", "xad", "xint", "xrd", "xpp", "xsga"
)

a_Compustat <-
  # Build query
  paste0("a.", varsCompustatA) %>%
  paste0(collapse = ',') %>%
  paste0("select a.gvkey, a.datadate, a.conm, a.fyear, a.tic, a.cusip, a.naicsh, a.sich, ",
         .,
         " from COMP.FUNDA as a
           where a.consol = 'C'
           and a.popsrc = 'D'
           and a.datafmt = 'STD'
           and a.curcd = 'USD'
           and a.indfmt = 'INDL' ", collapse = "") %>%
  str_replace_all("[\r\n]" , "") %>%
  # Send query
  dbSendQuery(conn = wrds, statement = .) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(a_Compustat, file = 'sandbox/data/CompustatAnnual.csv')
save(a_Compustat, file = 'sandbox/data/CompustatAnnual.RData')



# Compustat Quarterly -----------------------------------------------------

# Variables to download
varsCompustatQ = c("acoq", "actq", "ajexq", "apq", "atq", "ceqq", "cheq", "cogsq", "cshoq", "cshprq", "dpq", "dlcq", "dlttq", "dpq",
                   "drcq", "drltq", "dvpsxq", "epspiq", "epspxq", "fopty", "gdwlq", "ibq", "invtq", "intanq", "ivaoq", "lcoq", "lctq",
                   "loq", "ltq", "mibq", "niq", "oancfy", "oiadpq", "oibdpq", "piq", "ppentq", "ppegtq", "prstkcy", "prccq", "pstkq", "rdq", "req",
                   "rectq", "revtq", "saleq", "seqq", "sstky", "txdiq",
                   "txditcq", "txpq", "txtq", "xaccq", "xintq", "xsgaq", "xrdq")

q_Compustat <-
  # Build query
  paste0("a.", varsCompustatQ) %>%
  paste0(collapse = ',') %>%
  paste0("select a.gvkey, a.datadate, a.fyearq, a.fqtr, a.datacqtr, a.datafqtr, ",
         .,
         " from COMP.FUNDQ as a
           where a.consol = 'C'
           and a.popsrc = 'D'
           and a.datafmt = 'STD'
           and a.curcdq = 'USD'  -- Only USD?
           and a.indfmt = 'INDL' ", collapse = "") %>%
  str_replace_all("[\r\n]" , "") %>%
  # Send query
  dbSendQuery(conn = wrds, statement = .) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(q_Compustat, file = 'sandbox/data/CompustatQuarterly.csv')
save(q_Compustat, file = 'sandbox/data/CompustatQuarterly.RData')

# Compustat pensions ------------------------------------------------------

pensions = dbSendQuery(conn = wrds,
                       statement =
                         "select a.gvkey, a.datadate, a.paddml, a.pbnaa, a.pbnvv, a.pbpro, a.pbpru, a.pcupsu, a.pplao, a.pplau
                     from comp.aco_pnfnda as a
                     where a.indfmt = 'INDL'
                     and a.popsrc = 'D'
                     and a.datafmt = 'STD'
                     and a.consol = 'C'
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(pensions, file = 'sandbox/data/CompustatPensions.csv')
save(pensions, file = 'sandbox/data/CompustatPensions.RData')




# Compustat segments ------------------------------------------------------

segments = dbSendQuery(conn = wrds, statement =
                         "select a.gvkey, a.datadate, a.stype, a.sid, a.sales, a.srcdate, a.naicsh, a.sics1, a.snms
                       from compseg.wrds_segmerged as a
                       "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(segments, file = 'sandbox/data/CompustatSegmentData.csv')
save(segments, file = 'sandbox/data/CompustatSegmentData.Rdata')


# Compustat Customer Segments ---------------------------------------------

seg_customer = dbSendQuery(
  conn=wrds,
  statement="select a.*
  from compseg.wrds_seg_customer as a
  "
) %>%
  dbFetch(res, n = numRowsToPull)

  #mutate(datadate = srcdate) %>%
  #select(-srcdate)

#data.table::fwrite(seg_customer, file = 'sandbox/data/CompustatSegmentDataCustomers.csv')
save(seg_customer, file = 'sandbox/data/CompustatSegmentDataCustomers.RData')

# Compustat short interest ------------------------------------------------

shortinterest = dbSendQuery(conn = wrds,
                            statement =
                              "select a.gvkey, a.iid, a.shortint, a.shortintadj, a.datadate
                       from comp.sec_shortint as a
                       "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(shortinterest, file = 'sandbox/data/CompustatShortInterest.csv')
save(shortinterest, file = 'sandbox/data/CompustatShortInterest.RData')




# CRSP monthly ------------------------------------------------------------

# Follows in part: https://wrds-www.wharton.upenn.edu/pages/support/research-wrds/macros/wrds-macro-crspmerge/

m_crsp = dbSendQuery(conn = wrds, statement =
                       "select a.permno, a.permco, a.date, a.ret, a.retx, a.vol, a.shrout, a.prc, a.cfacshr, a.bidlo, a.askhi,
                     b.shrcd, b.exchcd, b.siccd, b.ticker, b.shrcls,  -- from identifying info table
                     c.dlstcd, c.dlret                                -- from delistings table
                     from crsp.msf as a
                     left join crsp.msenames as b
                     on a.permno=b.permno
                     and b.namedt<=a.date
                     and a.date<=b.nameendt
                     left join crsp.msedelist as c
                     on a.permno=c.permno
                     and date_trunc('month', a.date) = date_trunc('month', c.dlstdt)
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(m_crsp, file = 'sandbox/data/mCRSP.csv')
save(m_crsp, file = 'sandbox/data/mCRSP.RData')


# CRSP Distributions ------------------------------------------------------
m_dist = dbSendQuery(conn = wrds, statement =
                       "select d.permno, d.divamt, d.distcd, d.facshr, d.rcrddt
                     from crsp.msedist as d") %>%
  dbFetch(n = numRowsToPull)

#data.table::fwrite(m_dist, file = 'sandbox/data/mCRSPdistributions.csv')
save(m_dist, file = 'sandbox/data/mCRSPdistributions.RData')


# CRSP daily --------------------------------------------------------------

d_crsp = dbSendQuery(conn = wrds, statement =
                       "select a.permno, a.date, a.ret, a.vol, a.shrout, a.prc, a.cfacshr
                     from crsp.dsf as a
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(d_crsp, file = 'sandbox/data/d_CRSP.csv')
save(d_crsp, file = 'sandbox/data/d_CRSP.RData')



# Credit ratings ----------------------------------------------------------

ratings = dbSendQuery(conn = wrds, statement =
                        "select gvkey, datadate, splticrm -- , spsdrm, spsticrm
                     from comp.adsprate
                     -- where splticrm is not null"
)  %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(ratings, file = 'sandbox/data/ratings.csv')
save(ratings, file = 'sandbox/data/ratings.RData')


# CRSP acquisitions -------------------------------------------------------

acq = dbSendQuery(conn = wrds, statement =
                    "select a.permno, a.distcd, a.exdt, a.acperm
                      from crsp.msedist as a
                      "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(acq, file = 'sandbox/data/mCRSPDistributionInfo.csv')
save(acq, file = 'sandbox/data/mCRSPDistributionInfo.RData')

# IBES --------------------------------------------------------------------

# EPS analyst expectations
eps = dbSendQuery(conn = wrds,
                  statement =
                    "
                  select a.ticker, a.statpers, a.measure, a.fpi, a.numest, a.medest,
                         a.meanest, a.stdev, a.fpedats, a.actual, a.anndats_act
                  from ibes.statsum_epsus as a
                  where a.fpi = '0' or a.fpi = '1' or a.fpi = '6'
                 "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(eps, file = 'sandbox/data/IBES.csv')
save(eps, file = 'sandbox/data/IBES.RData')

# EPS unadjusted actuals
epsAct = dbSendQuery(conn = wrds,
                     statement =
                       "
                  select a.ticker, a.statpers, a.measure, a.int0a, a.shout, a.fy0a
                  from ibes.actpsumu_epsus as a
                  where measure = 'EPS'
                  "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(epsAct, file = 'sandbox/data/IBESUnadjustedActuals.csv')
save(epsAct, file = 'sandbox/data/IBESUnadjustedActuals.RData')


# IBES analyst recommendations file
recd = dbSendQuery(conn = wrds,
                   statement =
                     "
                       select a.ticker, a.estimid, a.ereccd, a.etext, a.ireccd, a.itext, a.emaskcd,
                              a.amaskcd, a.anndats
                       from ibes.recddet as a
                       where a.usfirm = '1'
                       "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

#data.table::fwrite(recd, file = 'sandbox/data/IBES_Recommendations.csv')
save(recd, file = 'sandbox/data/IBES_Recommendations.RData')



# Linking tables ----------------------------------------------------------

# Manual attempt
# ccm = dbSendQuery(conn = wrds,
#                   statement =
#                     "
#                      select distinct a.gvkey, a.conm, a.tic, a.cusip, -- a.cik, a.sich, a.naicsh, -- a.tsymbol, (tsymbol adds duplicates)
#                             b.lpermno, b.lpermco, b.linkdt, b.linkenddt, b.liid, b.linkprim
#                      from compd.funda as a, crsp.ccmxpf_lnkhist as b
#                      where a.gvkey=b.gvkey
#                      and b.linkprim in ('P', 'C')
#                      and b.LINKTYPE in ('LU', 'LC')
#                      and a.datadate>= b.linkdt
#                      and (a.datadate <= b.linkenddt or b.linkenddt is null)
#                      and a.consol = 'C'
#                      and a.popsrc = 'D'
#                      and a.datafmt = 'STD'
#                      and a.curcd = 'USD'
#                      and a.indfmt = 'INDL'
#                      "
# ) %>%
#   # Pull data
#   dbFetch(n = -1) %>%
#   # If more than one row for lpermno, date range, keep prioritized link
#   distinct()

# Replicating WRDS web interface query (see email correspondence with WRDS)
ccm = dbSendQuery(conn = wrds,
                  statement =
                    "
                     select a.gvkey, a.conm, a.tic, a.cusip, a.cik, a.sic, a.naics, b.linkprim, b.linktype, b.liid,
                            b.lpermno, b.lpermco, b.linkdt, b.linkenddt
                     from comp.names as a
                     inner join crsp.ccmxpf_lnkhist as b
                     on a.gvkey = b.gvkey
                     where b.linktype in ('LC', 'LU')
                     and b.linkprim in ('P', 'C')
                     order by a.gvkey
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)


#data.table::fwrite(ccm, file = 'sandbox/data/CCMLinkingTable.csv')
save(ccm, file = 'sandbox/data/CCMLinkingTable.RData')

# IPO dates ---------------------------------------------------------------
tmp <- tempfile()
download.file('https://site.warrington.ufl.edu/ritter/files/2019/05/age19752019.xlsx',
              destfile = tmp,
              method = 'curl')

#ipos = read_excel(path = tmp) %>%
#  transmute(Founding = Found,
#            Offerdate = `Offer date`,
#            CRSPperm = PERM)

#data.table::fwrite(ipos, file = 'sandbox/data/IPODates.csv')

# FRED data ---------------------------------------------------------------

# CPI
#cpi = fredr(series_id = 'CPIAUCSL') %>%
#  select(-series_id)

#data.table::fwrite(cpi, file = 'sandbox/data/CPI.csv')

# GNP deflator
#gnpdefl  = fredr(series_id = 'GNPCTPI') %>%
#  select(-series_id)

#data.table::fwrite(gnpdefl, file = 'sandbox/data/GNPCTPI.csv')


# Broker-Dealer financial assets and liabilities
#temp = fredr(series_id = 'BOGZ1FL664090005Q') %>%
#  transmute(date,
#            assets = value) %>%
#  full_join(fredr(series_id = 'BOGZ1FL664190005Q') %>%
#              transmute(date,
#                        liab = value)) %>%
#  full_join(fredr(series_id = 'BOGZ1FL665080003Q') %>%
#              transmute(date,
                        equity = value)) %>%
#  filter(year(date) >= 1968) %>%
#  mutate(
#    lev = assets/equity  # /(assets-liab)
#    , levfacnsa = log(lev) - log(dplyr::lag(lev,1))
#    , qtr = quarter(date)
#    , year = year(date)
#  )

## compute seasonal adjustment
# temp0 = temp
# tempw1 = temp0 %>%
#  select(year,levfacnsa,qtr) %>%
#  spread(qtr,levfacnsa)

# tempmat = tempw1[,2:5] %>% as.matrix
# rownames(tempmat) = tempw1$year

# qtrmeanavail = array(0L,dim(tempmat))
# for (t in seq(3,dim(tempmat)[1])){
#  qtrmeanavail[t,] = colMeans(tempmat[1:t-1,],na.rm=T)
#}

## adjust in wide matrix format and reshape back (last step)
#tempmatsa = tempmat - qtrmeanavail
#tempw2 = tempmatsa %>% as.data.frame %>%
#  mutate(year = rownames(tempmatsa))

#templong = tempw2 %>%
#  gather(qtr,levfac,-year) %>%
#  mutate(year = as.numeric(year), qtr = as.numeric(qtr)) %>%
#  arrange(year,qtr)

# brokerLev = templong %>% filter(!is.na(levfac))

# data.table::fwrite(brokerLev, file = 'sandbox/data/brokerLev.csv')


# 3-month T-bill rate (quarterly)
#temp = fredr(series_id = 'TB3MS',
#             frequency = 'q',
#             aggregation_method = 'avg') %>%
#  transmute(
#    TbillRate3M = value/100
#    , qtr = quarter(date)
#    , year = year(date))

# data.table::fwrite(temp, file = 'sandbox/data/TBill3M.csv')

# VIX data
# vix = fredr(series_id = 'VXOCLS') %>%
#  select(-series_id)

# data.table::fwrite(vix, file = 'sandbox/data/VIX.csv')

# Factors (Fama French, Liquidity) ----------------------------------------

# Daily
dff = dbSendQuery(conn = wrds,
                  statement =
                    "
                     select date, mktrf, smb, hml, rf, umd
                     from ff.factors_daily
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

data.table::fwrite(dff, file = 'sandbox/data/dFamaFrench.csv')


# Monthly
mff = dbSendQuery(conn = wrds,
                  statement =
                    "
                     select date, mktrf, smb, hml, rf, umd
                     from ff.factors_monthly
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

data.table::fwrite(mff, file = 'sandbox/data/mFamaFrench.csv')

# Liquidity factor (Use more recent data from Pastor's website?)
# https://faculty.chicagobooth.edu/lubos.pastor/research/liq_data_1962_2018.txt

mLiquidity = dbSendQuery(conn = wrds,
                         statement =
                           "
                         select date, ps_innov
                         from ff.liq_ps
                         "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

data.table::fwrite(mLiquidity, file = 'sandbox/data/mLiquidityFactor.csv')

# Market returns ----------------------------------------------------------

# Value-weighted and equal-weighted market returns from CRSP (monthly)
mMarket = dbSendQuery(conn = wrds,
                      statement =
                        "
                     select date, vwretd, ewretd, usdval
                     from crsp.msi
                     "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

data.table::fwrite(mMarket, file = 'sandbox/data/mMarket.csv')

# Value-weighted and equal-weighted market returns from CRSP (daily)
dMarket = dbSendQuery(conn = wrds,
                      statement =
                        "
                      select date, vwretd, ewretd, usdval
                      from crsp.dsi
                      "
) %>%
  # Pull data
  dbFetch(n = numRowsToPull)

data.table::fwrite(dMarket, file = 'sandbox/data/dMarket.csv')



# OptionMetrics  --------------------------------------------------------

# in a separate file because it's so involved
# takes about 3 hours
# tic = proc.time()
# source("11_DownloadOptionsAndProcess.R")
# timer_min = (proc.time() - tic)/60
# timer_min


# 13F data ----------------------------------------------------------------

# also in a separate file because it's so involved
# takes about 20 min
# print("13f download and process")
# tic = proc.time()
# source("1c_Download13FAndProcess.R")
# timer_min = (proc.time() - tic)/60
# timer_min


# Patent citation data ----------------------------------------------------

tmp <- tempfile()
# Download 1
download.file("http://www.nber.org/~jbessen/dynass.dta.zip",
              destfile = tmp,
              method = 'auto')

dynass = haven::read_dta(unz(tmp,"dynass.dta"))

data.table::fwrite(dynass, file = 'sandbox/data/dynass.csv')

# Download 2
download.file("http://www.nber.org/~jbessen/cite76_06.dta.zip",
              destfile = tmp,
              method = 'auto')

cite76_06 = haven::read_dta(unz(tmp,"cite76_06.dta"))

data.table::fwrite(cite76_06, file = 'sandbox/data/cite76_06.csv')

# Download 3
download.file("http://www.nber.org/~jbessen/pat76_06_assg.dta.zip",
              destfile = tmp,
              method = 'auto')

pat76_06_assg = haven::read_dta(unz(tmp,"pat76_06_assg.dta"))

data.table::fwrite(pat76_06_assg, file = 'sandbox/data/pat76_06_assg.csv')


# Q Factor Model ----------------------------------------------------------

df = read.csv(file = 'http://global-q.org/uploads/1/2/2/6/122679606/q5_factors_daily_2019.csv') %>%
  select(-R_EG)

data.table::fwrite(df, file = 'sandbox/data/D_qfactor.csv')


# Input-Output data -------------------------------------------------------

# Make table before 1997
download.file("https://apps.bea.gov/industry/xls/io-annual/IOMake_Before_Redefinitions_1963-1996_Summary.xlsx",
              destfile = 'sandbox/data/IOMake_Before_Redefinitions_1963-1996_Summary.xlsx',
              mode = 'wb')

# Use table before 1997
download.file("https://apps.bea.gov/industry/xls/io-annual/IOUse_Before_Redefinitions_PRO_1963-1996_Summary.xlsx",
              destfile = 'sandbox/data/IOUse_Before_Redefinitions_PRO_1963-1996_Summary.xlsx',
              mode = 'wb')

# Tables starting in 1997
tmp = tempfile()
download.file("https://apps.bea.gov//industry/iTables%20Static%20Files/AllTablesSUP.zip",
              destfile = tmp,
              method = 'auto')

unzip(tmp,
      files = c('Supply_1997-2018_SUM.xlsx', 'Use_SUT_Framework_1997-2018_SUM.xlsx'),
      exdir = '../DataRaw')
JustinMShea/ExpectedReturns documentation built on Sept. 9, 2023, 9:41 p.m.