data-raw/main.R

#!/usr/bin/Rscript --vanilla 

#-------------------------------------------------------------------------------
#
# Stanford CS PhD Graduates
# Upwork Contract #15366706
#
# Main code
#
# Code style follows 'Google R Style Guide'
# https://google-styleguide.googlecode.com/svn/trunk/Rguide.xml
#
#-------------------------------------------------------------------------------

setwd("~/Dropbox/tools/techbios/data-raw")

rm(list = ls(all = TRUE))
gc(reset = TRUE)
set.seed(12345)

library(stringi)
library(plyr)
library(magrittr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(RSQLite)

# Data Setup--------------------------------------------------------------------

db <- dplyr::src_sqlite("stanford_cs_phd.db")

company <- tbl(db, "Company") %>% collect()
person <- tbl(db, "Person") %>% collect()
experience <- tbl(db, "Experience") %>% collect()
education <- tbl(db, "Education") %>% collect()
major <- tbl(db, "Major") %>% collect()
degree <- tbl(db, "Degree") %>% collect()
title <- tbl(db, "Title") %>% collect()
school <- tbl(db, "School") %>% collect()

# Data Pre-processing-----------------------------------------------------------

education %<>%
  mutate(StartYear = suppressWarnings(as.numeric(StartYear)),
         EndYear = suppressWarnings(as.numeric(EndYear))
         )

school %<>%
  mutate(School = 
           stri_replace_all_fixed(School, "\n", "") %>%
               stri_trim_both()
         )

experience %<>%
  mutate(StartYear = suppressWarnings(as.numeric(StartYear)),
         EndYear = suppressWarnings(as.numeric(EndYear)),
         StartMonth = stri_trans_totitle(StartMonth)) %>%
         mutate(StartMonth = factor(StartMonth, levels = month.name))

# Stanford_ID
stanford_id <- school %>%
  filter(stri_detect_fixed(School, "Stanford")) %$%
  SchoolId

# CS_ID
cs_id <- major %>%
  filter(stri_detect_regex(Major, "(Computer)|(Computing)|(Software)|(Information)|(Informatica)|(Informatics)")) %$%
  MajorId

# PhD_ID
phd_id <- degree %>%
  mutate(phd = 
           stri_replace_all_regex(Degree, "[^a-zA-Z]", "") %>%
           stri_trans_tolower()) %>%
  filter(stri_detect_regex(phd, "(phd)|(doctorate)")) %$%
  DegreeId

# Founder_ID
founder_id <- title %>%
    filter(stri_detect_regex(Title,
                             "(Founder)|(founder)|(Owner)|(owner)|(Partner)|(Creator)")) %$%
  TitleId

# Engineer ID
engineer_id <- title %>%
    filter(stri_detect_regex(Title,
                             "(Engineer)|(Developer)|(Programmer)|(Software)|(Architect)")) %$%
  TitleId

####################################################
#' Get the list of Stanford CS PhD Graduates (GRADS)
#'##################################################

devtools::use_data(person, pkg = "..", overwrite = TRUE)
devtools::use_data(education, pkg = "..", overwrite = TRUE)

GRADS <- 
  inner_join(person, education, by = c("PersonId" = "PersonID")) %>%
  filter(SchoolId %in% stanford_id,
         MajorId %in% cs_id,
         DegreeId %in% phd_id,
         Ongoing == 0)

id_list <- list(
    "founder_id" = founder_id,
    "engineer_id" =  engineer_id,
    "phd_id" = phd_id,
    "cs_id" = cs_id,
    "stanford_id" = stanford_id
)

devtools::use_data(id_list, pkg = "..", overwrite = TRUE)

# RESEARCH QUESTIONS------------------------------------------------------------


## library(techbios)    GRADS <- techbios::StanfordCSgradsPhD
##     data("experience")
##     data("person")



##     career_choices <- GRADS %>%
##         select(PersonId, GraduationYear = EndYear) %>%
##         left_join(experience, by = c("PersonId" = "PersonID")) %>%
##         inner_join(person, by = c("PersonId" = "PersonId")) %>%
##             mutate(
##                 FullName = stri_c(Name, Surname, sep = " "),
##                 Choice = ifelse(TitleID %in% founder_id, "Founder", "Employee") %>%
##             factor(levels = c("Employee", "Founder"))) %>%
##          distinct() %>%
##          group_by(PersonId) %>%
##          arrange(FullName, StartYear, StartMonth) %>%
##              do({
##                  .person <- .    
##                  .person %>%
##                      mutate(PreviousJobs = 0:(n() - 1),
##                             EntrepreneurialJobs = lag(cumsum(as.numeric(Choice) - 1), default = 0),
##                             EmployeeJobs = PreviousJobs - EntrepreneurialJobs)
##              }) %>%
##                  select(Choice, 
##                         TimeDecisionMade = StartYear,
##                         ClassYear = GraduationYear,
##                         FullName,
##                         PreviousJobs,
##                         EntrepreneurialJobs, 
##                         EmployeeJobs)









# Q1: Number of Stanford PhD graduates by year----------------------------------
library(dplyr)
devtools::use_data(GRADS, pkg = "..", overwrite = TRUE)
devtools::use_data(experience, pkg = "..", overwrite = TRUE)

# Q2: Fraction of graduates over time having title "founder" or "co-founder" in work histories post grad
#devtools::use_data(df_2, overwrite = TRUE)

# Q3: Career Trajectory---------------------------------------------------------
GraduatesByYear <- 
  inner_join(person, education, by = c("PersonId" = "PersonID")) %>% 
  group_by(PersonId, Name, Surname) %>%
  summarize(GradYear = max(EndYear, na.rm = TRUE)) %>% 
  na.omit() %>%
  ungroup() %>%
  mutate(rank.order = rank(GradYear, ties.method = "random"))

df_3 <- inner_join(
  inner_join(GraduatesByYear, experience, by = c("PersonId" = "PersonID")),
  company, by = c("CompanyID" = "CompanyId")) %>%
  filter(StartYear >= GradYear) %>%
  mutate(type = ifelse(TitleID %in% engineer_id, "Engineer",
                       ifelse(TitleID %in% founder_id, "Founder", "Other"))) %>%
  group_by(PersonId) %>%
  do({
    person_data <- .
    person_data %>%
      mutate(any.founder = any(type %in% "Founder"),
             any.engineer = any(type %in% "Engineer"))
  })

devtools::use_data(df_3, pkg = "..", overwrite = TRUE)


# Q4: Timeline of a single graduate---------------------------------------------

# Finds maximum year in the dataset
max_year <- max(experience$EndYear, na.rm = TRUE)

df_4 <- inner_join(experience, company, by = c("CompanyID" = "CompanyId")) %>%
  inner_join(title, by = c("TitleID" = "TitleId")) %>%
  inner_join(person, by = c("PersonID" = "PersonId")) %>%
  mutate(FullName = stri_c(Name, Surname, sep = " "),
         EndYear = mapvalues(EndYear, from = NA, to = max_year))

only_grads <- GRADS %>%
  select(PersonId, GraduateYear = EndYear) %>%
  na.omit()

df_4_limited <- df_4 %>%
  left_join(only_grads, by = c("PersonID" = "PersonId")) %>%
  filter(StartYear >= GraduateYear)

devtools::use_data(df_4, pkg = "..", overwrite = TRUE)
devtools::use_data(df_4_limited, pkg = "..", overwrite = TRUE)


# Q5: Career choices------------------------------------------------------------
# Create a data set of individual choices about careers at different points in time

# I'd like to create a dataset consisting of every "career" entry, with the following columns:
# | Choice | TimeDecisionMade | ClassYear | IndividualName
# The chocie should be either employee or founder.
# The time decision made should be when they started that job/position, their class year (from Stanford) and their individual name. 
# Please create an R function that returns this data frame.

# Three more columns to add to this dataset:
# (1) Count of previous jobs at that moment in time
# (2) Count of previous entrepreneurial jobs
# (3) Count of previous employee jobs 


##########################
#' Tech Incubator Datasets 
##########################


 
# stargazer(df.school %>% filter(n > 15 & !School %in% c("Y Combinator\n", "YCombinator\n", "YC", "TechCrunch\n")),
#           title = "Schools of Y Combinator / Techcrunch founders",
#           summary = FALSE,
#           label = "tab:schools", 
#           column.labels = c("Company", "N"), 
#           out = "../../writeup/tables/schools.tex",
#           rownames = FALSE)




## # df.1 <- startups %>%
## #   filter(src == "yc") %>%
## #   group_by(cohort) %>%
## #   summarise(startups = n()) %>%
## #   ungroup()
## # 
## # g.per.yc.cohort <- ggplot(data = df.1) +
## #   geom_line(aes(x = 1:nrow(df.1), y = startups)) +
## #   scale_x_continuous(breaks = 1:nrow(df.1), labels = df.1$cohort) +
## #   theme_bw() +
## #   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
## #   xlab("Cohort") + ylab("Number of Start-Ups")
## # 
## # writeImage(g.per.yc.cohort, "per_yc_cohort", width = 7, height = 4)


# devtools::use_data(career_transitions, pkg = "..", overwrite = TRUE)


## library(data.table)

## df.cohort <- data.table(founders)[, list(num.grads = .N,
##                                          num.founders = sum(was_founder), 
##                                          num.engineers = sum(was_engineer),
##                                          num.both = sum(was_founder & was_engineer),
##                                          frac.founders = sum(was_founder)/.N,
##                                          frac.engineers = sum(was_engineer)/.N
##                                          ), by  = graduation_year]

## ggplot(data = subset(df.cohort, num.grads > 6), aes(x = graduation_year, y = frac.founders)) + geom_line()

## ggplot(data = subset(df.cohort, num.grads > 6), aes(x = graduation_year, y = frac.engineers)) + geom_line()




## ggplot(data = df_2, aes(x = year, y = value)) +
##   geom_line(aes(colour = type), size = 1) +
##   ggtitle("Fraction of founders & engineeres") +
##   ylab("Fraction, % of total graduation pool") +
##   xlab("Year") +
##   theme_light() +
##   theme(legend.title = element_blank(),
##         legend.key = element_blank())
## ggsave(filename = "output/Q2_fraction_of_founders_engineers_by_year.png", width = 9, height = 6)



## #
## # THE CODE BELOW IS ADOPTED FROM PREVIOUS PROJECT
## #


## # library(survival)
## # library(lfe)
## # library(dplyr)
## # library(tidyr)
## # library(stringr)
## # library(lubridate)
## # library(ggplot2)
## # library(magrittr)
## # library(scales)
## # library(binom)
## # library(stargazer)
## # library(data.table)
## # library(JJHmisc) # non-cran resource

## # system("rm ../../writeup/parameters.tex")
## # addParam <- genParamAdder("../../writeup/parameters.tex")


## # df.company <- 
## #   inner_join(experience, company, 
## #              by = c("CompanyID" = "CompanyId")) %>%
## #   group_by(CompanyName) %>%
## #   summarise(n = length(unique(PersonID))) %>%
## #   arrange(desc(n)) %>%
## #   mutate(CompanyName = plyr::mapvalues(CompanyName, "McKinsey & Company", "McKinsey"))
## # 
## # stargazer(df.company %>% filter(n > 9 & !CompanyName %in% c("Stanford University")),
## #           title = "Employers of Stanford CS PhDs graduates",
## #           summary = FALSE,
## #           label = "tab:companies", 
## #           column.labels = c("Company", "N"), 
## #           out = "writeup/tables/companies.tex",
## #           rownames = FALSE)
## # system("sed -i 's/cc/lr/g' ../../writeup/tables/companies.tex")

## # educational data
## # school$SchoolID <- school$SchoolId

## # df.school <- 
## #   inner_join(education, school, by = c("SchoolId")) %>%
## #   group_by(School) %>%
## #   summarise(n = n()) %>%
## #   arrange(desc(n))
## # 
## # stargazer(df.school %>% filter(n > 15 & !School %in% c("Y Combinator\n", "YCombinator\n", "YC", "TechCrunch\n")),
## #           title = "Schools of Y Combinator / Techcrunch founders",
## #           summary = FALSE,
## #           label = "tab:schools", 
## #           column.labels = c("Company", "N"), 
## #           out = "../../writeup/tables/schools.tex",
## #           rownames = FALSE)
## # system("sed -i 's/cc/lr/g' ../../writeup/tables/schools.tex")

## # # Imputes TC cohorts
## # tc_cohorts <- 
## #   read.table("../../data/tc_cohort.csv", sep = ",", header = TRUE, na.strings = "#N/A") %>% 
## #   na.omit() %>%
## #   mutate(year = str_match(Time.of.Techstar, "[0-9]{1,4}"),
## #          season = str_match(Time.of.Techstar, "[[:alpha:]]{1,6}")) %>%
## #   mutate(season = plyr::mapvalues(season, 
## #                                   from = c("Spring", "Fall", "Summer", NA, "Winter"), 
## #                                   to = c("S", "W", "S", "W", "W")),
## #          class_code = str_c(season, str_sub(year, 3, 4))) %>%
## #   tbl_df()
## # 
## # tc_codes <- 
## #   startups %>% 
## #   inner_join(tc_cohorts, by = "Name") %>%
## #   select(Name, class_code, src)
## # 
## # startups %<>% 
## #   left_join(tc_codes, by = c("Name", "src")) %>%
## #   mutate(Class = ifelse(src == "yc", as.character(Class), class_code)) %>%
## #   select(-class_code) %>%
## #   na.omit()
## # 
## # startups %<>%
## #   tidyr::extract(Class, c("Season", "Year"), "([SWPF])([0-9]{2})") %>%
## #   mutate(Year = 2000 + as.numeric(Year)) %>%
## #   unite(cohort, Year, Season, remove = FALSE)
## # 
## # founders %<>% na.omit()
## # 
## # degree %<>%
## #   mutate(undergraduate = 
## #            stri_detect_regex(Degree, "(^BS)|(^B.S)|(^Bachelor)|(^BA)|(^B.A.)|(^B.)"))
## # 
## # major %<>%
## #   mutate(CS = 
## #            str_detect(Major, "(Computer)|(Computing)|(Software)|(Information)|(Informatica)|(Informatics)"),
## #          STEM = 
## #            str_detect(Major, "(Engineering)|(Physics)|(Mathematics)|(Technology)|(Medicine)|(Biophysics)") & !CS,
## #          Social =
## #            str_detect(Major, "(Finance)|(Business)|(Economics)|(Economic)|(Public)|(Political)|(Accounting)|(Psychology)") & !CS & !STEM,
## #          Other = !CS & !STEM & !Social) %>%
## #   gather(Field, Value, -MajorId, -Major) %>%
## #   filter(Value == TRUE) %>%
## #   select(-Value) %>%
## #   arrange(MajorId)
## # 
## # education %<>%
## #   mutate(DegreeId = as.integer(DegreeId),
## #          MajorId = as.integer(MajorId),
## #          StartYear = as.numeric(StartYear),
## #          EndYear = as.numeric(EndYear))
## # 
## # experience %<>%
## #   mutate(StartYear = ifelse(StartYear != "", as.numeric(StartYear), year(Sys.Date())),
## #          EndYear = as.numeric(
## #            ifelse(EndYear %in% c("Present", "", "actuel", "Actualidad", "Heute"),
## #                   year(Sys.Date()), EndYear)),
## #          StartMonth = ifelse(!StartMonth %in% month.name, "January", StartMonth),
## #          EndMonth = ifelse(!EndMonth %in% month.name, "December", EndMonth)) %>%
## #   rowwise() %>%
## #   mutate(StartMonth = which(month.name %in% StartMonth),
## #          EndMonth = which(month.name %in% EndMonth)) %>%
## #   mutate(years = (EndYear*12 + EndMonth - StartYear*12 - StartMonth + 1) / 12) %>%
## #   ungroup() %>% 
## #   tbl_df()
## # 
## # title %<>%
## #   mutate(Founder = 
## #            str_detect(Title, "(Founder)|(founder)|(Owner)|(owner)|(Partner)|(Creator)"),
## #          Business = str_detect(Title, "(CEO)|(CTO)|(VP)|(President)|(Vice)|(Product)") &
## #            !Founder,
## #          Designer = 
## #            str_detect(Title, "(Designer)|(designer)|(Art )|(art )|(Creative)|(creative)|(Photographer)|(Graphic)|(Design)") &
## #            !Founder & !Business,
## #          Developer =
## #            str_detect(Title, "(Engineer)|(Developer)|(Programmer)|(Software)") &
## #            !Founder & !Business & !Designer,
## #          Other = !Founder & !Business & !Designer & !Developer) %>%
## #   gather(Field, Value, -src, -TitleId, -Title) %>%
## #   filter(Value == TRUE) %>%
## #   select(-Value) %>%
## #   arrange(TitleId)
## # 
## # # Questions--------------------------------------------------------------------
## # 
## # # Q1---------------------------------------------------------------------------
## # # Show number of Y combinator companies per cohort
## # df.1 <- startups %>%
## #   filter(src == "yc") %>%
## #   group_by(cohort) %>%
## #   summarise(startups = n()) %>%
## #   ungroup()
## # 
## # g.per.yc.cohort <- ggplot(data = df.1) +
## #   geom_line(aes(x = 1:nrow(df.1), y = startups)) +
## #   scale_x_continuous(breaks = 1:nrow(df.1), labels = df.1$cohort) +
## #   theme_bw() +
## #   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
## #   xlab("Cohort") + ylab("Number of Start-Ups")
## # 
## # writeImage(g.per.yc.cohort, "per_yc_cohort", width = 7, height = 4)
## # 
## # # Q2---------------------------------------------------------------------------
## # # Show mean founder team size per cohort
## # df.2 <- inner_join(founders, startups, by = c("Name", "src")) %>%
## #   group_by(cohort, Name) %>%
## #   summarise(founders = n()) %>%
## #   group_by(cohort) %>%
## #   summarise(mean_team = mean(founders),
## #             max_team = max(founders),
## #             min_team = min(founders), 
## #             se = sd(founders) / sqrt(n())) %>%
## #   mutate(lower = mean_team - 2*se,
## #          upper = mean_team + 2*se)
## # 
## # g.team.size <- ggplot(data = df.2, aes(x = 1:nrow(df.2))) +
## #   geom_line(aes(y = mean_team)) +
## #   geom_line(aes(y = max_team), colour = "red") +
## #   geom_line(aes(y = min_team), colour = "blue") + 
## #   geom_errorbar(aes(ymax = upper, ymin = lower),
## #                 alpha = 0.4, width = 0.3) +
## #   scale_x_continuous(breaks = 1:nrow(df.2), labels = df.2$cohort) +
## #   theme_bw() +
## #   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
## #   xlab("Cohort") + ylab("Mean Team Size")
## # 
## # writeImage(g.team.size, "team_size", width = 7, height = 4)
## # 
## # 
## # # Q3---------------------------------------------------------------------------

# Plot overall distribution of undergraduate majors of all founders 
# and also faceted by cohort

## df.3a %<>% cbind(binom.confint(x = c(df.3a$n),
##                                n = sum(df.3a$n), 
##                                conf.level = 0.95,
##                                tol = 1e-8,
##                                methods = "exact") %>%
##                    select(lower, upper)) %>%
##   mutate(lower_n = round(lower * sum(n)),
##          upper_n = round(upper * sum(n)))

## g.major <- ggplot(data = df.3a) +
##   geom_bar(aes(x = factor(Major), y = n), stat = "identity", colour = "black", fill = "grey") + 
##   geom_errorbar(aes(x = factor(Major), ymin = lower_n, ymax = upper_n), width = 0.25) +
##   scale_x_discrete(labels = abbreviate(df.3a$Major, 20)) +
##   theme_bw() + 
##   xlab("Major") + 
##   ylab("Frequency")

## writeImage(g.major, "college_major", width = 7, height = 3)
## # 
## # df.3b <- all.persons %>%
## #   group_by(Major, cohort) %>%
## #   summarise(n = n()) %>%
## #   ungroup() %>%
## #   arrange(cohort, desc(n)) %>%
## #   group_by(cohort) %>%
## #   mutate(percentage = n / sum(n))
## # 
## # df.3b$Major <- factor(df.3b$Major, levels = df.3a$Major)
## # df.3b$cohort_level <- as.numeric(factor(df.3b$cohort))
## # 
## # g.college.major.by.cohort <- ggplot(data = df.3b) +
## #   geom_bar(aes(x = factor(Major), y = n), stat = "identity", alpha = 0.9) + 
## #   facet_wrap(~ cohort) + 
## #   theme_bw() + 
## #   xlab("Major") + 
## #   ylab("Frequency")
## # 
## # writeImage(g.college.major.by.cohort, "college_major_by_cohort", width = 7, height = 3)
## # 
## # g.college.major.by.cohort.lines <- ggplot(data = df.3b) + 
## #   theme_bw() +
## #   geom_line(aes(x = cohort_level, y = n, colour = Major)) +
## #   scale_x_discrete(breaks = 1:length(unique(df.3b$cohort)), labels = unique(df.3b$cohort)) +
## #   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
## #   xlab("Cohort") + ylab("Frequency")
## # 
## # writeImage(g.college.major.by.cohort.lines, "college_major_by_cohort_lines", width = 7, height = 3)
## # 
## # # Q4---------------------------------------------------------------------------
## # # Plot years of work experience at time of start of y combinator session 
## # # and faceted by cohort
## # 
## # df.4 <- inner_join(all.persons, experience, by = c("PersonId" = "PersonID", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   group_by(PersonId, cohort) %>%
## #   filter((Year*12 + Start) > (StartYear*12 + StartMonth)) %>%
## #   mutate(fraction = ifelse((EndYear*12 + EndMonth) <= (Year*12 + Start), 
## #                            years,
## #                            (Year*12 + Start - StartYear*12 - StartMonth) / 12)) %>%
## #   summarise(prior = min(sum(fraction), max(EndYear) - min(StartYear) + 1))
## # 
## # df.4 <- left_join(all.persons, df.4, by = c("PersonId", "cohort")) %>%
## #   select(PersonId, cohort, prior) %>%
## #   distinct()
## # 
## # g.founder.experience <- ggplot(data = df.4) +
## #   geom_histogram(aes(x = prior), binwidth = 3, alpha = 0.9, fill = "gray", colour = "black") +
## #   xlab("Years of experience") + 
## #   ylab("Frequency") +
## #   theme_bw()
## # 
## # writeImage(g.founder.experience, "founder_experience", width = 7, height = 3)
## # 
## # df.4$cohort_level <- as.numeric(factor(df.4$cohort))
## # 
## # g.founder.experience.by.cohort <- ggplot(data = df.4) +
## #   geom_histogram(aes(x = prior), binwidth = 3, alpha = 0.9) +
## #   facet_wrap(~ cohort) +
## #   xlab("Years of experience") + 
## #   ylab("Frequency") +
## #   theme_bw()
## # 
## # writeImage(g.founder.experience.by.cohort, "founder_experience_by_cohort", width = 7, height = 3)
## # 
## # df.4 %<>%
## #   group_by(cohort) %>%
## #   mutate(q25 = quantile(prior, c(.25), na.rm = TRUE),
## #          q50 = quantile(prior, c(.50), na.rm = TRUE),
## #          q75 = quantile(prior, c(.75), na.rm = TRUE),
## #          mean = mean(prior, na.rm = TRUE)) %>%
## #   na.omit()
## # 
## # g.founder.experience.by.cohort.lines <- ggplot(data = df.4) +
## #   theme_bw() +
## #   geom_point(aes(x = cohort_level, y = prior)) +
## #   geom_ribbon(aes(x = cohort_level, ymin = q25, ymax = q75), 
## #               alpha = 0.5, fill = "lightgreen") +
## #   geom_line(aes(x = cohort_level, y = q50, colour = "50%")) +
## #   geom_line(aes(x = cohort_level, y = mean, colour = "Mean"), linetype = "dashed") + 
## #   scale_x_discrete(breaks = 1:length(unique(df.4$cohort)), labels = unique(df.4$cohort)) +
## #   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
## #   theme(legend.title = element_blank(), legend.key = element_blank()) + 
## #   xlab("Cohort\n* Green area is 25% and 75% percentiles") +
## #   ylab("Years of experience") + 
## #   ggtitle("Experience of co-founders by cohort")
## # 
## # writeImage(g.founder.experience.by.cohort.lines, "founder_experience_by_cohort_lines", width = 7, height = 3)
## # 
## # # Q5---------------------------------------------------------------------------
## # # What fraction of founders specifically majored in CS?
## # 
## # df.3a$n[df.3a$Major == "CS"] / sum(df.3a$n)
## # 
## # df.5 <- df.3b %>%
## #   group_by(cohort) %>%
## #   mutate(CS = ifelse(Major == "CS", 1, 0)) %>%
## #   summarise(fraction = sum(CS * n) / sum(n))
## # 
## # g.cs.majors <- ggplot(data = df.5) +
## #   geom_line(aes(x = 1:nrow(df.5), y = fraction)) +
## #   scale_x_continuous(breaks = 1:nrow(df.5), labels = df.5$cohort) +
## #   scale_y_continuous(label = percent, limits = c(0,1)) +
## #   theme_bw() +
## #   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
## #   xlab("Cohort") +
## #   ylab("Fraction of founders with degree in CS") 
## # 
## # 
## # writeImage(g.cs.majors, "cs_majors", width = 7, height = 3)
## # 
## # 
## # # Q6---------------------------------------------------------------------------
## # # Create a visualization that shows the timeline of a single 
## # # Y combinator start-up - birth, founders leaving, death and exit
## # 
## # .person <- 339
## # 
## # df.6 <- inner_join(experience, company, by = c("CompanyID" = "CompanyId", "src" = "src")) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src")) %>%
## #   filter(PersonID == .person)
## # 
## # .title <- person %>% 
## #   filter(PersonId == .person) %$%
## #   str_c(Name, Surname, sep = " ")
## # 
## # g.person <- ggplot(data = df.6) +
## #   geom_rect(aes(xmin = StartYear, xmax = EndYear,
## #                 ymin = ExperienceID, ymax = ExperienceID + 1), fill = "grey", alpha = 0.5) +
## #   geom_text(aes(x = (StartYear + EndYear)/2, y = ExperienceID + 3/4, label = CompanyName)) +
## #   geom_text(aes(x = (StartYear + EndYear)/2, y = ExperienceID + 1/4, label = Title)) +
## #   theme_bw() + 
## #   theme(axis.text.y = element_blank()) + 
## #   xlab("Year") + ylab("") + ggtitle(.title)
## # 
## # print(g.person)
## # 
## # 
## # # Q7---------------------------------------------------------------------------
## # # What fraction of founders worked as a software engineer, developer, programmer
## # # or equivalent prior to Y combinator?
## # 
## # df.7 <- inner_join(all.persons, experience, by = c("PersonId" = "PersonID", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   group_by(PersonId) %>%
## #   filter((Year*12 + Start) > (StartYear*12 + StartMonth)) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src")) %>%
## #   summarise(was_developer = "Developer" %in% Field) %>%
## #   group_by(was_developer) %>%
## #   summarise(n = n())
## # 
## # df.7$percentage <- df.7$n / sum(df.7$n)
## # df.7
## # 
## # g.was.developer <- ggplot(data = df.7) +
## #   geom_bar(aes(x = was_developer, y = percentage, width = 0.5), stat = "identity", alpha = 0.9) +
## #   theme_bw() +
## #   xlab("Was developer prior Y-Combinator?") + ylab("Percentage")
## # 
## # writeImage(g.was.developer, "was_developer", width = 7, height = 3)
## # 
## # 
## # 
## # # Q8---------------------------------------------------------------------------
## # # What fraction of founders worked as a software engineer, developer, programmer 
## # # or equivalent after Y combinator?
## # 
## # df.8 <- inner_join(all.persons, experience, by = c("PersonId" = "PersonID", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   group_by(PersonId) %>%
## #   filter((Year*12 + Start + 3) < (StartYear*12 + StartMonth)) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src")) %>%
## #   summarise(become_developer = "Developer" %in% Field) %>%
## #   group_by(become_developer) %>%
## #   summarise(n = n())
## # 
## # df.8$percentage <- df.8 %$% n / sum(df.8$n)
## # df.8
## # 
## # g.later.developer <- ggplot(data = df.8) +
## #   geom_bar(aes(x = become_developer, y = percentage, width = 0.5), stat = "identity", alpha = 0.9) +
## #   theme_bw() +
## #   xlab("Become developer after Y-Combinator?") + ylab("Percentage")
## # 
## # writeImage(g.later.developer, "later_developer", width = 7, height = 3)
## # 
## # 
## # 
## # # Q9---------------------------------------------------------------------------
## # # What fraction of founder teams have a founder who was either a engineer of some kind or
## # # had an undergraduate or graduate degree in a technical field?
## # 
## # df.9 <- inner_join(all.persons, experience, by = c("PersonId" = "PersonID", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   group_by(PersonId, First.Name, Last.Name, Name) %>%
## #   filter((Year*12 + Start) > (StartYear*12 + StartMonth)) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src")) %>%
## #   summarise(technical_guy = "Developer" %in% Field | "CS" %in% Major | "STEM" %in% Major) %>%
## #   group_by(Name) %>%
## #   summarise(has_technical_guys = sum(technical_guy) > 0) %>%
## #   group_by(has_technical_guys) %>%
## #   summarise(percentage = n()) %>%
## #   mutate(percentage = percentage / sum(percentage))
## # 
## # df.9
## # 
## # g.any.eng <- ggplot(data = df.9) +
## #   geom_bar(aes(x = has_technical_guys, y = percentage, width = 0.5), stat = "identity", alpha = 0.9) +
## #   theme_bw() +
## #   xlab("Has engineer or co-founder with technical degree?") + ylab("Percentage")
## # 
## # writeImage(g.any.eng, "any_eng", width = 7, height = 3)
## # 
## # 
## # hasEng <- subset(df.9, has_technical_guys)$percentage
## # addParam("hasEng", 100*round(hasEng, 2))
## # 
## # # Q10--------------------------------------------------------------------------
## # # For each Y combinator cohort, show the fractions of founders that: 
## # # (a) are still at their original start-up 
## # # (b) are the founder/co-founder of another start-up
## # # (c) have an employee role at some other company
## # 
## # df.10 <- inner_join(experience, company, by = c("CompanyID" = "CompanyId", "src" = "src")) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src")) %>%
## #   inner_join(all.persons, by = c("PersonID" = "PersonId", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   select(cohort, Year, Season, Start, PersonID, Name, CompanyName, StartYear, StartMonth, EndYear, EndMonth, Title, Field) %>%
## #   distinct() %>%
## #   mutate(after_startup = (EndYear*12 + EndMonth) > (Year*12 + Start)) %>%
## #   mutate(still_in_startup = (Name == CompanyName) & after_startup,
## #          founder_of_another_startup = (Name != CompanyName) & (CompanyName %in% startups$Name) & after_startup,
## #          employee_at_other_company = (Name != CompanyName) & !(CompanyName %in% startups$Name) & (Field != "Founder") & after_startup) %>%
## #   group_by(cohort, PersonID) %>%
## #   summarise(still_in_startup = any(still_in_startup),
## #             founder_of_another_startup = any(founder_of_another_startup),
## #             employee_at_other_company = any(employee_at_other_company)) %>%
## #   group_by(cohort) %>%
## #   summarise(employee_at_other_company = sum(employee_at_other_company) / n(),
## #             still_in_startup = sum(still_in_startup) / n(),
## #             founder_of_another_startup = sum(founder_of_another_startup) / n()) 
## # 
## # df.10
## # 
## # df.10 %<>% gather(key, value, -cohort)
## # .x <- rep(1:(nrow(df.10)/3), 3)
## # 
## # pretty.roles <- list("employee_at_other_company" = "Employee at another company", "still_in_startup" = "At original startup", "founder_of_another_startup" = "Founder of another startup")
## # df.10$role <- with(df.10, as.vector(unlist(pretty.roles[key])))
## # 
## # 
## # g.where.now <- ggplot(data = df.10) +
## #   geom_bar(aes(x = .x, y = value), stat = "identity", position = "dodge", colour = "black", fill = "grey") +
## #   facet_wrap(~role) + 
## #   scale_x_continuous(breaks = 1:nrow(df.1), labels = df.1$cohort) +
## #   theme_bw() +
## #   xlab("Cohort") + ylab("Fraction of founders") + coord_flip()  
## # 
## # writeImage(g.where.now, "where_now", width = 9, height = 6)
## # 
## # 
## # # Q11--------------------------------------------------------------------------
## # # Of the employees at some other company, what fraction are: 
## # # (a) Engineers/developers of some kind
## # # (b) In a business role (CEO, CTO, VP, Product Manager) 
## # # (c) Other
## # 
## # df.11 <- inner_join(experience, company, by = c("CompanyID" = "CompanyId", "src" = "src")) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src")) %>%
## #   inner_join(all.persons, by = c("PersonID" = "PersonId")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   select(cohort, Year, Season, Start, PersonID, Name, CompanyName, StartYear, StartMonth, EndYear, EndMonth, Title, Field) %>%
## #   distinct() %>%
## #   mutate(after_startup = (EndYear*12 + EndMonth) > (Year*12 + Start)) %>%
## #   mutate(employee_at_other_company = (Name != CompanyName) & !(CompanyName %in% startups$Name) & (Field != "Founder") & after_startup) %>%
## #   filter(employee_at_other_company == TRUE) %>%
## #   group_by(cohort) %>%
## #   summarise(Developers = sum(Field %in% c("Developer", "Designer")) / n(),
## #             Business = sum(Field == "Business") / n(),
## #             Other = sum(Field == "Other") / n())
## # 
## # df.11
## # 
## # df.11 %<>% gather(key, value, -cohort)
## # .x <- rep(1:(nrow(df.11)/3), 3)
## # 
## # g.other.role <- ggplot(data = df.11) +
## #   geom_bar(aes(x = .x, y = value), stat = "identity", position = "dodge") +
## #   facet_wrap(~key) + 
## #   scale_x_continuous(breaks = 1:nrow(df.1), labels = df.1$cohort) +
## #   theme_bw() +
## #   xlab("Cohort") +
## #   ylab("Employee roles in other companies") + coord_flip()  
## # 
## # print(g.other.role)
## # 
## # writeImage(g.other.role, "other_role", width = 7, height = 6)
## # 
## # 
## # # Q12--------------------------------------------------------------------------
## # # Fit a hazard model at the level of start-up, where a “death” is that a 
## # # majority of founder/co-founders have changed firms. Time should be from the 
## # # start of their Y Combinator session. 
## # #
## # # (a) Test the hypothesis that all cohorts have the same hazard rate. 
## # # (b) Plot survival curves pooled across cohorts
## # # (c) Plot survival curves by cohort
## # 
## # persons <- inner_join(experience, company, by = c("CompanyID" = "CompanyId", "src" = "src")) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId", "src" = "src"))
## # 
## # death <- function(df) {
## #   # Helper function to calculate 'death' statistics
## #   #
## #   # Args:
## #   #  df: data.frame for single company
## #   #
## #   # Returns:
## #   #  death - logical
## #   #  months - lifetime
## #   
## #   .cohort <- df$cohort[1]
## #   .is_left <- as.numeric(sum(df$is_left) / nrow(df))
## #   .months <- df %>% 
## #     filter(is_left == .is_left) %>%
## #     summarise(months = max(months))
## #   
## #   data_frame(cohort = .cohort, death = .is_left, months = .months$months)
## # }
## # 
## # df.12 <- 
## #   founders %>% 
## #   inner_join(startups, by = c("Name", "src")) %>%
## #   left_join(persons, by = c("Name" = "CompanyName", "src" = "src")) %>% 
## #   filter(Field %in% c("Founder", NA)) %>%
## #   mutate(start = str_c(Year, ifelse(Season == "S", "06", "12"), "01", sep = "-") %>% ymd()) %>%
## #   na.omit() %>%
## #   group_by(Name, start, Last.Name, First.Name) %>%
## #   summarise(cohort = first(cohort),
## #             is_left = !first(Ongoing),
## #             period_left = str_c(first(EndYear), 
## #                                 ifelse(is_left, first(EndMonth), sprintf("%02d", month(Sys.Date()))),
## #                                 "01", sep = "-") %>% ymd()) %>%
## #   mutate(months = days(period_left - start) %>% day() %>% `%/%`(30)) %>%
## #   mutate(months = ifelse(months > 0, months, 0)) %>%
## #   group_by(Name) %>%
## #   do(death(.)) %>%
## #   ungroup()
## # 
## # df.12
## # 
## # # (b)
## # fit.1 <- survfit(Surv(months, death) ~ 1, data = df.12)
## # summary(fit.1)
## # plot(fit.1)
## # 
## # fit.1.plot <- fit.1 %$% 
## #   data_frame(time, surv, lower, upper)
## # 
## # g.surv.pooled <- ggplot(data = fit.1.plot, aes(x = time)) +
## #   geom_step(aes(y = surv), size = 1, direction = "vh") + 
## #   geom_step(aes(y = lower), linetype = "dashed", direction = "vh") +
## #   geom_step(aes(y = upper), linetype = "dashed", direction = "vh") + 
## #   theme_bw() + 
## #   xlab("Time, months") + ylab("Survival rate")
## # 
## # print(g.surv.pooled)
## # writeImage(g.surv.pooled, "surv_pooled", width = 7, height = 6)
## # 
## # # (c)
## # fit.2 <- survfit(Surv(months, death) ~ cohort, data = df.12)
## # sum.2 <- summary(fit.2)
## # sum.2
## # plot(fit.2)
## # 
## # fit.2.plot <- sum.2 %$% 
## #   data_frame(time, surv, strata) %>%
## #   mutate(strata = str_extract(strata, "[0-9]{4}_[SW]"))
## # 
## # has.data <- fit.2.plot %>%
## #   group_by(strata) %>%
## #   summarise(n = n()) %>%
## #   filter(n > 1) %>%
## #   select(-n)
## # 
## # fit.2.plot %<>% filter(strata %in% has.data$strata)
## # 
## # g.surv <- ggplot(data = fit.2.plot, aes(x = time)) +
## #   geom_step(aes(y = surv, colour = strata), direction = "vh") + 
## #   xlab("Time, months") + ylab("Survival rate") + 
## #   guides(colour = guide_legend(ncol = 2)) + 
## #   theme_bw()
## # 
## # print(g.surv)
## # writeImage(g.surv, "surv", width = 7, height = 6)
## # 
## # # (a)
## # 
## # # Whole sample
## # dif <- survdiff(Surv(months, death) ~ cohort, data = df.12)
## # dif
## # 
## # # Just two cohorts
## # dif <- survdiff(Surv(months, death) ~ cohort, 
## #                 data = df.12 %>% filter(cohort %in% c("2009_S", "2012_S")))
## # dif
## # 
## # # Q13--------------------------------------------------------------------------
## # # From the basic model is 12, add as covariates: 
## # # (a) Average work experience of founders
## # # (b) Average years of education 
## # # (c) Fraction of founders with prior startup founder or co-founder experience 
## # 
## # df.13 <- df.12
## # 
## # # Imputes average prior experience of founders
## # prior_expirience <- inner_join(all.persons, 
## #                                experience, by = c("PersonId" = "PersonID", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   group_by(PersonId, cohort, Name) %>%
## #   filter((Year*12 + Start) > (StartYear*12 + StartMonth)) %>%
## #   mutate(fraction = ifelse((EndYear*12 + EndMonth) <= (Year*12 + Start), 
## #                            years,
## #                            (Year*12 + Start - StartYear*12 - StartMonth) / 12)) %>%
## #   summarise(experience = min(sum(fraction), max(EndYear) - min(StartYear) + 1)) %>%
## #   group_by(Name, cohort) %>%
## #   summarise(experience = mean(experience))
## # 
## # df.13 %<>% left_join(prior_expirience, by = c("Name", "cohort"))
## # 
## # # Imputes average years of education
## # prior_education <- founders %>% 
## #   inner_join(startups, by = c("Name", "src")) %>%
## #   left_join(persons, by = c("Name" = "CompanyName", "src" = "src")) %>% 
## #   filter(Field == "Founder") %>%
## #   select(Name, PersonID, Year) %>%
## #   left_join(education, by = c("PersonID")) %>%
## #   mutate(education = ifelse(Year > EndYear, EndYear - StartYear + 1,
## #                             ifelse(Year < StartYear, 0,
## #                                    Year - StartYear + 1))) %>%
## #   group_by(Name) %>%
## #   summarise(education = mean(education, na.rm = TRUE))
## # 
## # df.13 %<>% left_join(prior_education, by = "Name")
## # 
## # # Imputes prior startup founder or co-founder experience
## # prior_founder <- 
## #   inner_join(all.persons, experience, by = c("PersonId" = "PersonID", "src" = "src")) %>%
## #   mutate(Start = ifelse(Season == "Winter", 12, 6)) %>%
## #   filter((Year*12 + Start) > (StartYear*12 + StartMonth)) %>%
## #   inner_join(title, by = c("TitleID" = "TitleId")) %>%
## #   filter(Field == "Founder") %>% 
## #   select(Name, Last.Name, First.Name, Year) %>%
## #   unite(Founder, First.Name, Last.Name, sep = " ") %>%
## #   distinct() %>%
## #   group_by(Name)
## # 
## # find_prior_founders <- function(df, full = prior_founder) {
## #   # Helper function to find prior founders of other startups
## #   #
## #   # Args:
## #   #  df: data.frame with founders for single startup
## #   #  full: data.frame for all startups/founders
## #   #
## #   # Returns:
## #   #  Fraction of founders who had experience as founders
## #   
## #   .startup <- df$Name[1]
## #   .year <- df$Year[1]
## #   
## #   prior <- full %>%
## #     filter(Founder %in% df$Founder, Name != .startup, Year < .year) %>%
## #     group_by(Founder) %>%
## #     summarise(n = n())
## #   
## #   data_frame(founder = nrow(prior) / nrow(df))
## # }
## # 
## # # !!! Long operation
## # prior_founder %<>% 
## #   do(find_prior_founders(.))
## # 
## # df.13 %<>% left_join(prior_founder, by = "Name")
## # df.13
## # 
## # # (a)
## # fit.a <- coxph(Surv(months, death) ~ experience, data = df.13)
## # summary(fit.a)
## # 
## # # (b)
## # fit.b <- coxph(Surv(months, death) ~ education, data = df.13)
## # summary(fit.b)
## # 
## # # (c)
## # fit.c <- coxph(Surv(months, death) ~ founder, data = df.13)
## # summary(fit.c)
## # 
## # # (a)-(c)
## # fit.ac <- coxph(Surv(months, death) ~ experience + education + founder, data = df.13)
## # summary(fit.ac)
## # 
## # # Q14--------------------------------------------------------------------------
## # # Merge the Github data that has y combinator funding data. 
## # # Create an outcome “received funding” and run an logit model with the same 
## # # set of covariates as in 13. Include a cohort fixed effect.  
## # 
## # df.14 <- tc_cohorts %>%
## #   mutate(Funding = str_replace_all(Funding, "[$,]", "") %>% as.numeric) %>%
## #   select(Name, funding = Funding) %>%
## #   left_join(df.13, by = "Name") %>%
## #   na.omit() %>%
## #   mutate(cohort = as.factor(cohort))
## # 
## # df.14
## # 
## # fit.1 <- felm(I(funding > 0) ~ log1p(experience) + log1p(education) + G(cohort), data = df.14)
## # 
## # fit.1 <- felm(log(funding) ~ log1p(experience) + log1p(education) + G(cohort), data = subset(df.14, funding > 0))
## # 
## # summary(fit.1)
## # 
## # 
## # m <- lm(death ~ log1p(experience)*log1p(education) + factor(cohort), data = df.14)
## # 
## # 
## # 
## # fit.2 <- lm(funding ~ experience + education + founder + factor(cohort), data = df.14)
## # summary(fit.2)
## # 
## # 
## # 
## # fit.3 <- glm(factor(death) ~ experience + education + founder + factor(cohort), 
## #              data = df.14, family = "binomial")
## # summary(fit.3)
## # 
## # 
johnjosephhorton/techbios documentation built on May 19, 2019, 5:15 p.m.