In team-oss/scrape-cran: Measuring the Cost and Value of Open Source Software

knitr::opts_chunk$set(echo = FALSE)
load(file = '~/git/oss/data/oss/working/sourceforge/cleaned_SF.RData')
uncomp_SF <- cleaned_SF
load(file = '~/git/oss/data/oss/working/sourceforge/DONE_SFclean.RData')
cleaned_SF <- as.data.frame(cleaned_SF)
length(unique(c(cleaned_SF$Category.1, cleaned_SF$Category.2, cleaned_SF$Category.3)))

library(ggplot2)
library(plyr)
library(dplyr)
library(DiagrammeRsvg)
library(DiagrammeR)
library(magrittr)
library(rsvg)
library(data.table)

Analysis of Sourceforge Data

First, I will assess the completeness of the 16/17 variables.

# profile_dat <- c()
# for(i in 1:ncol(cleaned_SF)){
#   print(i)
#   if(i == 12){
#     x <- cleaned_SF[,i]
# 
#     # Make all blanks NAs
#     x[x == ""] <- NA
# 
#     name <- colnames(cleaned_SF)[i]
#     class <- class(x)
# 
#     # Proportion of missing values
#     miss <- round(sum(is.na(x))*100/nrow(cleaned_SF), digits = 2)
# 
#     # How many unique values to the variable?
#     auth <- c()
#     for(j in 1:length(x)){
#       print(j)
#       new_auth <- trimws(unlist(str_split(x[j], ",")))
#       auth <- c(auth, new_auth)
#     }
# 
#     vals <- length(unique(auth))
#     # summary <- summary(x)
#     #if(vals <= 10){
#     #  tab <- table(x)
#     #  print(tab)
#     #}
#   }else{
#     x <- cleaned_SF[,i]
# 
#     # Make all blanks NAs
#     x[x == ""] <- NA
# 
#     name <- colnames(cleaned_SF)[i]
#     class <- class(x)
# 
#     # Proportion of missing values
#     miss <- round(sum(is.na(x))*100/nrow(cleaned_SF), digits = 2)
# 
#     # How many unique values to the variable?
#     vals <- length(unique(x))
#     # summary <- summary(x)
#     #if(vals <= 10){
#     #  tab <- table(x)
#     #  print(tab)
#     #}
#   }
#   new_row <- c(name, class, miss, vals)
#   profile_dat <- rbind(profile_dat, new_row)
# }
#save(auth, file = '~/git/oss/data/oss/working/sourceforge/auth_list.RData')

colnames(profile_dat) <- c("Column Name", "Class", "% Missing" , "Unique Values")

#save(profile_dat, file = '~/git/oss/data/oss/working/sourceforge/profile_dat.RData')
#colnames(profile_dat) <- c("Column Name", "Class", "% Missing" , "Unique Values")
#rownames(profile_dat) <- c()
#profile_dat <- as.data.frame(profile_dat)
#View(profile_dat)
load(file = '~/git/oss/data/oss/working/sourceforge/profile_dat.RData')

library(gridExtra)
#save this table
# png(filename="~/git/oss/src/ckelling/images/profile_table.png",
#     units="in",
#     width=10,
#     height=10,
#     #pointsize=12,
#     res=72
# )
# p<-tableGrob(profile_dat)
# grid.arrange(p)
# dev.off()

I would like to look at some histograms of the quantitative variables.

hist(as.numeric(cleaned_SF$Average.Rating))
hist(as.numeric(cleaned_SF$Number.of.Ratings))
hist(as.numeric(cleaned_SF$Number.of.Ratings[which(as.numeric(cleaned_SF$Number.of.Ratings) > 5)]))
hist(as.numeric(cleaned_SF$Weekly.Downloads))
hist(as.numeric(cleaned_SF$Weekly.Downloads[which(as.numeric(cleaned_SF$Weekly.Downloads) >0 & as.numeric(cleaned_SF$Weekly.Downloads) < 100)]))
hist(as.numeric(cleaned_SF$Ease))
hist(as.numeric(cleaned_SF$features))
hist(as.numeric(cleaned_SF$design))
hist(as.numeric(cleaned_SF$support))
hist(as.numeric(cleaned_SF$Total.Downloads[which(as.numeric(cleaned_SF$Total.Downloads) > 1000 & as.numeric(cleaned_SF$Total.Downloads) < 30000)]))

#plot last updated over time!
library(lubridate)

## Change the column's class to be compatible with lubridate
cleaned_SF$Last.Update <- as.POSIXct(as.character(cleaned_SF$Last.Update), format = "%Y-%m-%d")
data <- cleaned_SF

## Pull components of the date
# data$min <- factor(min(cleaned_SF$Received_Date_Time))
# data$hour <- factor(hour(cleaned_SF$Received_Date_Time))
data <- data[-which(is.na(data$Last.Update)),]
data$date <- factor(day(data$Last.Update))
data$month <- factor(month(data$Last.Update))
data$year <- factor(year(data$Last.Update))
#head(data)


## Group it by date and hour
summary <- data %>%
  group_by(year, month) %>%
  dplyr::summarise(freq = length(OSS.Title))


summary$date <- as.Date(paste0('01','/',as.character(summary$month), '/', as.character(summary$year)), "%d/%m/%Y")


## Plot frequency lines
ggplot(summary, aes(x=date, y=freq))+
  geom_line(aes(group = 1,colour = substr(date,6,7))) +
  #geom_point() +
  ggtitle("Last Update Over Time") +
  labs(x = "Date", y = "Total Update") +
  theme(plot.title = element_text(hjust = 0.5))+
    scale_colour_discrete(name="Month")


# ggplot(summary, aes(month, freq)) +
#    geom_line(aes(color = year, group = year)) +
#    geom_point() +
#    ggtitle("Last Update Over Time") +
#    labs(x = "Date", y = "Total update") +
#    theme(plot.title = element_text(hjust = 0.5))+ 
#     scale_colour_discrete(name="Month")

Now, I would like to do the same for the date registered.

#plot date registered over time!

## Change the column's class to be compatible with lubridate
cleaned_SF$Date.registered <- as.POSIXct(as.character(cleaned_SF$Date.registered), format = "%Y-%m-%d")
data <- cleaned_SF

## Pull components of the date
# data$min <- factor(min(cleaned_SF$Received_Date_Time))
# data$hour <- factor(hour(cleaned_SF$Received_Date_Time))
data$date <- factor(day(cleaned_SF$Date.registered))
data$month <- factor(month(cleaned_SF$Date.registered))
data$year <- factor(year(cleaned_SF$Date.registered))

## Group it by date and hour
summary <- data %>%
  group_by(year, month) %>%
  dplyr::summarise(freq = length(OSS.Title))

summary$date <- as.Date(paste0('01','/',as.character(summary$month), '/', as.character(summary$year)), "%d/%m/%Y")
summary<- summary[-1,]

## Plot frequency lines
ggplot(summary, aes(x=date, y=freq))+
  geom_line(aes(group = 1,colour = substr(date,6,7))) +
  #geom_point() +
  ggtitle("Date Registered Over Time") +
  labs(x = "Date", y = "Total Update") +
  theme(plot.title = element_text(hjust = 0.5))


# ggplot(summary, aes(month, freq)) +
#    geom_line(aes(color = year, group = year)) +
#    geom_point() +
#    ggtitle("Date Registered Over Time") +
#    labs(x = "Date", y = "Total update") +
#    theme(plot.title = element_text(hjust = 0.5))

Now, I would like to assess the universe by looking at categories.

agg_dat <- plyr::count(cleaned_SF, c('Category.1', 'Category.2', 'Category.3'))
#agg_dat1 <- plyr::count(cleaned_SF, c('Category.1', 'Category.2'))

ggplot(agg_dat1)+ geom_bar(aes(x=Category.1, y=freq,fill = Category.1), stat= "identity")+ theme(legend.position="none",axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Categories and Subcategories")

for(i in 1:nrow(agg_dat)){
  if(agg_dat$Category.1[i]=="Graphics" & is.na(agg_dat$Category.1[i]) == FALSE){
    agg_dat$Category.2[i] <- agg_dat$Category.3[i]
    agg_dat$Category.3[i] <- NA
  }
}
agg_dat1 <- plyr::count(agg_dat, c('Category.1', 'Category.2'))
agg_dat1 <- agg_dat1[-nrow(agg_dat1),]
agg_dat <- agg_dat[-nrow(agg_dat),]

top_cat  <- data.table(agg_dat1, key="Category.1")
top_cat <- top_cat[order(Category.1, -freq),]
top_cat <- top_cat[, head(.SD, 2), by=Category.1]
#taking out the NA row
top_cat <- top_cat[-nrow(top_cat),]
View(top_cat)

library(dplyr)
library(DiagrammeR)
library(plyr)

source("~/git/oss/src/ckelling/analysis/sourceforge/dan_functions.R")
#load(file = '~/git/oss/data/oss/working/sourceforge/cleaned_SF.RData')
#cleaned_SF <- as.data.frame(cleaned_SF)
#agg_dat <- plyr::count(cleaned_SF, c('Category.1', 'Category.2', 'Category.3'))

#Full network
network_graph(agg_dat)

#Audio and Video Graph
av_counts = agg_dat[which(agg_dat$Category.1 == "Audio & Video"),]
network_graph(av_counts)#, title = "Category: Audio and Video")

#Business and Enterprise Graph
be_counts = agg_dat[which(agg_dat$Category.1 == "Business & Enterprise"),]
network_graph(be_counts)

#Communications Graph
comm_counts = agg_dat[which(agg_dat$Category.1 == "Communications"),]
network_graph(comm_counts)

#Development Graph
dev_counts = agg_dat[which(agg_dat$Category.1 == "Development"),]
network_graph(dev_counts)

#Games and Graphics Graph
#need to edit graphics a bit
gg_counts = agg_dat[which(agg_dat$Category.1 == "Graphics" | agg_dat$Category.1 == "Games"),]
network_graph(gg_counts)

#Home and Education and Other Graph
he_counts = agg_dat[which(agg_dat$Category.1 == "Home & Education" | agg_dat$Category.1 == "Multimedia" | agg_dat$Category.1 == "Other/Nonlisted Topic"),]
network_graph(he_counts)

#Science and Engineering
se_counts = agg_dat[which(agg_dat$Category.1 == "Science & Engineering"),]
network_graph(se_counts)

# Security and Utilities
su_counts = agg_dat[which(agg_dat$Category.1 == "Security & Utilities"),]
network_graph(su_counts)

#System Administration
sa_counts = agg_dat[which(agg_dat$Category.1 == "System Administration"),]
network_graph(sa_counts)

#NA
#na_counts = agg_dat[which(is.na(agg_dat$Category.1)),]
#network_graph(na_counts
na_val = print(agg_dat[which(is.na(agg_dat$Category.1)),][,4])
paste("Number of NA values: ", na_val, ", which is ", round(na_val/sum(agg_dat$freq),3)*100, "%.", sep="")

# network_graph(se_counts) %>%
#   export_svg %>% charToRaw %>% rsvg_pdf("~/git/oss/src/ckelling/images/graph.pdf")


network_graph(se_counts) %>%
  export_svg %>% charToRaw %>% rsvg_png("~/git/oss/src/ckelling/images/se_graph.pdf")

network_graph(agg_dat) %>%
  export_svg %>% charToRaw %>% rsvg_png("~/git/oss/src/ckelling/images/full_graph2.pdf")

This provides an example of one of the networks. I have created networks for all of the other categories, but as of now, they are too large to share in the Rmarkdown document.

se_counts = agg_dat[which(agg_dat$Category.1 == "Science & Engineering"),]
network_graph(se_counts)

library(SnowballC)
library(stringr)
library(tm)
library(wordcloud)
library(plyr)
library(dplyr)

load(file = '~/git/oss/data/oss/working/sourceforge/DONE_SFunclean.RData')
complete_data <- cleaned_SF

## Word cloud of descriptions
dps_chrg_corpus <- Corpus(VectorSource(complete_data$Description))
#dps_chrg_corpus <- tm_map(dps_chrg_corpus, PlainTextDocument)
dps_chrg_corpus <- tm_map(dps_chrg_corpus, content_transformer(tolower))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removeWords, stopwords('english'))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removeWords, c("will","set","functions", "data", "package", "based", "can", "provides", "set", "used", "project", "using", "contains", "function"))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removeNumbers)
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removePunctuation)
dps_chrg_corpus <- tm_map(dps_chrg_corpus, stemDocument)
dps_temp_stemmed <- data.frame(text = sapply(dps_chrg_corpus, as.character), stringsAsFactors = FALSE)
dps_tdm <- TermDocumentMatrix(dps_chrg_corpus)

#need to subset my tdm
load(file= "~/git/oss/data/oss/working/openhub/randomProjects/oh_random_proj_desc_dtm.RData")
tdm <- dps_tdm# your term document matrix
your_terms <- findFreqTerms(tdm, lowfreq = 1500)
say_terms <- findFreqTerms(oh_random_proj_desc_dtm, lowfreq = 10)
#try low freq of 100
new_tdmc <- tdm[your_terms, ] 
new_tdms <- oh_random_proj_desc_dtm[say_terms, ] 

#create my tdm
dps_m <- as.matrix(new_tdmc)
dps_v <- sort(rowSums(dps_m),decreasing=TRUE)
dps_d <- data.frame(word = names(dps_v),freq = dps_v)
#wordcloud(dps_d$word, dps_d$freq, min.freq = 1000,max.words = 500, random.order = FALSE, colors=brewer.pal(8, "Dark2"))

#create Openhub tdm
dps_ms <- as.matrix(new_tdms)
dps_vs <- sort(rowSums(dps_ms),decreasing=TRUE)
dps_ds <- data.frame(word = names(dps_vs),freq = dps_vs)

# find the top 300 terms for each tdm
dps_d <- dps_d[1:300,]
dps_ds <- dps_ds[1:300,]

#combine the datasets into a combined tdm
test <- full_join(dps_d, dps_ds, by = c(word = "word"), suffix = c("SF", "OH"))
test[is.na(test) == TRUE] <- 0


test_compar <- c()
i <- c()
new_row <- c()

for(i in 1:nrow(test)){
  #i=303
  if(test$freqSF[i] == 0 | test$freqOH[i] == 0){
    new_row <- test[i,]
    test_compar <- rbind(test_compar, new_row)
  }
  #rownames(test_compar) <- c()
}

rownames(test) <- test$word
test <- test[,-1]
rownames(test_compar) <- test_compar$word
test_compar <- test_compar[,-1]

#plot comparison and commonality clouds
commonality.cloud(test, max.words = 400, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

comparison.cloud(test_compar, random.order=FALSE, colors = c("indianred3","steelblue3"),
                 title.size=2.5, max.words=400)

png(filename="~/git/oss/src/ckelling/images/new_images/wordcloud_common.png",
    units="in",
    width=10,
    height=10,
    #pointsize=12,
    res=72
)

commonality.cloud(test, max.words = 400, random.order = FALSE, colors = brewer.pal(8, "Dark2"))


dev.off()