```r
knitr::opts_chunk$set(echo = FALSE)

load(file = '~/git/oss/data/oss/working/sourceforge/cleaned_SF.RData')
uncomp_SF <- cleaned_SF
load(file = '~/git/oss/data/oss/working/sourceforge/DONE_SFclean.RData')
cleaned_SF <- as.data.frame(cleaned_SF)

# Number of distinct categories across the three category columns
length(unique(c(cleaned_SF$Category.1, cleaned_SF$Category.2, cleaned_SF$Category.3)))

library(ggplot2)
library(plyr)
library(dplyr)
library(DiagrammeRsvg)
library(DiagrammeR)
library(magrittr)
library(rsvg)
library(data.table)
```
First, I will assess the completeness of the 16/17 variables.
```r
# Variable profiling: for each column, record its name, class, percent
# missing, and number of unique values. Column 12 (the author list) is
# comma-separated, so it is split before counting unique values.
# This loop was run once and its output saved; it is loaded below.
# profile_dat <- c()
# for(i in 1:ncol(cleaned_SF)){
#   x <- cleaned_SF[,i]
#   x[x == ""] <- NA   # make all blanks NAs
#
#   name <- colnames(cleaned_SF)[i]
#   class <- class(x)
#
#   # Proportion of missing values
#   miss <- round(sum(is.na(x))*100/nrow(cleaned_SF), digits = 2)
#
#   # How many unique values does the variable have?
#   if(i == 12){
#     auth <- c()
#     for(j in 1:length(x)){
#       new_auth <- trimws(unlist(str_split(x[j], ",")))
#       auth <- c(auth, new_auth)
#     }
#     vals <- length(unique(auth))
#   }else{
#     vals <- length(unique(x))
#   }
#
#   new_row <- c(name, class, miss, vals)
#   profile_dat <- rbind(profile_dat, new_row)
# }
# save(auth, file = '~/git/oss/data/oss/working/sourceforge/auth_list.RData')
# save(profile_dat, file = '~/git/oss/data/oss/working/sourceforge/profile_dat.RData')

load(file = '~/git/oss/data/oss/working/sourceforge/profile_dat.RData')
colnames(profile_dat) <- c("Column Name", "Class", "% Missing", "Unique Values")
rownames(profile_dat) <- c()
profile_dat <- as.data.frame(profile_dat)

library(gridExtra)

# Save this table as an image
# png(filename="~/git/oss/src/ckelling/images/profile_table.png",
#     units="in", width=10, height=10, res=72)
# p <- tableGrob(profile_dat)
# grid.arrange(p)
# dev.off()
```
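For reference, the same profile table can be rebuilt without the explicit row-binding loop. This is a minimal sketch, assuming `cleaned_SF` is loaded and that column 12 holds the comma-separated author field, as above; `profile_dat2` is a name introduced here for illustration.

```r
# Vectorized sketch of the profiling step above
profile_dat2 <- t(sapply(seq_len(ncol(cleaned_SF)), function(i) {
  x <- cleaned_SF[, i]
  x[x == ""] <- NA  # make all blanks NAs
  if (i == 12) {
    # Split the comma-separated author field before counting unique values
    vals <- length(unique(trimws(unlist(strsplit(as.character(x), ",")))))
  } else {
    vals <- length(unique(x))
  }
  c(colnames(cleaned_SF)[i], class(x)[1],
    round(sum(is.na(x)) * 100 / nrow(cleaned_SF), 2), vals)
}))
colnames(profile_dat2) <- c("Column Name", "Class", "% Missing", "Unique Values")
```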
I would like to look at some histograms of the quantitative variables.
```r
# Histograms of the quantitative variables. The count variables are heavily
# right-skewed, so a few are re-plotted over a restricted range as well.
hist(as.numeric(cleaned_SF$Average.Rating))
hist(as.numeric(cleaned_SF$Number.of.Ratings))
hist(as.numeric(cleaned_SF$Number.of.Ratings[which(as.numeric(cleaned_SF$Number.of.Ratings) > 5)]))
hist(as.numeric(cleaned_SF$Weekly.Downloads))
hist(as.numeric(cleaned_SF$Weekly.Downloads[which(as.numeric(cleaned_SF$Weekly.Downloads) > 0 &
                                                  as.numeric(cleaned_SF$Weekly.Downloads) < 100)]))
hist(as.numeric(cleaned_SF$Ease))
hist(as.numeric(cleaned_SF$features))
hist(as.numeric(cleaned_SF$design))
hist(as.numeric(cleaned_SF$support))
hist(as.numeric(cleaned_SF$Total.Downloads[which(as.numeric(cleaned_SF$Total.Downloads) > 1000 &
                                                 as.numeric(cleaned_SF$Total.Downloads) < 30000)]))
```
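Since trimming the range by hand discards the tails, a log scale is often a cleaner way to view these skewed counts. A minimal sketch, assuming `Weekly.Downloads` coerces cleanly to numeric:

```r
# log1p handles the zero-download projects that log() would drop
hist(log1p(as.numeric(cleaned_SF$Weekly.Downloads)),
     main = "Weekly Downloads (log1p scale)", xlab = "log(1 + downloads)")
```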
```r
# Plot Last.Update over time
library(lubridate)

## Change the column's class to be compatible with lubridate
cleaned_SF$Last.Update <- as.POSIXct(as.character(cleaned_SF$Last.Update), format = "%Y-%m-%d")
data <- cleaned_SF

## Drop rows with no update date, then pull components of the date
data <- data[!is.na(data$Last.Update), ]
data$date <- factor(day(data$Last.Update))
data$month <- factor(month(data$Last.Update))
data$year <- factor(year(data$Last.Update))

## Count projects by year and month, then rebuild a plotting date
summary <- data %>% group_by(year, month) %>%
  dplyr::summarise(freq = length(OSS.Title))
summary$date <- as.Date(paste0('01', '/', as.character(summary$month), '/',
                               as.character(summary$year)), "%d/%m/%Y")

## Plot frequency lines
ggplot(summary, aes(x = date, y = freq)) +
  geom_line(aes(group = 1, colour = substr(date, 6, 7))) +
  ggtitle("Last Update Over Time") +
  labs(x = "Date", y = "Projects last updated") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_discrete(name = "Month")
```
Now, I would like to do the same for the date registered.
```r
# Plot Date.registered over time
## Change the column's class to be compatible with lubridate
cleaned_SF$Date.registered <- as.POSIXct(as.character(cleaned_SF$Date.registered), format = "%Y-%m-%d")
data <- cleaned_SF

## Pull components of the date
data$date <- factor(day(cleaned_SF$Date.registered))
data$month <- factor(month(cleaned_SF$Date.registered))
data$year <- factor(year(cleaned_SF$Date.registered))

## Count projects by year and month, then rebuild a plotting date
summary <- data %>% group_by(year, month) %>%
  dplyr::summarise(freq = length(OSS.Title))
summary$date <- as.Date(paste0('01', '/', as.character(summary$month), '/',
                               as.character(summary$year)), "%d/%m/%Y")
summary <- summary[-1,]

## Plot frequency lines
ggplot(summary, aes(x = date, y = freq)) +
  geom_line(aes(group = 1, colour = substr(date, 6, 7))) +
  ggtitle("Date Registered Over Time") +
  labs(x = "Date", y = "Projects registered") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_colour_discrete(name = "Month")
```
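Both time-series chunks rebuild a month-start date from the extracted pieces; `lubridate::floor_date()` can do the same rollup in one step. A sketch over the `data` frame set up above (`monthly` and `month_start` are names introduced here for illustration):

```r
# Alternative monthly rollup: same counts, no date reassembly
monthly <- data %>%
  filter(!is.na(Date.registered)) %>%
  mutate(month_start = as.Date(floor_date(Date.registered, unit = "month"))) %>%
  group_by(month_start) %>%
  dplyr::summarise(freq = dplyr::n())

ggplot(monthly, aes(month_start, freq)) +
  geom_line() +
  ggtitle("Date Registered Over Time") +
  labs(x = "Date", y = "Projects registered")
```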
Now, I would like to assess the universe of projects by looking at their categories.
```r
# Aggregate project counts over the category columns
agg_dat <- plyr::count(cleaned_SF, c('Category.1', 'Category.2', 'Category.3'))
agg_dat1 <- plyr::count(cleaned_SF, c('Category.1', 'Category.2'))

ggplot(agg_dat1) +
  geom_bar(aes(x = Category.1, y = freq, fill = Category.1), stat = "identity") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Categories and Subcategories")

# For "Graphics" projects the subcategory sits one level down,
# so shift Category.3 into Category.2
for(i in 1:nrow(agg_dat)){
  if(!is.na(agg_dat$Category.1[i]) && agg_dat$Category.1[i] == "Graphics"){
    agg_dat$Category.2[i] <- agg_dat$Category.3[i]
    agg_dat$Category.3[i] <- NA
  }
}

agg_dat1 <- plyr::count(agg_dat, c('Category.1', 'Category.2'))

# Drop the all-NA row at the end of each table
agg_dat1 <- agg_dat1[-nrow(agg_dat1),]
agg_dat <- agg_dat[-nrow(agg_dat),]

# Top two subcategories within each top-level category
top_cat <- data.table(agg_dat1, key = "Category.1")
top_cat <- top_cat[order(Category.1, -freq),]
top_cat <- top_cat[, head(.SD, 2), by = Category.1]

# Take out the NA row
top_cat <- top_cat[-nrow(top_cat),]
View(top_cat)
```
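As a cross-check, the same "top two subcategories per category" table can be produced with dplyr. A sketch over `agg_dat1` as built above (assumes dplyr >= 1.0 for `slice_head()`; `top_cat_dplyr` is a name introduced here):

```r
top_cat_dplyr <- agg_dat1 %>%
  filter(!is.na(Category.1)) %>%
  group_by(Category.1) %>%
  arrange(desc(freq), .by_group = TRUE) %>%  # highest freq first within each category
  slice_head(n = 2) %>%
  ungroup()
```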
```r
library(dplyr)
library(DiagrammeR)
library(plyr)
source("~/git/oss/src/ckelling/analysis/sourceforge/dan_functions.R")

# Full network
network_graph(agg_dat)

# Audio and Video
av_counts <- agg_dat[which(agg_dat$Category.1 == "Audio & Video"),]
network_graph(av_counts)

# Business and Enterprise
be_counts <- agg_dat[which(agg_dat$Category.1 == "Business & Enterprise"),]
network_graph(be_counts)

# Communications
comm_counts <- agg_dat[which(agg_dat$Category.1 == "Communications"),]
network_graph(comm_counts)

# Development
dev_counts <- agg_dat[which(agg_dat$Category.1 == "Development"),]
network_graph(dev_counts)

# Games and Graphics
gg_counts <- agg_dat[which(agg_dat$Category.1 == "Graphics" | agg_dat$Category.1 == "Games"),]
network_graph(gg_counts)

# Home and Education, Multimedia, and Other
he_counts <- agg_dat[which(agg_dat$Category.1 == "Home & Education" |
                           agg_dat$Category.1 == "Multimedia" |
                           agg_dat$Category.1 == "Other/Nonlisted Topic"),]
network_graph(he_counts)

# Science and Engineering
se_counts <- agg_dat[which(agg_dat$Category.1 == "Science & Engineering"),]
network_graph(se_counts)

# Security and Utilities
su_counts <- agg_dat[which(agg_dat$Category.1 == "Security & Utilities"),]
network_graph(su_counts)

# System Administration
sa_counts <- agg_dat[which(agg_dat$Category.1 == "System Administration"),]
network_graph(sa_counts)

# How many projects have no Category.1?
na_val <- agg_dat[which(is.na(agg_dat$Category.1)),][,4]
paste("Number of NA values: ", na_val, ", which is ",
      round(na_val/sum(agg_dat$freq), 3)*100, "%.", sep = "")

# Export selected graphs to disk (rsvg_png writes PNGs, so use .png paths;
# rsvg_pdf is the PDF equivalent)
network_graph(se_counts) %>% export_svg %>% charToRaw %>%
  rsvg_png("~/git/oss/src/ckelling/images/se_graph.png")
network_graph(agg_dat) %>% export_svg %>% charToRaw %>%
  rsvg_png("~/git/oss/src/ckelling/images/full_graph2.png")
```
This provides an example of one of the category networks. I have created networks for all of the other categories as well, but for now they are too large to share in the R Markdown document.
```r
se_counts <- agg_dat[which(agg_dat$Category.1 == "Science & Engineering"),]
network_graph(se_counts)
```
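`network_graph()` is defined in `dan_functions.R`, which is not reproduced here. The sketch below is a hypothetical minimal stand-in built directly with DiagrammeR, just to show the idea: one node per category label, one edge per observed category-to-subcategory pair. The actual function may differ.

```r
# Hypothetical stand-in for network_graph() from dan_functions.R
network_graph_sketch <- function(counts) {
  pairs <- unique(counts[, c("Category.1", "Category.2")])
  pairs <- pairs[!is.na(pairs$Category.1) & !is.na(pairs$Category.2), ]
  cat1 <- as.character(pairs$Category.1)
  cat2 <- as.character(pairs$Category.2)
  labels <- unique(c(cat1, cat2))

  nodes <- create_node_df(n = length(labels), label = labels)
  edges <- create_edge_df(from = match(cat1, labels),
                          to   = match(cat2, labels))
  render_graph(create_graph(nodes_df = nodes, edges_df = edges))
}

# network_graph_sketch(se_counts)
```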
```r
library(SnowballC)
library(stringr)
library(tm)
library(wordcloud)
library(plyr)
library(dplyr)

load(file = '~/git/oss/data/oss/working/sourceforge/DONE_SFunclean.RData')
complete_data <- cleaned_SF

## Build a corpus from the project descriptions and clean it
dps_chrg_corpus <- Corpus(VectorSource(complete_data$Description))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, content_transformer(tolower))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removeWords, stopwords('english'))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removeWords,
                          c("will", "set", "functions", "data", "package", "based", "can",
                            "provides", "used", "project", "using", "contains", "function"))
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removeNumbers)
dps_chrg_corpus <- tm_map(dps_chrg_corpus, removePunctuation)
dps_chrg_corpus <- tm_map(dps_chrg_corpus, stemDocument)

dps_temp_stemmed <- data.frame(text = sapply(dps_chrg_corpus, as.character),
                               stringsAsFactors = FALSE)
dps_tdm <- TermDocumentMatrix(dps_chrg_corpus)

## Subset both term-document matrices to their frequent terms
load(file = "~/git/oss/data/oss/working/openhub/randomProjects/oh_random_proj_desc_dtm.RData")
tdm <- dps_tdm
your_terms <- findFreqTerms(tdm, lowfreq = 1500)
say_terms <- findFreqTerms(oh_random_proj_desc_dtm, lowfreq = 10)
new_tdmc <- tdm[your_terms, ]
new_tdms <- oh_random_proj_desc_dtm[say_terms, ]

## SourceForge term frequencies
dps_m <- as.matrix(new_tdmc)
dps_v <- sort(rowSums(dps_m), decreasing = TRUE)
dps_d <- data.frame(word = names(dps_v), freq = dps_v)

## OpenHub term frequencies
dps_ms <- as.matrix(new_tdms)
dps_vs <- sort(rowSums(dps_ms), decreasing = TRUE)
dps_ds <- data.frame(word = names(dps_vs), freq = dps_vs)

## Keep the top 300 terms from each source and join them
dps_d <- dps_d[1:300,]
dps_ds <- dps_ds[1:300,]
test <- full_join(dps_d, dps_ds, by = c(word = "word"), suffix = c("SF", "OH"))
test[is.na(test)] <- 0

## Terms appearing in only one of the two sources
test_compar <- test[test$freqSF == 0 | test$freqOH == 0, ]

rownames(test) <- test$word
test <- test[, -1]
rownames(test_compar) <- test_compar$word
test_compar <- test_compar[, -1]

## Plot commonality and comparison clouds
commonality.cloud(test, max.words = 400, random.order = FALSE,
                  colors = brewer.pal(8, "Dark2"))
comparison.cloud(test_compar, random.order = FALSE,
                 colors = c("indianred3", "steelblue3"),
                 title.size = 2.5, max.words = 400)

## Save the commonality cloud
png(filename = "~/git/oss/src/ckelling/images/new_images/wordcloud_common.png",
    units = "in", width = 10, height = 10, res = 72)
commonality.cloud(test, max.words = 400, random.order = FALSE,
                  colors = brewer.pal(8, "Dark2"))
dev.off()
```
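One caveat: the SourceForge and OpenHub corpora differ greatly in size, so raw counts can let one source dominate the comparison cloud. A hedged sketch of scaling each column to proportions first, over the `test_compar` matrix built above:

```r
# Scale each source's counts to proportions so corpus size does not
# dominate the comparison (a sketch; assumes test_compar as built above)
m <- as.matrix(test_compar)
m_prop <- sweep(m, 2, colSums(m), "/")
comparison.cloud(m_prop, random.order = FALSE,
                 colors = c("indianred3", "steelblue3"),
                 title.size = 2.5, max.words = 400)
```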