options( htmltools.dir.version = FALSE, formatR.indent = 2, width = 55, digits = 4 ) thisyear <- "YEAR" library(xiangma) library(showtext) library(survival) library(kableExtra) CONN <- .createConn() commdf0 <- dbGetQuery(CONN, "SELECT openid, msgid, doubanid, time, CHARACTER_LENGTH(trim(content))as nchar, include from comment_log") userdf0 <- dbGetQuery(CONN, "SELECT * from member_log") bookdf0 <- dbGetQuery(CONN, "SELECT id as doubanid, title as doubantitle, author from douban_list") followdf0 <- dbGetQuery(CONN, paste0("SELECT * from followers where openid in (", paste0(paste0("'", userdf0$openid, "'"), collapse=","), ")")) dbDisconnect(CONN) Encoding(bookdf0$doubantitle) <- "UTF-8" Encoding(bookdf0$author) <- "UTF-8" Encoding(userdf0$publicname) <- "UTF-8" Encoding(followdf0$nickname) <- "UTF-8" Encoding(followdf0$city) <- "UTF-8" bookdf0$author <- .cleanAuthors(bookdf0$author) userdf0 <- merge(userdf0, followdf0[, c("openid", "sex")], all.x = TRUE) commdf0 <- merge(commdf0, followdf0[, c("openid", "sex")], all.x = TRUE) commdf1 <- commdf0[commdf0$include %in% 1 & substr(commdf0$time,1,4) <= thisyear, ] commdf2 <- commdf1[substr(commdf1$time,1,4)==thisyear, ] year.start <- strptime(paste0(thisyear, "-01-01 00:00:00"), format = "%Y-%m-%d %H:%M:%S") year.end <- strptime(paste0(as.numeric(thisyear)+1, "-01-01 00:00:00"), format = "%Y-%m-%d %H:%M:%S") - 1 userdf_v1 <- .verifyUsers(userdf0, year.start, year.end) userdf_v2 <- .verifyUsers(userdf0, "2000-01-01 00:00:00", year.start) userdf_v3 <- .verifyUsers(userdf0, year.start, paste0(substr(year.start, 1, 10), " 23:59:59")) userdf_v4 <- .verifyUsers(userdf0, paste0(substr(year.end, 1, 10), " 00:00:00"), year.end) userdf_new <- userdf0[userdf0$openid %in% setdiff(userdf_v1$openid[userdf_v1$verify == 1], userdf_v2$openid[userdf_v2$verify == 1]), ] userdf_newsur <- userdf_new[userdf_new$openid %in% userdf_v4$openid[userdf_v4$verify == 1], ] commdf1 <- merge(commdf1, bookdf0, all.x = TRUE) commdf2 <- merge(commdf2, bookdf0, all.x = TRUE) hotdf1 <- arrange(summarise(group_by(commdf1, doubanid, doubantitle, author), count = length(doubanid)), desc(count)) hotdf2 <- arrange(summarise(group_by(commdf2, doubanid, doubantitle, author), count = length(doubanid)), desc(count)) autdf1 <- arrange(summarise(group_by(commdf1, author), count = length(doubanid)), desc(count)) autdf2 <- arrange(summarise(group_by(commdf2, author), count = length(doubanid)), desc(count)) autdf1 <- autdf1[nzchar(autdf1$author), ] autdf2 <- autdf2[nzchar(autdf2$author), ] rderdf1 <- arrange(summarise(group_by(commdf1, openid), count = length(doubanid)), count) rderdf1$num <- (1:nrow(rderdf1))/ nrow(rderdf1) rderdf1$cum <- cumsum(rderdf1$count)/ sum(rderdf1$count) rderdf2 <- arrange(summarise(group_by(commdf2, openid), count = length(doubanid)), count) rderdf2$num <- (1:nrow(rderdf2))/ nrow(rderdf2) rderdf2$cum <- cumsum(rderdf2$count)/ sum(rderdf2$count) survdf1 <- userdf0 survdf1$leavetime[is.na(survdf1$leavetime)] <- as.character(year.end) survdf1 <- survdf1[survdf1$jointime < paste0(as.numeric(thisyear) + 1, "-01-01 00:00:00"), ] survdf1$days <- as.numeric(difftime(strptime(survdf1$leavetime, format = "%Y-%m-%d %H:%M:%S"), strptime(survdf1$jointime, format = "%Y-%m-%d %H:%M:%S"), unit = "days")) survdf2 <- summarise(group_by(survdf1, openid), days = sum(days), sex = max(sex), leavetime = max(leavetime))
\mainmatter
响马读书群于r substr(min(commdf0$time), 1, 4)
年r as.numeric(substr(min(commdf0$time), 6, 7))
月r as.numeric(substr(min(commdf0$time), 9, 10))
日组建并提交第一篇书评,至今已积累了r format(nrow(commdf1), big.mark = ",")
篇书评,共计r format(sum(commdf1$nchar, na.rm = TRUE), big.mark = ",")
字,平均每篇书评r round(sum(commdf1$nchar, na.rm = TRUE)/nrow(commdf1), 0)
字,书评字数中位数为r median(commdf1$nchar, na.rm = TRUE)
。本年度群友提交的书评数为r format(nrow(commdf2), na.rm = TRUE)
篇,书评总字数为r format(sum(commdf2$nchar, na.rm = TRUE), big.mark = ",")
,平均每篇书评r round(sum(commdf2$nchar, na.rm = TRUE)/nrow(commdf2), 0)
字,书评字数中位数为r median(commdf2$nchar, na.rm = TRUE)
。其他年度如图\@ref(fig:pic01)所示。
par(mar = c(4,4,1,1), mfrow = c(1,2)) barplot(table(substr(commdf1$time,1,4)), ylab="书评数") barplot(sapply(split(commdf1$nchar, f = substr(commdf1$time,1,4)), sum, na.rm = TRUE), ylab="书评字数")
在本年度内,曾经存在于“响马读书”微信群的群友一共有r length(unique(userdf_v1$openid[userdf_v1$verify == 1]))
人,其中本年度新增的群友有r length(unique(userdf_new$openid))
人,这些新增群友中,年末还留存在群里的有r length(unique(userdf_newsur$openid))
人,留存率为r round(length(unique(userdf_newsur$openid))/length(unique(userdf_new$openid))*100,2)
\%。在本年度的全部r length(unique(userdf_v1$openid[userdf_v1$verify == 1]))
位群友中,活跃群友(至少提交过一次书评的)数为r length(unique(commdf2$openid))
人,活跃比例为r round(length(unique(commdf2$openid))/length(unique(userdf_v1$openid[userdf_v1$verify == 1]))*100,2)
\%。每位活跃群友在本年度平均读书量为r round(nrow(commdf2)/length(unique(commdf2$openid)),2)
本。其他年度如图\@ref(fig:pic02)所示。
par(mar = c(4,4,1,1), mfrow = c(1,2)) barplot(sapply(split(commdf1$openid, f = substr(commdf1$time,1,4)), FUN = function(X) length(unique(X))), ylab="活跃群友数") barplot(sapply(split(commdf1$openid, f = substr(commdf1$time,1,4)), length) / sapply(split(commdf1$openid, f = substr(commdf1$time,1,4)), FUN = function(X) length(unique(X))), ylab="活跃群友平均阅读量")
截至目前,“响马读书群”所有群友阅读的书籍一共有r length(unique(commdf1$doubanid))
本,本年度阅读的书籍有r length(unique(commdf2$doubanid))
本。
本年度热度排名前10的书籍如表\@ref(tab:tbl01)所示。其中“本年度阅读次数”是指本年度群友提交的书评中包含该书籍的次数,如果是同一本书的不同版本,会合并成一本。“累计阅读次数”是指截至目前该书籍被阅读并提交书评的次数。
hotout1 <- hotdf2[1:10, ] hotout1 <- arrange(merge(hotout1, select(hotdf1, doubanid, c1 = count), all.x = TRUE), desc(count)) hotout1$doubanid <- NULL names(hotout1) <- c("书名", "作者", "本年度阅读次数", "累计阅读次数") knitr::kable(hotout1, caption="年度排名前十的书籍") %>% kable_styling(latex_options = "hold_position", font_size = 7)
在本年度的r length(unique(commdf2$doubanid))
本书籍中,一共包含了r length(unique(commdf2$author))
位作者,他们的热度排名如表\@ref(tab:tbl02)所示。
autout1 <- autdf2[1:10, ] autout1 <- arrange(merge(autout1, select(autdf1, author, c1 = count), all.x = TRUE), desc(count)) names(autout1) <- c("作者", "本年度阅读次数", "累计阅读次数") knitr::kable(autout1, caption="年度排名前十的作者") %>% kable_styling(latex_options = "hold_position", font_size = 7)
其中“本年度阅读次数”是指本年度该作者的作品被阅读并评论的次数。“累计阅读次数”是指截至目前该作者的作品被阅读并评论的次数。
此处的读者指的是“响马读书群”提交过书评的活跃群友,本年度平均每月的读着数为r round(mean(sapply(split(commdf2$openid, f = substr(commdf2$time,1,7)), FUN = function(X) length(unique(X)))), 1)
。
每位读者的阅读量不同,有的群友读的书非常多,有的群友每个月刚好完成任务,这就造成了阅读量的不均匀。我们使用洛伦兹曲线来衡量这种不均匀的程度,横轴表示按照阅读量排序后,累积的读者数目的百分比,纵轴表示相对应的阅读书籍数的累积百分比。
理想的情况下如果完全均匀,那么该曲线是一条45度的直线,实际情况中会弯曲,弯曲的程度越大,表示不均匀的程度越大。如图\@ref(fig:pic04)所示。
par(mar = c(4,4,1,1)) plot(cum~num, data = rderdf1, type = "l", xlab = "读者", ylab = "读书量", col = "red", lty = 2) points(cum~num, data = rderdf2, type = "l", col = "black", lwd = 2) legend(0, 1, c("本年度", "累积"), xjust = 0, col = c("black", "red"), lty = c(1, 2), lwd = c(2, 1))
根据“响马读书”群的群规,每位群友如果因为未提交书评被清退后,隔一个月后可以再次申请入群,我们把单次入群到退出的过程记为一个生命周期,统计单次入群生存天数可得,均值为r round(mean(survdf1$days), 1)
天,中位数为r round(median(survdf1$days), 1)
天。对于多次入群的群友,我们也可以统计其累计生存时间,均值为r round(mean(survdf2$days), 1)
天,中位数为r round(median(survdf2$days), 1)
天。生存天数的分布如图\@ref(fig:pic05)所示。
par(mar = c(4,4,1,1), mfrow = c(1,2)) hist(survdf1$days, breaks = 50, xlab = "单次入群生存天数", main = "") hist(survdf2$days, breaks = 50, xlab = "累计生存天数", main = "")
基于生存数据拟合生存模型,并画出生存曲线,如图\@ref(fig:pic06)所示。
survdf1$status <- 2 survdf1$status[survdf1$leavetime == as.character(year.end)] <- 1 survdf2$status <- 2 survdf2$status[survdf2$leavetime == as.character(year.end)] <- 1 s1 <- survfit(Surv(days, status) ~ 1, data = survdf1) s2 <- survfit(Surv(days, status) ~ 1, data = survdf2) par(mar = c(4,4,4,1), mfrow = c(1,2)) plot(s1, xlab = "Survival Time (days)", main = "单次入群生存曲线") plot(s1, xlab = "Survival Time (days)", main = "累计生存曲线")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.