/code/0041-basic-word-stats-and-plots.R

https://github.com/benmarwick/saa-meeting-abstracts · R · 145 lines · 110 code · 24 blank · 11 comment · 0 complexity · f8c14462f28a9ec55f1660a74865a006 MD5 · raw file

  # Inputs ----
  # Working alias for the document-term matrix. `all_txts_c_dtm` is not
  # defined in this script; it has to exist in the session already.
  df <- all_txts_c_dtm

  # Words that are excluded from the word table built below.
  dont_care <- c(
    "university", "abstracts", "meeting", "annual", "paper",
    "chair", "across", "presented", "session", "college",
    "area", "discussed", "this", "that", "there",
    "which", "their", "poster", "using", "through",
    "into", "some", "from", "been", "discuss",
    "while", "over"
  )
  # Long-format word counts ----
  # One row per document/word pair with its count `n`, plus a numeric `year`
  # taken from the first number found in the document id.
  # Original note: "from tidytext".
  # NOTE(review): `convert(x, to = "data.frame")` and `data_int_syllables`
  # look like quanteda objects rather than tidytext ones -- confirm which
  # package provides them.
  df_tbl <-
    df %>%
    convert(to = "data.frame") %>%
    pivot_longer(
      cols = -document,
      names_to = "word",
      values_to = "n"
    ) %>%
    mutate(year = parse_number(document)) %>%
    # Keep a word only if it is a name in `data_int_syllables`, is longer
    # than three characters, and is not on the `dont_care` list.
    filter(
      word %in% names(data_int_syllables),
      str_length(word) > 3,
      !(word %in% dont_care)
    )
  # Yearly totals ----
  # Sum of the retained word counts for each year; this is the denominator
  # used when counts are turned into proportions further down.
  df_tbl_yearly_totals <- summarise(
    group_by(df_tbl, year),
    total_words = sum(n, na.rm = TRUE)
  )
  # Per-year word proportions ----
  # For every document/word row, express the count as a share of all retained
  # words counted in that year, then attach the total count of the word
  # across all rows and a display label built from it.
  df_tbl_prop <-
    df_tbl %>%
    # left_join(all_txts_c_summary) %>%
    # Name the join key explicitly instead of relying on whichever columns
    # the two tables happen to share; this also avoids the "Joining with"
    # message.
    left_join(df_tbl_yearly_totals, by = "year") %>%
    mutate(prop = n / total_words) %>%
    # Drop document/word pairs where the word does not occur.
    filter(prop > 0) %>%
    group_by(word) %>%
    mutate(
      sum_the_word = sum(n),
      keyword_n = str_c(word, " (n = ", sum_the_word, ")")
    )
  # NOTE: the result is intentionally left grouped by `word`, as before.
  # Early vs. late periods ----
  # Goal: find the words with the greatest difference in proportion between
  # two time periods. Rows from before 2004 are labelled "early", all other
  # rows "late".
  # The split uses the numeric `year` column. Comparing the character
  # `document` id with 2004 would coerce 2004 to "2004" and compare strings,
  # which only gives the intended answer when every id starts with the year.
  df_tbl_prop_two_groups <-
    df_tbl_prop %>%
    mutate(year_group = ifelse(year < 2004, "early", "late"))
  # Mean proportion of each word within each period, spread into one `early`
  # and one `late` column. Rows with a missing value (a word absent from one
  # of the periods) are dropped. `diff` is late minus early, so a positive
  # value means the word has a higher mean proportion in the late period.
  df_tbl_prop_two_groups_diff <-
    df_tbl_prop_two_groups %>%
    group_by(word, year_group) %>%
    summarise(mean_prop = mean(prop, na.rm = TRUE)) %>%
    pivot_wider(
      values_from = mean_prop,
      names_from = year_group
    ) %>%
    drop_na() %>%
    mutate(diff = late - early)
  # Keep only words whose change between the periods is at least 0.00001 in
  # absolute value.
  df_tbl_prop_two_groups_diff_interesting <-
    df_tbl_prop_two_groups_diff %>%
    filter(abs(diff) >= 0.00001)

  # First 30 rows when sorted by `diff` from largest to smallest.
  df_tbl_prop_two_groups_diff_interesting_top <-
    df_tbl_prop_two_groups_diff_interesting %>%
    dplyr::arrange(desc(diff)) %>%
    head(n = 30)

  # First 30 rows when sorted by `diff` from smallest to largest.
  df_tbl_prop_two_groups_diff_interesting_bottom <-
    df_tbl_prop_two_groups_diff_interesting %>%
    dplyr::arrange(diff) %>%
    head(n = 30)
  # Sanity check ----
  # Histogram of the between-period differences, with the x axis limited to
  # the range -0.0001 to 0.0001.
  hist_of_diffs <-
    ggplot(
      data = df_tbl_prop_two_groups_diff_interesting,
      mapping = aes(x = diff)
    ) +
    geom_histogram() +
    xlim(-0.0001, 0.0001) +
    theme_minimal() +
    labs(x = "← words used less often ... words used more often →\nin more recent years")
  68. library(ggrepel)
  69. library(ggforce)
  # Word cloud ----
  # Two repelled text layers placed along the x axis by `diff`: one for the
  # "top" table and one for the "bottom" table. All labels sit at y = 1 and
  # are pushed apart by ggrepel; the size legend is hidden.
  # NOTE(review): the top layer maps size to the raw `diff`, the bottom layer
  # to `abs(diff)` -- equivalent only while every top row is positive.
  word_clouds <-
    ggplot() +
    geom_text_repel(
      data = df_tbl_prop_two_groups_diff_interesting_top,
      mapping = aes(x = diff, y = 1, label = word, size = diff),
      segment.color = NA,
      force = 20,
      max.overlaps = 100,
      bg.r = 0.1,
      bg.color = "white"
    ) +
    geom_text_repel(
      data = df_tbl_prop_two_groups_diff_interesting_bottom,
      mapping = aes(x = diff, y = 1, label = word, size = abs(diff)),
      segment.color = NA,
      force = 20,
      max.overlaps = 100,
      bg.r = 0.1,
      bg.color = "white"
    ) +
    # xlim(-0.001, 0.001) +
    theme_void() +
    guides(size = "none")

  # Draw the word cloud on its own.
  word_clouds
  # Observation: it seems that more words decline in use than increase in use
  # over time.
  # Combined figure (noted as looking good): the word cloud is placed on top
  # of the histogram as an inset covering x from -1e-04 to 1e-04 and y from
  # 10 to 5000.
  hist_of_diffs +
    annotation_custom(
      grob = ggplotGrob(word_clouds),
      xmin = -1e-04, xmax = 1e-04,
      ymin = 10, ymax = 5000
    )
  # New / vanished words ----
  # What about words that appear in only one of the two periods?
  # (Original verdict: not very useful.)
  # One row per word with the mean of `prop` in an `early` and a `late`
  # column; a period in which the word has no rows is left as NA.
  df_tbl_prop_two_groups_wide <-
    df_tbl_prop_two_groups %>%
    select(word, prop, year_group) %>%
    pivot_wider(
      values_fn = list(prop = mean),
      values_from = prop,
      names_from = year_group
    )
  # Words with no early-period value, largest late proportion first.
  df_tbl_prop_two_groups_wide_only_in_late <- arrange(
    filter(df_tbl_prop_two_groups_wide, is.na(early)),
    desc(late)
  )

  # Words with no late-period value, largest early proportion first.
  df_tbl_prop_two_groups_wide_only_in_early <- arrange(
    filter(df_tbl_prop_two_groups_wide, is.na(late)),
    desc(early)
  )