/incubot.rkt

http://github.com/elibarzilay/rudybot · Shell · 170 lines · 160 code · 7 blank · 3 comment · 0 complexity · 62624e68f1573577d843679086ab26d1 MD5 · raw file

  1. #! /bin/sh
  2. #| Hey Emacs, this is -*-scheme-*- code!
  3. #$Id$
  4. exec racket -l errortrace --require "$0" --main -- ${1+"$@"}
  5. |#
  6. ;; Some code to reply in an alarmingly-human-like way. Idea, but not
  7. ;; the code, utterly stolen from Peter Danenberg (aka "klutometis"),
  8. ;; to whom all credit is due.
  9. ;; The basic idea: someone says something to the bot. The bot doesn't
  10. ;; recognize that input as one of its built-in commands, so this code
  11. ;; runs. This code breaks the input into words, and ranks each word
  12. ;; based on how frequently it appears in the "corpus" (just a log of
  13. ;; all the input the bot has seen). It picks the "most interesting"
  14. ;; word -- i.e., the one that appears the least -- and then finds all
  15. ;; the utterances in the corpus that contain it. It then returns an
  16. ;; utterance chosen at random from that set, favoring the longer
  17. ;; (presumably more-interesting) ones.
  18. #lang racket
  19. (require
  20. scheme/set
  21. scheme/include
  22. (only-in "log-parser.rkt" utterance-text )
  23. (only-in "vars.rkt" *incubot-logger*))
  24. (include "incubot-tests.rkt")
  25. (provide (except-out (struct-out corpus) corpus))
  26. (struct corpus (strings strings-by-word) #:transparent)
  27. (provide (rename-out [public-make-corpus make-corpus]))
  28. (define/contract (public-make-corpus . sentences)
  29. (->* () () #:rest (listof string?) corpus?)
  30. (make-corpus-from-sequence (in-list sentences)))
  31. (define (random-favoring-smaller-numbers k)
  32. (let (
  33. ;; 0 <= r < 1, but smaller numbers are more likely
  34. [r (/(sub1 (exp (random))) (sub1 (exp 1)))])
  35. (inexact->exact
  36. (truncate
  37. (* r k) ;; 0 <= this < k
  38. ))))
  39. ;; favor longer utterances over shorter ones.
  40. (define/contract (random-choose seq)
  41. (-> list? any/c)
  42. (let ([sorted (sort seq > #:key string-length)])
  43. (list-ref
  44. sorted
  45. (random-favoring-smaller-numbers (length seq)))))
  46. (define/contract (strings-containing-word w c)
  47. (-> string? corpus? (listof string?))
  48. (dict-ref (corpus-strings-by-word c) w))
  49. (provide incubot-sentence)
  50. (define incubot-sentence
  51. (match-lambda*
  52. [(list (? list? s) (? corpus? c))
  53. (incubot-sentence (wordlist->wordset s) c)]
  54. [(list (? string? s) (? corpus? c))
  55. (incubot-sentence (string->words s) c)]
  56. [(list (? set? ws) (? corpus? c))
  57. (let ([rare (rarest ws c)])
  58. ((*incubot-logger*) "incubot corpus has ~a entries" (set-count (corpus-strings c)))
  59. (and rare
  60. ((*incubot-logger*) "incubot chose ~s" rare)
  61. (random-choose (strings-containing-word rare c))))]))
  62. (define/contract (in-corpus? s c)
  63. (string? corpus? . -> . boolean?)
  64. (set-member? (corpus-strings c) s))
  65. (define (make-immutable-ci-hash)
  66. (make-immutable-custom-hash
  67. string-ci=?
  68. (compose equal-hash-code string-downcase)))
  69. (define (make-corpus-from-sequence seq [limit #f])
  70. (let/ec return
  71. (for/fold ([c (corpus
  72. (set)
  73. (make-immutable-ci-hash))])
  74. ([(sentence forms-read) (in-indexed seq)])
  75. (when (equal? limit forms-read)
  76. (return c))
  77. (add-to-corpus sentence c))))
  78. (provide make-corpus-from-sexps)
  79. ;; TODO -- somehow arrange that, if we get a fatal signal, we finish
  80. ;; writing out the current sexp, so that the output file remains
  81. ;; well-formed.
  82. (define (make-corpus-from-sexps inp [limit #f])
  83. (make-corpus-from-sequence
  84. (in-port
  85. (lambda (ip)
  86. (let ([datum (read ip)])
  87. ;; this sure seems kludgy. I wonder if there's a better way
  88. (if (eof-object? datum)
  89. datum
  90. (utterance-text datum))))
  91. inp)
  92. limit))
  93. (provide make-corpus-from-file)
  94. (define (make-corpus-from-file ifn)
  95. (call-with-input-file ifn
  96. (lambda (ip)
  97. (make-corpus-from-sequence (in-lines ip)))))
  98. (define (offensive? s)
  99. (regexp-match #px"\\bnigger\\b" s))
  100. (provide add-to-corpus)
  101. (define/contract (add-to-corpus s c)
  102. (-> string? corpus? corpus?)
  103. (if (offensive? s)
  104. (begin0
  105. c
  106. ((*incubot-logger*) "Not adding offensive string to corpus"))
  107. (corpus
  108. (set-add (corpus-strings c) s)
  109. (for/fold ([h (corpus-strings-by-word c)])
  110. ([w (in-set (string->words s))])
  111. (dict-update h w (curry cons s) '())))))
  112. (define (setof pred)
  113. (lambda (thing)
  114. (and (set? thing)
  115. (for/and ([item (in-set thing)])
  116. (pred item)))))
  117. (define/contract (wordlist->wordset ws)
  118. ((listof string?) . -> . (setof string?))
  119. (define (strip rx) (curryr (curry regexp-replace* rx) ""))
  120. (apply
  121. set
  122. (filter (compose positive? string-length)
  123. (map (compose
  124. (strip #px"^'+")
  125. (strip #px"'+$")
  126. (strip #px"[^'[:alpha:]]+"))
  127. ws))))
  128. (define/contract (string->words s)
  129. (string? . -> . set?)
  130. (wordlist->wordset (regexp-split #rx" " (string-downcase s))))
  131. (define/contract (word-popularity w c)
  132. (string? corpus? . -> . natural-number/c)
  133. (length (dict-ref (corpus-strings-by-word c) w '())))
  134. (define/contract (rarest ws c)
  135. (-> set? corpus? (or/c string? #f))
  136. (let-values ([(_ tied-for-rarest)
  137. (for/fold ([smallest-ranking +inf.0]
  138. [rarest-words-so-far (set)])
  139. ([word (in-set ws)])
  140. (let ([p (word-popularity word c)])
  141. (if (and (positive? p)
  142. (<= p smallest-ranking))
  143. (values p (set-add rarest-words-so-far word))
  144. (values smallest-ranking rarest-words-so-far))))])
  145. (and (positive? (set-count tied-for-rarest))
  146. (random-choose (set-map tied-for-rarest values)))))