PageRenderTime 38ms CodeModel.GetById 1ms app.highlight 34ms RepoModel.GetById 1ms app.codeStats 0ms

/incubot.rkt

http://github.com/elibarzilay/rudybot
Shell | 170 lines | 160 code | 7 blank | 3 comment | 0 complexity | 62624e68f1573577d843679086ab26d1 MD5 | raw file
  1#! /bin/sh
  2#| Hey Emacs, this is -*-scheme-*- code!
  3#$Id$
  4exec  racket -l errortrace --require "$0" --main -- ${1+"$@"}
  5|#
  6
  7;; Some code to reply in an alarmingly-human-like way.  Idea, but not
  8;; the code, utterly stolen from Peter Danenberg (aka "klutometis"),
  9;; to whom all credit is due.
 10
 11;; The basic idea: someone says something to the bot.  The bot doesn't
 12;; recognize that input as one of its built-in commands, so this code
 13;; runs.  This code breaks the input into words, and ranks each word
 14;; based on how frequently it appears in the "corpus" (just a log of
 15;; all the input the bot has seen).  It picks the "most interesting"
 16;; word -- i.e., the one that appears the least -- and then finds all
 17;; the utterances in the corpus that contain it.  It then returns an
 18;; utterance chosen at random from that set, favoring the longer
 19;; (presumably more-interesting) ones.
 20
 21#lang racket
 22(require
 23 scheme/set
 24 scheme/include
 25 (only-in "log-parser.rkt" utterance-text )
 26 (only-in "vars.rkt" *incubot-logger*))
 27
 28(include "incubot-tests.rkt")
 29
 30(provide (except-out (struct-out corpus) corpus))
 31(struct corpus (strings strings-by-word) #:transparent)
 32
 33(provide (rename-out [public-make-corpus make-corpus]))
 34(define/contract (public-make-corpus . sentences)
 35  (->* () () #:rest (listof string?) corpus?)
 36  (make-corpus-from-sequence  (in-list sentences)))
 37
 38(define (random-favoring-smaller-numbers k)
 39  (let (
 40        ;; 0 <= r < 1, but smaller numbers are more likely
 41        [r (/(sub1 (exp (random))) (sub1 (exp 1)))])
 42
 43    (inexact->exact
 44     (truncate
 45      (* r k) ;; 0 <= this < k
 46      ))))
 47
 48;; favor longer utterances over shorter ones.
 49(define/contract (random-choose seq)
 50  (-> list? any/c)
 51  (let ([sorted (sort seq > #:key string-length)])
 52    (list-ref
 53     sorted
 54     (random-favoring-smaller-numbers (length seq)))))
 55
 56(define/contract (strings-containing-word w c)
 57  (-> string? corpus? (listof string?))
 58  (dict-ref (corpus-strings-by-word c) w))
 59
 60(provide  incubot-sentence)
 61(define incubot-sentence
 62  (match-lambda*
 63   [(list (? list? s) (? corpus? c))
 64    (incubot-sentence (wordlist->wordset s) c)]
 65   [(list (? string? s) (? corpus? c))
 66    (incubot-sentence (string->words s) c)]
 67   [(list (? set? ws) (? corpus? c))
 68    (let ([rare (rarest ws c)])
 69      ((*incubot-logger*) "incubot corpus has ~a entries" (set-count (corpus-strings c)))
 70      (and rare
 71           ((*incubot-logger*) "incubot chose ~s" rare)
 72           (random-choose (strings-containing-word rare c))))]))
 73
 74(define/contract (in-corpus? s c)
 75  (string? corpus? . -> . boolean?)
 76  (set-member? (corpus-strings c) s))
 77
 78(define (make-immutable-ci-hash)
 79  (make-immutable-custom-hash
 80                    string-ci=?
 81                    (compose equal-hash-code string-downcase)))
 82
 83(define (make-corpus-from-sequence seq [limit #f])
 84  (let/ec return
 85    (for/fold ([c (corpus
 86                   (set)
 87                   (make-immutable-ci-hash))])
 88        ([(sentence forms-read) (in-indexed seq)])
 89        (when (equal? limit forms-read)
 90          (return c))
 91
 92      (add-to-corpus sentence c))))
 93
 94(provide make-corpus-from-sexps)
 95;; TODO -- somehow arrange that, if we get a fatal signal, we finish
 96;; writing out the current sexp, so that the output file remains
 97;; well-formed.
 98(define (make-corpus-from-sexps inp [limit #f])
 99  (make-corpus-from-sequence
100   (in-port
101    (lambda (ip)
102      (let ([datum (read ip)])
103        ;; this sure seems kludgy.  I wonder if there's a better way
104        (if (eof-object? datum)
105            datum
106            (utterance-text datum))))
107    inp)
108   limit))
109
110(provide make-corpus-from-file)
111(define (make-corpus-from-file ifn)
112  (call-with-input-file ifn
113    (lambda (ip)
114      (make-corpus-from-sequence (in-lines ip)))))
115
116(define (offensive? s)
117  (regexp-match #px"\\bnigger\\b" s))
118
119(provide add-to-corpus)
120(define/contract (add-to-corpus s c)
121  (-> string? corpus? corpus?)
122  (if (offensive? s)
123      (begin0
124          c
125        ((*incubot-logger*) "Not adding offensive string to corpus"))
126      (corpus
127       (set-add (corpus-strings c) s)
128       (for/fold ([h (corpus-strings-by-word c)])
129           ([w (in-set (string->words s))])
130           (dict-update h w (curry cons s) '())))))
131
132(define (setof pred)
133  (lambda (thing)
134    (and (set? thing)
135         (for/and ([item (in-set thing)])
136                  (pred item)))))
137
138(define/contract (wordlist->wordset ws)
139  ((listof string?) . -> . (setof string?))
140  (define (strip rx) (curryr (curry regexp-replace* rx) ""))
141  (apply
142   set
143   (filter (compose positive? string-length)
144           (map (compose
145                 (strip #px"^'+")
146                 (strip #px"'+$")
147                 (strip #px"[^'[:alpha:]]+"))
148                ws))))
149
150(define/contract (string->words s)
151  (string? . -> . set?)
152  (wordlist->wordset (regexp-split #rx" " (string-downcase s))))
153
154(define/contract (word-popularity w c)
155  (string? corpus? . -> . natural-number/c)
156  (length (dict-ref (corpus-strings-by-word c) w '())))
157
158(define/contract (rarest ws c)
159  (-> set? corpus? (or/c string? #f))
160  (let-values ([(_ tied-for-rarest)
161                (for/fold ([smallest-ranking +inf.0]
162                           [rarest-words-so-far (set)])
163                    ([word (in-set ws)])
164                    (let ([p (word-popularity word c)])
165                      (if (and (positive? p)
166                               (<= p smallest-ranking))
167                          (values p (set-add rarest-words-so-far word))
168                          (values smallest-ranking rarest-words-so-far))))])
169    (and (positive? (set-count tied-for-rarest))
170         (random-choose (set-map tied-for-rarest values)))))