PageRenderTime 87ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 1ms

/jacked_markovGenerator.php

http://poordecisions.googlecode.com/
PHP | 557 lines | 294 code | 39 blank | 224 comment | 38 complexity | 04d5259f8af0d1f72fd42fd5b51d300d MD5 | raw file
Possible License(s): Apache-2.0
  1. <?php
  2. /**
  3. * Copyright (c) 2008 Rob Tinsley (www.bitari.com)
  4. *
  5. * Permission is hereby granted, free of charge, to any person
  6. * obtaining a copy of this software and associated documentation
  7. * files (the "Software"), to deal in the Software without
  8. * restriction, including without limitation the rights to use,
  9. * copy, modify, merge, publish, distribute, sublicense, and/or sell
  10. * copies of the Software, and to permit persons to whom the
  11. * Software is furnished to do so, subject to the following
  12. * conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be
  15. * included in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  18. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  19. * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  20. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  21. * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  22. * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  24. * OTHER DEALINGS IN THE SOFTWARE.
  25. *
  26. * @package MarkovLetterChain
  27. * @author Rob Tinsley (www.bitari.com)
  28. * @copyright 2008 Rob Tinsley
  29. * @license http://www.opensource.org/licenses/mit-license.php MIT License
  30. * @link http://www.bitari.com/
  31. */
  32. /*
  33. * CHANGES:
  34. *
  35. * 2008-02-04 initial version
  36. * 2008-02-05 documentation started
  37. * 2008-02-07 renamed some variables, documentation updates
  38. * 2008-02-15 renamed some variables, documentation updates
  39. * 2008-02-21 unicode (UTF-8) enhancements
  40. */
  41. /**
  42. * @package MarkovLetterChain
  43. * @version 2008-02-21 (alpha)
  44. */
  class MarkovLetterChain
  {
      /**
       * The order of the chain, 1 for first order, 2 for second order, and so on.
       *
       * Internal state; kept publicly visible only for backward compatibility
       * with the original `var` declaration.
       *
       * @var integer
       */
      public $_order;

      /**
       * The frequency table, where _table[seed_string][next_letter] = frequency.
       *
       * Two special letters, '^' and '$', denote the beginning and the end of a
       * word respectively. Each seed also carries a '#sum' entry holding the
       * total of its letter frequencies.
       *
       * @var array
       */
      public $_table = array();

      /**
       * The list of words fed to the object, where _dictionary[word] = count.
       *
       * @var array
       */
      public $_dictionary = array();

      /**
       * The entropy (in bits) of the last string returned from _generate().
       *
       * @var float
       */
      public $_last_generated_entropy = 0;

      /**
       * Whether to use UTF-8 (rather than byte-wise ASCII) string functions.
       *
       * @var boolean
       */
      public $_utf8_enabled = false;

      /* **************************************** */

      /**
       * Constructor.
       *
       * Numeric values are cast to integer; anything else yields order 0, which
       * makes feeding and generating no-ops (they require an integer >= 1).
       *
       * @param integer $order (optional) the order of the chain, defaults to 1
       */
      public function __construct($order = 1)
      {
          $this->_order = is_numeric($order) ? (int) $order : 0;
      }

      /**
       * Legacy PHP 4 style constructor, kept as an ordinary method.
       *
       * PHP 8 no longer treats a method named after the class as a constructor,
       * so construction goes through __construct(); this alias remains for code
       * that calls it explicitly (e.g. parent::MarkovLetterChain()).
       *
       * @param integer $order (optional) the order of the chain, defaults to 1
       * @return void
       * @deprecated use the regular constructor
       */
      public function MarkovLetterChain($order = 1)
      {
          $this->_order = is_numeric($order) ? (int) $order : 0;
      }

      /**
       * Save the object's state to a file.
       *
       * Saving the dictionary is optional as it might increase the file size
       * substantially. The UTF-8 flag is stored as well so that a reloaded chain
       * keeps working in the same mode.
       *
       * @param string $filename the file to save the object state into
       * @param boolean $save_dictionary (optional) whether to include the dictionary in the saved state, defaults to true
       * @return boolean true, or false if there was an error
       */
      public function save_state($filename, $save_dictionary = true)
      {
          $serial = serialize(array(
              'order' => $this->_order,
              'table' => $this->_table,
              'dictionary' => $save_dictionary ? $this->_dictionary : array(),
              'utf8' => $this->_utf8_enabled,
          ));

          // Failure is reported through the return value, so the warning is suppressed.
          $bytes = @file_put_contents($filename, $serial);

          return $bytes === strlen($serial);
      }

      /**
       * Load the object's state from a file.
       *
       * Loading the dictionary is optional as it might increase memory usage
       * substantially. If the dictionary is not loaded (or the state file was
       * saved without it) generate() cannot filter out dictionary words.
       *
       * The current state is cleared before reading, so after a failed load the
       * object is empty.
       *
       * NOTE: the file is parsed with unserialize(); only load files you trust.
       * Object instantiation is disabled where the PHP version supports it.
       *
       * @param string $filename the file to load the object state from
       * @param boolean $load_dictionary (optional) whether to include the dictionary in the loaded state, defaults to true
       * @return boolean true, or false if there was an error
       */
      public function load_state($filename, $load_dictionary = true)
      {
          $this->_order = 0;
          $this->_table = array();
          $this->_dictionary = array();
          $this->_last_generated_entropy = 0;
          $this->_utf8_enabled = false;

          $contents = @file_get_contents($filename);
          if (!is_string($contents)) {
              return false;
          }

          // The second argument of unserialize() only exists from PHP 7.0 on.
          if (defined('PHP_VERSION_ID') && PHP_VERSION_ID >= 70000) {
              $state = @unserialize($contents, array('allowed_classes' => false));
          } else {
              $state = @unserialize($contents);
          }
          unset($contents);

          if (!is_array($state)
              || !isset($state['order'], $state['table'])
              || !is_array($state['table'])
          ) {
              return false;
          }

          $this->_order = is_numeric($state['order']) ? (int) $state['order'] : 0;
          $this->_table = $state['table'];
          if ($load_dictionary && isset($state['dictionary']) && is_array($state['dictionary'])) {
              $this->_dictionary = $state['dictionary'];
          }
          // State files written by older versions have no 'utf8' key: stay in ASCII mode.
          $this->_utf8_enabled = !empty($state['utf8']);

          return true;
      }

      /**
       * Print the frequency table on stdout, one "seed|letter<TAB>frequency" per line.
       *
       * @return void
       */
      public function print_table()
      {
          foreach ($this->_table as $seed => $letters) {
              foreach ($letters as $letter => $freq) {
                  // '#sum' is bookkeeping, not a letter
                  if ($letter !== '#sum') {
                      print "$seed|$letter\t$freq\n";
                  }
              }
          }
      }

      /**
       * Print the dictionary on stdout, one "word<TAB>count" per line.
       *
       * @return void
       */
      public function print_dictionary()
      {
          foreach ($this->_dictionary as $word => $freq) {
              print "$word\t$freq\n";
          }
      }

      /**
       * Whether a particular word is in the dictionary.
       *
       * @param string $word the word to look for in the dictionary
       * @return boolean true if the word is in the dictionary, otherwise false
       */
      public function in_dictionary($word)
      {
          return array_key_exists($word, $this->_dictionary);
      }

      /* **************************************** */

      /**
       * Whether to work in UTF-8 rather than ASCII.
       *
       * @param mixed $enable true to enable, false to disable, null to make no change
       * @return boolean true if enabled before the function was called, otherwise false
       */
      public function enable_utf8($enable = true)
      {
          $old = $this->_utf8_enabled;
          if ($enable !== null) {
              $this->_utf8_enabled = (bool) $enable;
          }

          return $old;
      }

      /**
       * Get string length (using UTF-8 functions if appropriate).
       *
       * @param string $string the string being measured for length
       * @return integer the length of the string, 0 if the string is empty
       */
      public function _strlen($string)
      {
          if ($this->_utf8_enabled) {
              return mb_strlen($string, 'UTF-8');
          }

          return strlen($string);
      }

      /**
       * Return part of a string (using UTF-8 functions if appropriate).
       *
       * @param string $string the input string
       * @param integer $start the starting position, counting from 0 (negative counts from the end)
       * @param integer $length (optional) the maximum length of the string to return, defaults to the rest of the string
       * @return string the requested part, or '' if it could not be extracted
       */
      public function _substr($string, $start, $length = null)
      {
          if ($this->_utf8_enabled) {
              if ($length === null) {
                  // The byte length is always >= the character count, so it is a
                  // safe "to the end" bound regardless of the internal encoding.
                  $length = strlen($string);
              }
              $r = mb_substr($string, $start, $length, 'UTF-8');
          } elseif ($length !== null) {
              $r = substr($string, $start, $length);
          } else {
              $r = substr($string, $start);
          }

          // Older PHP versions return false instead of '' for out-of-range requests.
          return is_string($r) ? $r : '';
      }

      /**
       * Add more words to the frequency table and the dictionary.
       *
       * The input is lower-cased and split into words made of letters only.
       * Input that cannot be converted from $charset is ignored.
       *
       * @param string $stuff a free-form string
       * @param string $charset (optional) the character set of the input string
       * @return void
       */
      public function feed($stuff, $charset = null)
      {
          if ($charset !== null) {
              $target = $this->_utf8_enabled ? 'UTF-8//TRANSLIT' : 'ASCII//TRANSLIT';
              $stuff = @iconv($charset, $target, $stuff);
              if (!is_string($stuff)) {
                  return; // conversion failed, nothing usable to feed
              }
          }

          $words = array();
          if ($this->_utf8_enabled) {
              $stuff = mb_strtolower($stuff, 'UTF-8');
              $found = preg_match_all( "/(?<=[^'\p{L}_\p{N}.]|\s')[\p{L}]+(?=[^'\p{L}_\p{N}.]|\.[^\p{L}_\p{N}])/iu", " $stuff ", $words );
          } else {
              $stuff = strtolower($stuff);
              $found = preg_match_all( "/(?<=[^'a-z_0-9.]|\s')[a-z]+(?=[^'a-z_0-9.]|\.[^a-z_0-9])/i", " $stuff ", $words );
          }
          unset($stuff);

          // preg_match_all() returns false (and leaves no usable matches) on
          // errors such as malformed UTF-8 input.
          if (!$found || empty($words[0])) {
              return;
          }

          foreach ($words[0] as $word) {
              $this->_word($word);
          }
      }

      /**
       * Add one word to the frequency table and (optionally) the dictionary.
       *
       * @param string $word the word to add to the frequency table
       * @param boolean $add_to_dictionary (optional) whether to also add the word to the dictionary, defaults to true
       * @return void
       */
      public function _word($word, $add_to_dictionary = true)
      {
          if (!is_int($this->_order) || $this->_order < 1) {
              return;
          }

          if ($add_to_dictionary) {
              if (array_key_exists($word, $this->_dictionary)) {
                  $this->_dictionary[$word]++;
              } else {
                  $this->_dictionary[$word] = 1;
              }
          }

          $word = '^' . $word . '$'; // mark the beginning and end of $word
          $len = $this->_strlen($word);

          // process all substrings at the beginning of $word shorter than $_order
          for ($leadin = 2; $leadin <= $this->_order && $leadin <= $len; $leadin++) {
              $this->_segment($this->_substr($word, 0, $leadin));
          }

          // process all substrings of $word with length $_order (plus the letter that follows)
          for ($cursor = 0; $cursor < $len - $this->_order; $cursor++) {
              $this->_segment($this->_substr($word, $cursor, $this->_order + 1));
          }
      }

      /**
       * Add part of a word to the frequency table.
       *
       * The last letter of the segment is counted as a follower of the seed
       * formed by all preceding letters; the seed's '#sum' is increased too.
       *
       * @param string $segment the word-segment to add to the frequency table
       * @return void
       */
      public function _segment($segment)
      {
          $s0 = $this->_substr($segment, 0, -1);
          $s1 = $this->_substr($segment, -1, 1);

          if (!isset($this->_table[$s0][$s1])) {
              $this->_table[$s0][$s1] = 0;
          }
          $this->_table[$s0][$s1]++;

          if (!isset($this->_table[$s0]['#sum'])) {
              $this->_table[$s0]['#sum'] = 0;
          }
          $this->_table[$s0]['#sum']++;
      }

      /* **************************************** */

      /**
       * Replaces every frequency in the frequency table by its n-th root.
       *
       * @param float $n (optional) the root to use, defaults to 2 (square root)
       * @return void
       */
      public function root($n = 2)
      {
          $this->_freq_formula('root', $n);
      }

      /**
       * Multiplies all frequencies in the frequency table by a constant.
       *
       * @param float $n the constant to multiply the frequencies by
       * @return void
       */
      public function multiply($n)
      {
          $this->_freq_formula('multiply', $n);
      }

      /**
       * Drops all frequency table entries with a frequency below a given threshold.
       *
       * @param float $n the threshold for all frequencies
       * @return void
       */
      public function threshold($n)
      {
          $this->_freq_formula('threshold', $n);
      }

      /**
       * Applies a formula to every frequency in the frequency table.
       *
       * Results are rounded to the nearest integer; entries that end up at zero
       * are removed, as are seeds left without any entries. The '#sum' of every
       * remaining seed is recomputed.
       *
       * @param string $f the name of the formula: 'root', 'multiply' or 'threshold'
       * @param mixed $n (optional) a constant also passed to the formula
       * @return void
       */
      public function _freq_formula($f, $n = null)
      {
          if ($f === 'root' && $n == 0) {
              return; // the 0th root is undefined (1 / $n would divide by zero)
          }

          foreach (array_keys($this->_table) as $seed) {
              unset($this->_table[$seed]['#sum']);

              foreach ($this->_table[$seed] as $letter => $freq) {
                  switch ($f) {
                      case 'root':
                          $freq = ($n == 2) ? sqrt($freq) : pow($freq, 1 / $n);
                          break;
                      case 'multiply':
                          $freq *= $n;
                          break;
                      case 'threshold':
                          if ($freq < $n) {
                              $freq = 0;
                          }
                          break;
                  }

                  $freq = intval($freq + 0.5); // round to nearest integer
                  if ($freq > 0) {
                      $this->_table[$seed][$letter] = $freq;
                  } else {
                      unset($this->_table[$seed][$letter]);
                  }
              }

              if (count($this->_table[$seed])) {
                  $this->_table[$seed]['#sum'] = array_sum($this->_table[$seed]);
              } else {
                  unset($this->_table[$seed]);
              }
          }
      }

      /* **************************************** */

      /**
       * Generate a random word within the given length limits.
       *
       * Up to 100 candidates are drawn; the first one that satisfies the
       * constraints is returned.
       *
       * @param integer $minlen the shortest allowed word
       * @param integer $maxlen the longest allowed word
       * @param boolean $allow_dictionary_words (optional) whether to allow the function to return dictionary words, defaults to true
       * @return mixed NULL if there was an error or no candidate qualified, otherwise a string
       */
      public function generate($minlen, $maxlen, $allow_dictionary_words = true)
      {
          for ($i = 0; $i < 100; $i++) {
              $word = $this->_generate();
              if (!is_string($word)) {
                  return null;
              }
              if (!$allow_dictionary_words && array_key_exists($word, $this->_dictionary)) {
                  continue;
              }
              $wordlen = $this->_strlen($word);
              if ($wordlen >= $minlen && $wordlen <= $maxlen) {
                  return $word;
              }
          }

          return null;
      }

      /**
       * Generate a random word by walking the chain from '^' to '$'.
       *
       * Returns NULL instead of looping forever when the walk reaches a seed
       * that has no followers (untrained chain, or entries removed by
       * threshold()/multiply()/root()), or when no end marker is reached within
       * a generous number of letters.
       *
       * @return mixed NULL if there was an error, otherwise a string
       * @uses MarkovLetterChain::rand()
       */
      public function _generate()
      {
          $this->_last_generated_entropy = 0;

          if (!is_int($this->_order) || $this->_order < 1) {
              return null;
          }

          $max_letters = 10000; // safety cap against cycles that never reach '$'
          $k = 1.0;
          $word = '';
          $seed = '^';

          for ($count = 0; $count <= $max_letters; $count++) {
              if (!isset($this->_table[$seed]['#sum']) || $this->_table[$seed]['#sum'] < 1) {
                  return null; // dead end: nothing is known to follow this seed
              }
              $sum = $this->_table[$seed]['#sum'];

              // Pick a follower with probability proportional to its frequency.
              $r = $this->rand(0, $sum - 1);
              $picked = null;
              $picked_freq = 0;
              foreach ($this->_table[$seed] as $letter => $freq) {
                  if ($letter === '#sum') {
                      continue;
                  }
                  $picked = (string) $letter;
                  $picked_freq = $freq;
                  if ($r < $freq) {
                      break;
                  }
                  $r -= $freq;
              }
              if ($picked === null || $picked_freq <= 0) {
                  return null;
              }

              // Accumulate 1/probability of the path taken; its log2 is the entropy.
              $k *= $sum / $picked_freq;

              if ($picked === '$') {
                  $this->_last_generated_entropy = log($k, 2);
                  return $word;
              }

              $word .= $picked;
              $seed .= $picked;
              if ($this->_strlen($seed) > $this->_order) {
                  $seed = $this->_substr($seed, -$this->_order, $this->_order);
              }
          }

          return null;
      }

      /**
       * Generate a random integer.
       *
       * This is the single source of randomness for _generate(); override it in
       * a subclass to plug in another generator. NOTE: PHP's rand() is not
       * cryptographically secure.
       *
       * @param integer $min the lowest value to return
       * @param integer $max the highest value to return
       * @return integer A (pseudo-)random value in the range [min,max]
       */
      public function rand($min, $max)
      {
          return rand($min, $max);
      }

      /**
       * Return the entropy of the last string returned from _generate() in bits.
       *
       * IMPORTANT NOTE: the entropy of the string returned from generate()
       * [without the leading underscore] will be lower, as that function
       * imposes additional constraints, such as on the length of the word,
       * and (optionally) excludes dictionary words.
       *
       * @return float the entropy of the last string returned from _generate() in bits
       */
      public function last_generated_entropy()
      {
          return $this->_last_generated_entropy;
      }
  }
  509. ?>