/zf/library/Zend/Search/Lucene/Analysis/Analyzer.php

http://github.com/eryx/php-framework-benchmark · PHP · 175 lines · 46 code · 30 blank · 99 comment · 3 complexity · 1f188aefb9c1a92204c66dc53f83c955 MD5 · raw file

  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Search_Lucene
  17. * @subpackage Analysis
  18. * @copyright Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id: Analyzer.php 23775 2011-03-01 17:25:24Z ralph $
  21. */
  22. /** The eight Common analyzer files listed below are loaded as a side effect of including this file, unless the ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED constant is already defined. NOTE(review): the constant is not defined in this file - presumably set by another file to skip this section; confirm. */
  23. /** @todo Remove this section with the ZF 2.0 release, where it becomes obsolete. */
  24. if (!defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED')) {
  25. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  26. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
  27. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
  28. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
  29. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  30. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
  31. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
  32. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
  33. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
  34. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
  35. /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  36. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  37. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
  38. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
  39. /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
  40. require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
  41. }
  /**
   * Base class for text analyzers.
   *
   * An Analyzer is used to analyze text; it represents a policy for
   * extracting index terms from text. Concrete subclasses implement
   * reset() and nextToken(); this class supplies tokenize(), setInput()
   * and the static default-analyzer accessors.
   *
   * Note:
   * The Java Lucene implementation is stream-oriented, which lets it work
   * efficiently with huge documents (more than 20Mb). This engine itself is
   * not designed for such documents, so the Zend_Search_Lucene analysis API
   * works with data strings and sets (arrays) instead.
   *
   * @category   Zend
   * @package    Zend_Search_Lucene
   * @subpackage Analysis
   * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
   * @license    http://framework.zend.com/license/new-bsd     New BSD License
   */
  abstract class Zend_Search_Lucene_Analysis_Analyzer
  {
      /**
       * The Analyzer implementation used by default.
       *
       * Stays unset until setDefault() or getDefault() is called.
       *
       * @var Zend_Search_Lucene_Analysis_Analyzer|null
       */
      private static $_defaultImpl;

      /**
       * Input string (null until setInput() has been called).
       *
       * @var string|null
       */
      protected $_input = null;

      /**
       * Input string encoding, as passed to setInput().
       *
       * @var string
       */
      protected $_encoding = '';

      /**
       * Tokenize text into terms.
       *
       * Feeds the data through setInput() and then drains nextToken() until
       * it returns null, collecting every token in order.
       *
       * Implementations are expected to return tokens in UTF-8
       * (the internal Zend_Search_Lucene encoding).
       *
       * @param string $data     Text to tokenize
       * @param string $encoding Encoding of $data; this class only stores it
       *                         for use by the concrete analyzer
       * @return array Tokens in the order produced by nextToken()
       */
      public function tokenize($data, $encoding = '')
      {
          $this->setInput($data, $encoding);

          $tokenList = array();
          while (($nextToken = $this->nextToken()) !== null) {
              $tokenList[] = $nextToken;
          }

          return $tokenList;
      }

      /**
       * Tokenization stream API
       * Set input
       *
       * Stores the data and its encoding, then calls reset() so the token
       * stream restarts on the new input.
       *
       * @param string $data     Text to be tokenized
       * @param string $encoding Encoding of $data
       */
      public function setInput($data, $encoding = '')
      {
          $this->_input    = $data;
          $this->_encoding = $encoding;
          $this->reset();
      }

      /**
       * Reset token stream
       *
       * Called by setInput() after the input and encoding have been stored.
       */
      abstract public function reset();

      /**
       * Tokenization stream API
       * Get next token
       * Returns null at the end of stream
       *
       * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       *
       * @return Zend_Search_Lucene_Analysis_Token|null
       */
      abstract public function nextToken();

      /**
       * Set the default Analyzer implementation used by indexing code.
       *
       * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
       */
      public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
      {
          self::$_defaultImpl = $analyzer;
      }

      /**
       * Return the default Analyzer implementation used by indexing code.
       *
       * When no default has been set yet, a
       * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive
       * instance is created, remembered and returned.
       *
       * @return Zend_Search_Lucene_Analysis_Analyzer
       */
      public static function getDefault()
      {
          if (!(self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer)) {
              // Load the fallback implementation only when it is actually needed,
              // so an explicitly configured default never touches the include path.
              /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
              require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

              self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
          }

          return self::$_defaultImpl;
      }
  }