/zf/library/Zend/Search/Lucene/Analysis/Analyzer.php
PHP | 175 lines | 46 code | 30 blank | 99 comment | 3 complexity | 1f188aefb9c1a92204c66dc53f83c955 MD5 | raw file
Possible License(s): MIT, BSD-3-Clause, Apache-2.0, LGPL-2.1, LGPL-3.0, BSD-2-Clause
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id: Analyzer.php 23775 2011-03-01 17:25:24Z ralph $
 */


/**
 * User-land analyzer classes loaded as a side effect of including this file.
 * The whole section is skipped when ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED
 * has already been defined.
 */
/** @todo Section should be removed with ZF 2.0 release as obsolete */
if (!defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED')) {
    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';

    /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
}


/**
 * An Analyzer is used to analyze text.
 * It thus represents a policy for extracting index terms from text.
 *
 * Note:
 * The Java implementation of Lucene is stream-oriented, which lets it work
 * efficiently with huge documents (more than 20Mb).
 * The engine itself, however, is not oriented towards such documents.
 * Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Analyzer instance handed out by getDefault(); unset until
     * setDefault() or getDefault() has been called.
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer|null
     */
    private static $_defaultImpl;

    /**
     * Text currently being tokenized (null until setInput() is called)
     *
     * @var string|null
     */
    protected $_input = null;

    /**
     * Encoding of the input string
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Split text into terms.
     *
     * Passes the text to setInput() and then collects every token produced
     * by nextToken() until it returns null.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @param string $data     text to tokenize
     * @param string $encoding encoding of $data
     * @return array Zend_Search_Lucene_Analysis_Token objects, in stream order
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $tokens = array();
        for ($token = $this->nextToken(); $token !== null; $token = $this->nextToken()) {
            $tokens[] = $token;
        }

        return $tokens;
    }


    /**
     * Tokenization stream API
     * Store the input text and its encoding, then reset the token stream
     *
     * @param string $data     text to tokenize
     * @param string $encoding encoding of $data
     */
    public function setInput($data, $encoding = '')
    {
        $this->_encoding = $encoding;
        $this->_input    = $data;

        // reset() runs last so that it sees the new input and encoding.
        $this->reset();
    }

    /**
     * Tokenization stream API
     * Reset the token stream; invoked by setInput() after new input is stored
     */
    abstract public function reset();

    /**
     * Tokenization stream API
     * Get the next token
     * Returns null at the end of the stream
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();




    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }


    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * When no default has been set yet, a new
     * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive is
     * created, stored as the default and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        return self::$_defaultImpl;
    }
}