PageRenderTime 31ms CodeModel.GetById 26ms app.highlight 3ms RepoModel.GetById 0ms app.codeStats 0ms

/zf/library/Zend/Search/Lucene/Analysis/Analyzer.php

http://github.com/eryx/php-framework-benchmark
PHP | 175 lines | 46 code | 30 blank | 99 comment | 3 complexity | 1f188aefb9c1a92204c66dc53f83c955 MD5 | raw file
Possible License(s): MIT, BSD-3-Clause, Apache-2.0, LGPL-2.1, LGPL-3.0, BSD-2-Clause
  1<?php
  2/**
  3 * Zend Framework
  4 *
  5 * LICENSE
  6 *
  7 * This source file is subject to the new BSD license that is bundled
  8 * with this package in the file LICENSE.txt.
  9 * It is also available through the world-wide-web at this URL:
 10 * http://framework.zend.com/license/new-bsd
 11 * If you did not receive a copy of the license and are unable to
 12 * obtain it through the world-wide-web, please send an email
 13 * to license@zend.com so we can send you a copy immediately.
 14 *
 15 * @category   Zend
 16 * @package    Zend_Search_Lucene
 17 * @subpackage Analysis
 18 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 19 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 20 * @version    $Id: Analyzer.php 23775 2011-03-01 17:25:24Z ralph $
 21 */
 22
 23
 24/** User land classes and interfaces turned on by Zend/Search/Analyzer.php file inclusion. */
 25/** @todo Section should be removed with ZF 2.0 release as obsolete                      */
 26if (!defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED')) {
 27    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
 28    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
 29
 30    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
 31    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
 32
 33    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
 34    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
 35
 36    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
 37    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
 38
 39    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
 40    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
 41
 42    /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
 43    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
 44
 45    /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
 46    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
 47
 48    /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
 49    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
 50}
 51
 52
 53/**
 54 * An Analyzer is used to analyze text.
 55 * It thus represents a policy for extracting index terms from text.
 56 *
 57 * Note:
 58 * Lucene Java implementation is oriented to streams. It provides effective work
 59 * with a huge documents (more then 20Mb).
 60 * But engine itself is not oriented such documents.
 61 * Thus Zend_Search_Lucene analysis API works with data strings and sets (arrays).
 62 *
 63 * @category   Zend
 64 * @package    Zend_Search_Lucene
 65 * @subpackage Analysis
 66 * @copyright  Copyright (c) 2005-2011 Zend Technologies USA Inc. (http://www.zend.com)
 67 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 68 */
 69
 70abstract class Zend_Search_Lucene_Analysis_Analyzer
 71{
 72    /**
 73     * The Analyzer implementation used by default.
 74     *
 75     * @var Zend_Search_Lucene_Analysis_Analyzer
 76     */
 77    private static $_defaultImpl;
 78
 79    /**
 80     * Input string
 81     *
 82     * @var string
 83     */
 84    protected $_input = null;
 85
 86    /**
 87     * Input string encoding
 88     *
 89     * @var string
 90     */
 91    protected $_encoding = '';
 92
 93    /**
 94     * Tokenize text to a terms
 95     * Returns array of Zend_Search_Lucene_Analysis_Token objects
 96     *
 97     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
 98     *
 99     * @param string $data
100     * @return array
101     */
102    public function tokenize($data, $encoding = '')
103    {
104        $this->setInput($data, $encoding);
105
106        $tokenList = array();
107        while (($nextToken = $this->nextToken()) !== null) {
108            $tokenList[] = $nextToken;
109        }
110
111        return $tokenList;
112    }
113
114
115    /**
116     * Tokenization stream API
117     * Set input
118     *
119     * @param string $data
120     */
121    public function setInput($data, $encoding = '')
122    {
123        $this->_input    = $data;
124        $this->_encoding = $encoding;
125        $this->reset();
126    }
127
128    /**
129     * Reset token stream
130     */
131    abstract public function reset();
132
133    /**
134     * Tokenization stream API
135     * Get next token
136     * Returns null at the end of stream
137     *
138     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
139     *
140     * @return Zend_Search_Lucene_Analysis_Token|null
141     */
142    abstract public function nextToken();
143
144
145
146
147    /**
148     * Set the default Analyzer implementation used by indexing code.
149     *
150     * @param Zend_Search_Lucene_Analysis_Analyzer $similarity
151     */
152    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
153    {
154        self::$_defaultImpl = $analyzer;
155    }
156
157
158    /**
159     * Return the default Analyzer implementation used by indexing code.
160     *
161     * @return Zend_Search_Lucene_Analysis_Analyzer
162     */
163    public static function getDefault()
164    {
165        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
166        require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
167
168        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
169            self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
170        }
171
172        return self::$_defaultImpl;
173    }
174}
175