
/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java

https://github.com/adsabs/solr-affiliation-disambiguation
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.SolrException;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.schema.FieldType;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;
/**
 * A base class for all analysis request handlers.
 *
 * @version $Id: AnalysisRequestHandlerBase.java 827032 2009-10-20 11:01:47Z koji $
 * @since solr 1.4
 */
public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {

  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    rsp.add("analysis", doAnalysis(req));
  }
  /**
   * Performs the analysis based on the given solr request and returns the analysis result as a named list.
   *
   * @param req The solr request.
   *
   * @return The analysis result as a named list.
   *
   * @throws Exception When analysis fails.
   */
  protected abstract NamedList doAnalysis(SolrQueryRequest req) throws Exception;
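
  // Illustrative sketch, not part of the original file: a concrete subclass would typically
  // implement doAnalysis() roughly along these lines. The parameter name "analysis.value" and
  // the field name "text" are assumptions made for this example only.
  //
  //   protected NamedList doAnalysis(SolrQueryRequest req) throws Exception {
  //     String value = req.getParams().get("analysis.value");          // raw text to analyze
  //     FieldType fieldType = req.getSchema().getFieldType("text");    // field whose chain we run
  //     AnalysisContext context =
  //         new AnalysisContext("text", fieldType, fieldType.getAnalyzer());
  //     NamedList<Object> result = new SimpleOrderedMap<Object>();
  //     result.add("text", analyzeValue(value, context));              // stage-by-stage breakdown
  //     return result;
  //   }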
  /**
   * Analyzes the given value using the given Analyzer.
   *
   * @param value   Value to analyze
   * @param context The {@link AnalysisContext analysis context}.
   *
   * @return NamedList containing the tokens produced by analyzing the given value
   */
  protected NamedList<List<NamedList>> analyzeValue(String value, AnalysisContext context) {

    Analyzer analyzer = context.getAnalyzer();

    if (!TokenizerChain.class.isInstance(analyzer)) {

      TokenStream tokenStream = null;
      try {
        tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
        tokenStream.reset();
      } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
      }
      NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
      return namedList;
    }

    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();

    NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();

    if (cfiltfacs != null) {
      String source = value;
      for (CharFilterFactory cfiltfac : cfiltfacs) {
        CharStream reader = CharReader.get(new StringReader(source));
        reader = cfiltfac.create(reader);
        source = writeCharStream(namedList, reader);
      }
    }

    TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
    List<Token> tokens = analyzeTokenStream(tokenStream);

    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));

    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);

    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
      tokenStream = tokenFilterFactory.create(listBasedTokenStream);
      List<Token> tokenList = analyzeTokenStream(tokenStream);
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context));
      listBasedTokenStream = new ListBasedTokenStream(tokenList);
    }

    return namedList;
  }
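
  // For reference (illustrative, not part of the original file): the NamedList returned above is
  // keyed by the class name of each analysis component in the order it runs, with each value being
  // the token list produced by that stage, e.g. for a whitespace tokenizer followed by a lowercase
  // filter:
  //
  //   org.apache.lucene.analysis.WhitespaceTokenizer -> [tokens after tokenization]
  //   org.apache.lucene.analysis.LowerCaseFilter     -> [tokens after lowercasing]
  //
  // Each token in those lists is the NamedList built by convertTokensToNamedLists() below.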
  /**
   * Analyzes the given text using the given analyzer and returns the produced tokens.
   *
   * @param value    The value to analyze.
   * @param analyzer The analyzer to use.
   *
   * @return The produced token list.
   */
  protected List<Token> analyzeValue(String value, Analyzer analyzer) {
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(value));
    return analyzeTokenStream(tokenStream);
  }
  /**
   * Analyzes the given TokenStream, collecting the Tokens it produces.
   *
   * @param tokenStream TokenStream to analyze
   *
   * @return List of tokens produced from the TokenStream
   */
  private List<Token> analyzeTokenStream(TokenStream tokenStream) {
    List<Token> tokens = new ArrayList<Token>();
    Token reusableToken = new Token();
    Token token = null;

    try {
      while ((token = tokenStream.next(reusableToken)) != null) {
        // the stream may reuse the same Token instance, so store a copy
        tokens.add((Token) token.clone());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    }

    return tokens;
  }
  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokens  Tokens to convert
   * @param context The analysis context
   *
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(List<Token> tokens, AnalysisContext context) {
    List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

    Collections.sort(tokens, new Comparator<Token>() {
      public int compare(Token o1, Token o2) {
        return o1.endOffset() - o2.endOffset();
      }
    });

    int position = 0;

    FieldType fieldType = context.getFieldType();

    for (Token token : tokens) {
      NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();

      String text = fieldType.indexedToReadable(token.term());
      tokenNamedList.add("text", text);
      if (!text.equals(token.term())) {
        tokenNamedList.add("raw_text", token.term());
      }
      tokenNamedList.add("type", token.type());
      tokenNamedList.add("start", token.startOffset());
      tokenNamedList.add("end", token.endOffset());

      position += token.getPositionIncrement();
      tokenNamedList.add("position", position);

      if (context.getTermsToMatch().contains(token.term())) {
        tokenNamedList.add("match", true);
      }

      if (token.getPayload() != null) {
        tokenNamedList.add("payload", token.getPayload());
      }

      tokensNamedLists.add(tokenNamedList);
    }

    return tokensNamedLists;
  }
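
  // Example of a single token entry produced above (values are illustrative only):
  //
  //   text = "harvard", type = "word", start = 0, end = 7, position = 1
  //
  // "raw_text" is added only when indexedToReadable() changes the term, "match" only when the
  // term is in the context's termsToMatch set, and "payload" only when the token carries one.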
  private String writeCharStream(NamedList out, CharStream input) {
    final int BUFFER_SIZE = 1024;
    char[] buf = new char[BUFFER_SIZE];
    int len = 0;
    StringBuilder sb = new StringBuilder();
    do {
      try {
        len = input.read(buf, 0, BUFFER_SIZE);
      } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
      }
      if (len > 0)
        sb.append(buf, 0, len);
    } while (len == BUFFER_SIZE);

    out.add(input.getClass().getName(), sb.toString());
    return sb.toString();
  }
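
  // Note: unlike the token stages, each CharFilter stage is recorded in the NamedList as a plain
  // String (the text after filtering), keyed by the CharStream implementation's class name.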
  // ================================================= Inner classes =================================================

  /**
   * TokenStream that iterates over a list of pre-existing Tokens
   */
  protected static class ListBasedTokenStream extends TokenStream {

    private final Iterator<Token> tokenIterator;

    /**
     * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
     *
     * @param tokens Source of tokens to be used
     */
    ListBasedTokenStream(List<Token> tokens) {
      tokenIterator = tokens.iterator();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Token next(Token token) throws IOException {
      return (tokenIterator.hasNext()) ? tokenIterator.next() : null;
    }
  }
  /**
   * Serves as the context of an analysis process. This context holds the field name, the field
   * type, the analyzer to use and the set of terms that should be marked as matches.
   */
  protected static class AnalysisContext {

    private final String fieldName;
    private final FieldType fieldType;
    private final Analyzer analyzer;
    private final Set<String> termsToMatch;

    /**
     * Constructs a new AnalysisContext with a given field type, analyzer and
     * termsToMatch. By default the field name in this context will be
     * {@code null}. During the analysis process, the produced tokens will
     * be compared to the terms in the {@code termsToMatch} set. When found,
     * these tokens will be marked as a match.
     *
     * @param fieldType    The type of the field the analysis is performed on.
     * @param analyzer     The analyzer to be used.
     * @param termsToMatch Holds all the terms that should match during the
     *                     analysis process.
     */
    public AnalysisContext(FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
      this(null, fieldType, analyzer, termsToMatch);
    }

    /**
     * Constructs an AnalysisContext with a given field name, field type
     * and analyzer. By default this context will hold no terms to match.
     *
     * @param fieldName The name of the field the analysis is performed on
     *                  (may be {@code null}).
     * @param fieldType The type of the field the analysis is performed on.
     * @param analyzer  The analyzer to be used during the analysis process.
     */
    public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer) {
      this(fieldName, fieldType, analyzer, Collections.EMPTY_SET);
    }
    /**
     * Constructs a new AnalysisContext with a given field name, field type, analyzer and
     * termsToMatch. During the analysis process, the produced tokens will be
     * compared to the terms in the {@code termsToMatch} set. When found,
     * these tokens will be marked as a match.
     *
     * @param fieldName    The name of the field the analysis is performed on
     *                     (may be {@code null}).
     * @param fieldType    The type of the field the analysis is performed on.
     * @param analyzer     The analyzer to be used.
     * @param termsToMatch Holds all the terms that should match during the
     *                     analysis process.
     */
    public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
      this.fieldName = fieldName;
      this.fieldType = fieldType;
      this.analyzer = analyzer;
      this.termsToMatch = termsToMatch;
    }

    public String getFieldName() {
      return fieldName;
    }

    public FieldType getFieldType() {
      return fieldType;
    }

    public Analyzer getAnalyzer() {
      return analyzer;
    }

    public Set<String> getTermsToMatch() {
      return termsToMatch;
    }
  }
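
  // Illustrative sketch, not part of the original file: building an AnalysisContext directly,
  // e.g. from a subclass or a test. The field name "affiliation", the term "harvard" and the
  // schema variable are assumptions made for this example only.
  //
  //   FieldType fieldType = schema.getFieldType("affiliation");
  //   Set<String> termsToMatch = new HashSet<String>(Arrays.asList("harvard"));
  //   AnalysisContext ctx =
  //       new AnalysisContext("affiliation", fieldType, fieldType.getQueryAnalyzer(), termsToMatch);
  //   NamedList<List<NamedList>> tokens = analyzeValue("Harvard University", ctx);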
}