/test/lucandra/wikipedia/WikipediaIndexWorker.java

https://github.com/d5nguyenvan/Lucandra · Java · 130 lines · 77 code · 29 blank · 24 comment · 8 complexity · a6ee2bb058deb322fe0ae29baf81eb89 MD5 · raw file

  1. /**
  2. * Copyright 2010 T Jake Luciani
  3. *
  4. * Licensed to the Apache Software Foundation (ASF) under one
  5. * or more contributor license agreements. See the NOTICE file
  6. * distributed with this work for additional information
  7. * regarding copyright ownership. The ASF licenses this file
  8. * to you under the Apache License, Version 2.0 (the
  9. * "License"); you may not use this file except in compliance
  10. * with the License. You may obtain a copy of the License at
  11. *
  12. * http://www.apache.org/licenses/LICENSE-2.0
  13. *
  14. * Unless required by applicable law or agreed to in writing, software
  15. * distributed under the License is distributed on an "AS IS" BASIS,
  16. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17. * See the License for the specific language governing permissions and
  18. * limitations under the License.
  19. */
  20. package lucandra.wikipedia;
  21. import java.util.List;
  22. import java.util.Random;
  23. import java.util.concurrent.Callable;
  24. import java.util.concurrent.ConcurrentLinkedQueue;
  25. import lucandra.CassandraUtils;
  26. import lucandra.IndexWriter;
  27. import org.apache.cassandra.thrift.Cassandra;
  28. import org.apache.cassandra.thrift.TokenRange;
  29. import org.apache.lucene.analysis.Analyzer;
  30. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  31. import org.apache.lucene.analysis.cjk.CJKAnalyzer;
  32. import org.apache.lucene.document.Document;
  33. import org.apache.lucene.document.Field;
  34. import org.apache.lucene.document.Field.Index;
  35. import org.apache.lucene.document.Field.Store;
  36. import org.apache.lucene.document.Field.TermVector;
  37. import org.apache.lucene.util.Version;
  38. import org.apache.thrift.transport.TTransportException;
  39. public class WikipediaIndexWorker implements Callable<Integer> {
  40. // each worker thread has a connection to cassandra
  41. private static ConcurrentLinkedQueue<lucandra.IndexWriter> allClients = new ConcurrentLinkedQueue<IndexWriter>();
  42. private static ThreadLocal<lucandra.IndexWriter> clientPool = new ThreadLocal<lucandra.IndexWriter>();
  43. private static ThreadLocal<Integer> batchCount = new ThreadLocal<Integer>();
  44. // get ring info
  45. private static List<TokenRange> ring;
  46. static {
  47. try {
  48. Cassandra.Iface client = CassandraUtils.createConnection();
  49. ring = client.describe_ring(CassandraUtils.keySpace);
  50. } catch (Exception e) {
  51. throw new RuntimeException(e);
  52. }
  53. }
  54. //Add shutdown hook for batched commits to complete
  55. static {
  56. Runtime.getRuntime().addShutdownHook(new Thread() {
  57. public void run() {
  58. lucandra.IndexWriter w;
  59. while ((w = allClients.poll()) != null) {
  60. w.commit();
  61. }
  62. System.err.println("committed");
  63. }
  64. });
  65. }
  66. // this is shared by all workers
  67. private static Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
  68. // this is the article to index
  69. private Article article;
  70. public WikipediaIndexWorker(Article article) {
  71. this.article = article;
  72. }
  73. private lucandra.IndexWriter getIndexWriter() throws TTransportException {
  74. lucandra.IndexWriter indexWriter = clientPool.get();
  75. if (indexWriter == null) {
  76. Random r = new Random();
  77. List<String> endpoints = ring.get(r.nextInt(ring.size())).endpoints;
  78. String endpoint = endpoints.get(r.nextInt(endpoints.size()));
  79. indexWriter = new lucandra.IndexWriter("wikipedia", CassandraUtils.createRobustConnection(endpoint, 9160, false, false));
  80. clientPool.set(indexWriter);
  81. indexWriter.setAutoCommit(false);
  82. batchCount.set(0);
  83. }
  84. return indexWriter;
  85. }
  86. public Integer call() throws Exception {
  87. lucandra.IndexWriter indexWriter = getIndexWriter();
  88. Document d = new Document();
  89. d.add(new Field("title", article.title, Store.YES, Index.ANALYZED,TermVector.WITH_POSITIONS));
  90. if (article.text != null)
  91. d.add(new Field("text", new String(article.text,"UTF-8"), Store.YES, Index.ANALYZED,TermVector.WITH_POSITIONS_OFFSETS));
  92. d.add(new Field("url", article.url, Store.YES, Index.NOT_ANALYZED));
  93. indexWriter.addDocument(d, analyzer);
  94. Integer c = batchCount.get();
  95. if ((c + 1) % 64 == 0) {
  96. indexWriter.commit();
  97. }
  98. batchCount.set(c + 1);
  99. return article.getSize();
  100. }
  101. }