WikipediaIndexWorker.java

/test/lucandra/wikipedia/WikipediaIndexWorker.java

https://github.com/d5nguyenvan/Lucandra · Java · 130 lines · 77 code · 29 blank · 24 comment · 8 complexity · a6ee2bb058deb322fe0ae29baf81eb89 MD5 · raw file


/**
 * Copyright 2010 T Jake Luciani
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra.wikipedia;

import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;

import lucandra.CassandraUtils;
import lucandra.IndexWriter;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.TokenRange;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.util.Version;
import org.apache.thrift.transport.TTransportException;

/**
 * Callable that indexes one Wikipedia {@link Article} into the "wikipedia"
 * Lucandra index.
 *
 * <p>Each thread that runs a worker lazily creates its own
 * {@link lucandra.IndexWriter}, kept in a {@link ThreadLocal}, with auto-commit
 * switched off. Documents are committed explicitly every
 * {@link #COMMIT_BATCH_SIZE} additions on that thread. Every writer is also
 * registered in a shared queue so the JVM shutdown hook can commit whatever is
 * still pending when the process exits.
 */
public class WikipediaIndexWorker implements Callable<Integer> {

    // number of documents a thread adds between two explicit commits
    private static final int COMMIT_BATCH_SIZE = 64;

    // port passed to CassandraUtils.createRobustConnection for the chosen endpoint
    private static final int CASSANDRA_PORT = 9160;

    // name of the index the writers are opened on
    private static final String INDEX_NAME = "wikipedia";

    // every writer ever created by a worker thread; drained by the shutdown hook
    private static final ConcurrentLinkedQueue<lucandra.IndexWriter> allClients = new ConcurrentLinkedQueue<IndexWriter>();

    // each worker thread has a connection to cassandra
    private static final ThreadLocal<lucandra.IndexWriter> clientPool = new ThreadLocal<lucandra.IndexWriter>();

    // per-thread count of documents added through that thread's writer
    private static final ThreadLocal<Integer> batchCount = new ThreadLocal<Integer>();

    // get ring info
    private static final List<TokenRange> ring;
    static {
        try {
            Cassandra.Iface client = CassandraUtils.createConnection();
            ring = client.describe_ring(CassandraUtils.keySpace);
        } catch (Exception e) {
            // the class is unusable without ring information, so fail class loading
            throw new RuntimeException(e);
        }
    }

    //Add shutdown hook for batched commits to complete
    static {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                // NOTE(review): worker threads may still be using these writers
                // while the hook runs - confirm IndexWriter.commit() tolerates that
                lucandra.IndexWriter w;
                while ((w = allClients.poll()) != null) {
                    w.commit();
                }

                System.err.println("committed");
            }
        });
    }

    // this is shared by all workers
    private static final Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);

    // this is the article to index
    private final Article article;

    /**
     * @param article the article this worker will index when {@link #call()} runs
     */
    public WikipediaIndexWorker(Article article) {
        this.article = article;
    }

    /**
     * Returns the calling thread's index writer, creating it on first use.
     *
     * <p>A new writer is connected to a randomly picked endpoint of a randomly
     * picked token range, has auto-commit disabled, and is registered for the
     * shutdown hook so its last partial batch is not lost.
     *
     * @return the writer bound to the current thread
     * @throws TTransportException if the connection to the endpoint cannot be created
     * @throws IllegalStateException if the ring or the chosen range has no endpoints
     */
    private lucandra.IndexWriter getIndexWriter() throws TTransportException {
        lucandra.IndexWriter indexWriter = clientPool.get();

        if (indexWriter == null) {

            if (ring == null || ring.isEmpty()) {
                throw new IllegalStateException("describe_ring returned no token ranges");
            }

            Random r = new Random();
            List<String> endpoints = ring.get(r.nextInt(ring.size())).endpoints;
            if (endpoints == null || endpoints.isEmpty()) {
                throw new IllegalStateException("selected token range has no endpoints");
            }
            String endpoint = endpoints.get(r.nextInt(endpoints.size()));

            indexWriter = new lucandra.IndexWriter(INDEX_NAME,
                    CassandraUtils.createRobustConnection(endpoint, CASSANDRA_PORT, false, false));
            indexWriter.setAutoCommit(false);

            clientPool.set(indexWriter);
            batchCount.set(0);

            // without this registration the shutdown hook has nothing to commit
            allClients.add(indexWriter);
        }

        return indexWriter;
    }

    /**
     * Builds a Lucene document from the article (title, optional text, url),
     * adds it through the current thread's writer, and commits once every
     * {@link #COMMIT_BATCH_SIZE} documents added on this thread.
     *
     * @return the value of {@code article.getSize()}
     * @throws Exception if obtaining the writer, decoding the text, or indexing fails
     */
    @Override
    public Integer call() throws Exception {

        lucandra.IndexWriter indexWriter = getIndexWriter();

        Document d = new Document();

        d.add(new Field("title", article.title, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS));

        // the text is held as raw bytes and may be absent
        if (article.text != null) {
            d.add(new Field("text", new String(article.text, "UTF-8"), Store.YES, Index.ANALYZED,
                    TermVector.WITH_POSITIONS_OFFSETS));
        }

        d.add(new Field("url", article.url, Store.YES, Index.NOT_ANALYZED));

        indexWriter.addDocument(d, analyzer);

        // batchCount was initialised to 0 when this thread's writer was created
        int added = batchCount.get() + 1;
        if (added % COMMIT_BATCH_SIZE == 0) {
            indexWriter.commit();
        }

        batchCount.set(added);

        return article.getSize();
    }

}

Tech Fingerprint

Alerts (6)

'java.util.Random' Security Info: For security-sensitive operations (e.g., session IDs, crypto keys), use java.security.SecureRandom instead of java.util.Random, which is predictable.
23
'catch (Exception' Catching generic 'Exception' can hide specific runtime issues. Catch more specific exception types whenever possible. Ensure caught exceptions are logged or handled appropriately, not just swallowed.
56
'new Thread(' Prefer using ExecutorService or other thread pool mechanisms (java.util.concurrent) for better resource management and control over threads.
63
'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
94
'throws Exception' Declaring 'throws Exception' is too broad. Declare specific checked exceptions that the method might throw, allowing callers to handle them appropriately.
105
'==' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
121