/src/edu/stanford/nlp/objectbank/ObjectBank.java
Java | 382 lines | 173 code | 43 blank | 166 comment | 19 complexity | 696707f40995c192e1f68512a2a70b53 MD5 | raw file
- package edu.stanford.nlp.objectbank;
- import edu.stanford.nlp.util.AbstractIterator;
- import edu.stanford.nlp.util.Function;
- import java.util.List;
- import java.util.ArrayList;
- import java.util.Collection;
- import java.util.Collections;
- import java.util.Iterator;
- import java.util.NoSuchElementException;
- import java.io.Reader;
- import java.io.IOException;
- import java.io.Serializable;
- /**
- * The ObjectBank class is designed to make it easy to change the format/source
- * of data read in by other classes and to standardize how data is read in
- * javaNLP classes.
- * This should make reuse of existing code (by non-authors of the code)
- * easier because one has to just create a new ObjectBank which knows where to
- * look for the data and how to turn it into Objects, and then use the new
- * ObjectBank in the class. This will also make it easier to reuse code for
- * reading in the same data.
- * <p/>
- * An ObjectBank is a Collection of Objects. These objects are taken
- * from input sources and then tokenized and parsed into the desired
- * kind of Object. An ObjectBank requires a ReaderIteratorFactory and an
- * IteratorFromReaderFactory. The ReaderIteratorFactory is used to get
- * an Iterator over java.io.Readers which contain representations of
- * the Objects. A ReaderIteratorFactory resembles a collection that
- * takes input sources and dispenses Iterators over java.io.Readers
- * of those sources. An IteratorFromReaderFactory is used to turn a single
- * java.io.Reader into an Iterator over Objects. The
- * IteratorFromReaderFactory splits the contents of the java.io.Reader
- * into Strings and then parses them into appropriate Objects.
- * <p/>
- * <h3>Example Usage:</h3>
- * <p/>
- * You have a collection of files in the directory /u/nlp/data/gre/questions. Each file
- * contains several Puzzle documents which look like:
- * <pre>
- * &lt;puzzle&gt;
- *   &lt;preamble&gt; some text &lt;/preamble&gt;
- *   &lt;question&gt; some intro text
- *     &lt;answer&gt; answer1 &lt;/answer&gt;
- *     &lt;answer&gt; answer2 &lt;/answer&gt;
- *     &lt;answer&gt; answer3 &lt;/answer&gt;
- *     &lt;answer&gt; answer4 &lt;/answer&gt;
- *   &lt;/question&gt;
- *   &lt;question&gt; another question
- *     &lt;answer&gt; answer1 &lt;/answer&gt;
- *     &lt;answer&gt; answer2 &lt;/answer&gt;
- *     &lt;answer&gt; answer3 &lt;/answer&gt;
- *     &lt;answer&gt; answer4 &lt;/answer&gt;
- *   &lt;/question&gt;
- * &lt;/puzzle&gt;
- * </pre>
- * <p/>
- * First you need to build a ReaderIteratorFactory which will provide java.io.Readers
- * over all the files in your directory:
- * <p/>
- * <pre>
- * Collection c = new FileSequentialCollection("/u/nlp/data/gre/questions/", "", false);
- * ReaderIteratorFactory rif = new ReaderIteratorFactory(c);
- * </pre>
- * <p/>
- * Next you need to make an IteratorFromReaderFactory which will take the
- * java.io.Readers vended by the ReaderIteratorFactory, split them up into
- * documents (Strings) and
- * then convert the Strings into Objects. In this case we want to keep everything
- * between each set of {@code <puzzle>} {@code </puzzle>} tags so we would use a BeginEndTokenizerFactory.
- * You would also need to write a class which implements Function and whose apply method
- * converts the String between the {@code <puzzle>} {@code </puzzle>} tags into Puzzle objects.
- * <p/>
- * <pre>
- * public class PuzzleParser implements Function {
- * public Object apply (Object o) {
- * String s = (String)o;
- * ...
- * Puzzle p = new Puzzle(...);
- * ...
- * return p;
- * }
- * }
- * </pre>
- * <p/>
- * Now to build the IteratorFromReaderFactory:
- * <p/>
- * <pre>
- * IteratorFromReaderFactory rtif = new BeginEndTokenizerFactory("&lt;puzzle&gt;", "&lt;/puzzle&gt;", new PuzzleParser());
- * </pre>
- * <p/>
- * Now, to create your ObjectBank you just give it the ReaderIteratorFactory and
- * IteratorFromReaderFactory that you just created:
- * <p/>
- * <pre>
- * ObjectBank puzzles = new ObjectBank(rif, rtif);
- * </pre>
- * <p/>
- * Now, if you get a new set of puzzles that are located elsewhere and formatted differently
- * you create a new ObjectBank for reading them in and use that ObjectBank instead with only
- * trivial changes (or possibly none at all if the ObjectBank is read in on a constructor)
- * to your code. Or even better, if someone else wants to use your code to evaluate their puzzles,
- * which are located elsewhere and formatted differently, they already know what they have to do
- * to make your code work for them.
- * <p/>
- * TODO: There's still tricky generic stuff to get right here: toArray should
- * take an arg of a different generic type if we follow the Collections API,
- * and the OBIterator doesn't seem to do the generic typing right. Should it
- * rather be F extends E ? [cdm notes, sep 2007]
- *
- * @author Jenny Finkel <A HREF="mailto:jrfinkel@stanford.edu">jrfinkel@stanford.edu</A>
- * @author Sarah Spikes (sdspikes@cs.stanford.edu) - cleanup and filling in types
- */
- public class ObjectBank<E> implements Collection<E>, Serializable {
- /**
- * This creates a new ObjectBank with the given ReaderIteratorFactory
- * and ObjectIteratorFactory.
- *
- * @param rif The {@link ReaderIteratorFactory} from which to get Readers
- * @param ifrf The {@link IteratorFromReaderFactory} which turns java.io.Readers
- * into Iterators of Objects
- */
- public ObjectBank(ReaderIteratorFactory rif, IteratorFromReaderFactory<E> ifrf) {
- this.rif = rif;
- this.ifrf = ifrf;
- }
- protected ReaderIteratorFactory rif;
- protected IteratorFromReaderFactory<E> ifrf;
- private List<E> contents; // = null;
- public static <X> ObjectBank<X> getLineIteratorObjectBank(String fileOrString, Function<String,X> op) {
- Collection<String> c = new ArrayList<String>();
- c.add(fileOrString);
- return getLineIteratorObjectBank(c, op);
- }
-
- //TODO: Should the "files" collection be required to hold Files?
- public static <X> ObjectBank<X> getLineIteratorObjectBank(Collection files, Function<String,X> op) {
- return getLineIteratorObjectBank(files, op, "utf-8");
- }
-
- public static <X> ObjectBank<X> getLineIteratorObjectBank(Collection files, Function<String,X> op, String encoding) {
- ReaderIteratorFactory rif = new ReaderIteratorFactory(files, encoding);
- IteratorFromReaderFactory<X> ifrf = LineIterator.getFactory(op);
- return new ObjectBank<X>(rif, ifrf);
- }
-
- public static ObjectBank<String> getLineIteratorObjectBank(String filename, String encoding) {
- return getLineIteratorObjectBank(Collections.singleton(filename), new IdentityFunction<String>(), encoding);
- }
-
- public static ObjectBank<String> getLineIteratorObjectBank(String filename) {
- return getLineIteratorObjectBank(filename, "utf-8");
- }
-
- public Iterator<E> iterator() {
- // basically concatenates Iterator's made from
- // each java.io.Reader.
- if (keepInMemory) {
- if (contents == null) {
- contents = new ArrayList<E>();
- Iterator<E> iter = new OBIterator();
- while(iter.hasNext()) {
- contents.add(iter.next());
- }
- }
- return contents.iterator();
- }
- return new OBIterator();
- }
- private boolean keepInMemory; // = false;
- /**
- * Tells the ObjectBank to store all of
- * its contents in memory so that it doesn't
- * have to be recomputed each time you iterate
- * through it. This is useful when the data
- * is small enough that it can be kept in
- * memory, but reading/processing it
- * is expensive/slow. Defaults to false.
- */
- public void keepInMemory(boolean keep) {
- keepInMemory = keep;
- }
- /**
- * If you are keeping the contents in memory,
- * this will clear hte memory, and they will be
- * recomputed the next time iterator() is
- * called.
- */
- public void clearMemory(){
- contents = null;
- }
- public boolean isEmpty() {
- return !iterator().hasNext();
- }
- /**
- * Can be slow. Usage not recommended.
- */
- public boolean contains(Object o) {
- Iterator<E> iter = iterator();
- while (iter.hasNext()) {
- if (iter.next() == o) {
- return true;
- }
- }
- return false;
- }
- /**
- * Can be slow. Usage not recommended.
- */
- public boolean containsAll(Collection<?> c) {
- for (Object obj : c) {
- if ( ! contains(obj)) {
- return false;
- }
- }
- return true;
- }
- /**
- * Can be slow. Usage not recommended.
- */
- public int size() {
- Iterator<E> iter = iterator();
- int size = 0;
- while (iter.hasNext()) {
- size++;
- iter.next();
- }
- return size;
- }
- public void clear() {
- rif = new ReaderIteratorFactory();
- }
- /**
- * Can be slow. Usage not recommended.
- */
- public Object[] toArray() {
- Iterator<E> iter = iterator();
- ArrayList<Object> al = new ArrayList<Object>();
- while (iter.hasNext()) {
- al.add(iter.next());
- }
- return al.toArray();
- }
- /**
- * Can be slow. Usage not recommended.
- */
- //TODO: Not sure if this is right. It used to have <E>, but that was shadowing the
- // class's parametrized type...
- public <T> T[] toArray(T[] o) {
- Iterator<E> iter = iterator();
- ArrayList<E> al = new ArrayList<E>();
- while (iter.hasNext()) {
- al.add(iter.next());
- }
- return al.toArray(o);
- }
- /**
- * Unsupported Operation. If you wish to add a new data source,
- * do so in the underlying ReaderIteratorFactory
- */
- public boolean add(E o) {
- throw new UnsupportedOperationException();
- }
- /**
- * Unsupported Operation. If you wish to remove a data source,
- * do so in the underlying ReaderIteratorFactory
- */
- public boolean remove(Object o) {
- throw new UnsupportedOperationException();
- }
- /**
- * Unsupported Operation. If you wish to add new data sources,
- * do so in the underlying ReaderIteratorFactory
- */
- public boolean addAll(Collection<? extends E> c) {
- throw new UnsupportedOperationException();
- }
- /**
- * Unsupported Operation. If you wish to remove data sources,
- * remove, do so in the underlying ReaderIteratorFactory
- */
- public boolean removeAll(Collection<?> c) {
- throw new UnsupportedOperationException();
- }
- /**
- * Unsupported Operation. If you wish to retain only certian data
- * sources, do so in the underlying ReaderIteratorFactory
- */
- public boolean retainAll(Collection<?> c) {
- throw new UnsupportedOperationException();
- }
- /**
- * Iterator of Objects
- */
- class OBIterator extends AbstractIterator<E> {
- Iterator<Reader> readerIterator;
- Iterator<E> tok;
- E nextObject;
- Reader currReader = null;
- public OBIterator() {
- readerIterator = rif.iterator();
- currReader = readerIterator.next();
- tok = ifrf.getIterator(currReader);
- setNextObject();
- }
- private void setNextObject() {
- if (tok.hasNext()) {
- nextObject = tok.next();
- return;
- }
- while (true) {
- if (readerIterator.hasNext()) {
- try {
- currReader.close();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- currReader = readerIterator.next();
- tok = ifrf.getIterator(currReader);
- } else {
- nextObject = null;
- return;
- }
- if (tok.hasNext()) {
- nextObject = tok.next();
- return;
- }
- }
- }
- @Override
- public boolean hasNext() {
- return nextObject != null;
- }
- @Override
- public E next() {
- if (nextObject == null) {
- throw new NoSuchElementException();
- }
- E tmp = nextObject;
- setNextObject();
- return tmp;
- }
- }
- private static final long serialVersionUID = -4030295596701541770L;
- }