PageRenderTime 46ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/src/edu/stanford/nlp/objectbank/ObjectBank.java

https://github.com/thisandagain/stanford-ner
Java | 382 lines | 173 code | 43 blank | 166 comment | 19 complexity | 696707f40995c192e1f68512a2a70b53 MD5 | raw file
  1. package edu.stanford.nlp.objectbank;
  2. import edu.stanford.nlp.util.AbstractIterator;
  3. import edu.stanford.nlp.util.Function;
  4. import java.util.List;
  5. import java.util.ArrayList;
  6. import java.util.Collection;
  7. import java.util.Collections;
  8. import java.util.Iterator;
  9. import java.util.NoSuchElementException;
  10. import java.io.Reader;
  11. import java.io.IOException;
  12. import java.io.Serializable;
  13. /**
  14. * The ObjectBank class is designed to make it easy to change the format/source
  15. * of data read in by other classes and to standardize how data is read in
  16. * javaNLP classes.
  17. * This should make reuse of existing code (by non-authors of the code)
  18. * easier because one has to just create a new ObjectBank which knows where to
  19. * look for the data and how to turn it into Objects, and then use the new
  20. * ObjectBank in the class. This will also make it easier to reuse code for
  21. * reading in the same data.
  22. * <p/>
  23. * An ObjectBank is a Collection of Objects. These objects are taken
  24. * from input sources and then tokenized and parsed into the desired
  25. * kind of Object. An ObjectBank requires a ReaderIteratorFactory and an
  26. * IteratorFromReaderFactory. The ReaderIteratorFactory is used to get
  27. * an Iterator over java.io.Readers which contain representations of
  28. * the Objects. A ReaderIteratorFactory resembles a collection that
  29. * takes input sources and dispenses Iterators over java.io.Readers
  30. * of those sources. An IteratorFromReaderFactory is used to turn a single
  31. * java.io.Reader into an Iterator over Objects. The
  32. * IteratorFromReaderFactory splits the contents of the java.io.Reader
  33. * into Strings and then parses them into appropriate Objects.
  34. * <p/>
  35. * <h3>Example Usage:</h3>
  36. * <p/>
  37. * You have a collection of files in the directory /u/nlp/data/gre/questions. Each file
  38. * contains several Puzzle documents which look like:
  39. * <pre>
  40. * &lt;puzzle>
  41. * &lt;preamble> some text &lt;/preamble>
  42. * &lt;question> some intro text
  43. * &lt;answer> answer1 &lt;/answer>
  44. * &lt;answer> answer2 &lt;/answer>
  45. * &lt;answer> answer3 &lt;/answer>
  46. * &lt;answer> answer4 &lt;/answer>
  47. * &lt;/question>
  48. * &lt;question> another question
  49. * &lt;answer> answer1 &lt;/answer>
  50. * &lt;answer> answer2 &lt;/answer>
  51. * &lt;answer> answer3 &lt;/answer>
  52. * &lt;answer> answer4 &lt;/answer>
  53. * &lt;/question>
  54. * &lt;/puzzle>
  55. * </pre>
  56. * <p/>
  57. * First you need to build a ReaderIteratorFactory which will provide java.io.Readers
  58. * over all the files in your directory:
  59. * <p/>
  60. * <pre>
  61. * Collection c = new FileSequentialCollection("/u/nlp/data/gre/questions/", "", false);
  62. * ReaderIteratorFactory rif = new ReaderIteratorFactory(c);
  63. * </pre>
  64. * <p/>
  65. * Next you need to make an IteratorFromReaderFactory which will take the
  66. * java.io.Readers vended by the ReaderIteratorFactory, split them up into
  67. * documents (Strings) and
  68. * then convert the Strings into Objects. In this case we want to keep everything
  69. * between each set of &lt;puzzle> &lt;/puzzle> tags so we would use a BeginEndTokenizerFactory.
  70. * You would also need to write a class which implements Function and whose apply method
  71. * converts the String between the &lt;puzzle> &lt;/puzzle> tags into Puzzle objects.
  72. * <p/>
  73. * <pre>
  74. * public class PuzzleParser implements Function {
  75. * public Object apply (Object o) {
  76. * String s = (String)o;
  77. * ...
  78. * Puzzle p = new Puzzle(...);
  79. * ...
  80. * return p;
  81. * }
  82. * }
  83. * </pre>
  84. * <p/>
  85. * Now to build the IteratorFromReaderFactory:
  86. * <p/>
  87. * <pre>
  88. * IteratorFromReaderFactory rtif = new BeginEndTokenizerFactory("&lt;puzzle>", "&lt;/puzzle>", new PuzzleParser());
  89. * </pre>
  90. * <p/>
  91. * Now, to create your ObjectBank you just give it the ReaderIteratorFactory and
  92. * IteratorFromReaderFactory that you just created:
  93. * <p/>
  94. * <pre>
  95. * ObjectBank puzzles = new ObjectBank(rif, rtif);
  96. * </pre>
  97. * <p/>
  98. * Now, if you get a new set of puzzles that are located elsewhere and formatted differently
  99. * you create a new ObjectBank for reading them in and use that ObjectBank instead with only
  100. * trivial changes (or possibly none at all if the ObjectBank is read in on a constructor)
  101. * to your code. Or even better, if someone else wants to use your code to evaluate their puzzles,
  102. * which are located elsewhere and formatted differently, they already know what they have to do
  103. * to make your code work for them.
  104. * <p/>
  105. * TODO: There's still tricky generic stuff to get right here: toArray should
  106. * take an arg of a different generic type if we follow the Collections API,
  107. * and the OBIterator doesn't seem to do the generic typing right. Should it
  108. * rather be F extends E ? [cdm notes, sep 2007]
  109. *
  110. * @author Jenny Finkel <A HREF="mailto:jrfinkel@stanford.edu">jrfinkel@stanford.edu</A>
  111. * @author Sarah Spikes (sdspikes@cs.stanford.edu) - cleanup and filling in types
  112. */
  113. public class ObjectBank<E> implements Collection<E>, Serializable {
  114. /**
  115. * This creates a new ObjectBank with the given ReaderIteratorFactory
  116. * and ObjectIteratorFactory.
  117. *
  118. * @param rif The {@link ReaderIteratorFactory} from which to get Readers
  119. * @param ifrf The {@link IteratorFromReaderFactory} which turns java.io.Readers
  120. * into Iterators of Objects
  121. */
  122. public ObjectBank(ReaderIteratorFactory rif, IteratorFromReaderFactory<E> ifrf) {
  123. this.rif = rif;
  124. this.ifrf = ifrf;
  125. }
  126. protected ReaderIteratorFactory rif;
  127. protected IteratorFromReaderFactory<E> ifrf;
  128. private List<E> contents; // = null;
  129. public static <X> ObjectBank<X> getLineIteratorObjectBank(String fileOrString, Function<String,X> op) {
  130. Collection<String> c = new ArrayList<String>();
  131. c.add(fileOrString);
  132. return getLineIteratorObjectBank(c, op);
  133. }
  134. //TODO: Should the "files" collection be required to hold Files?
  135. public static <X> ObjectBank<X> getLineIteratorObjectBank(Collection files, Function<String,X> op) {
  136. return getLineIteratorObjectBank(files, op, "utf-8");
  137. }
  138. public static <X> ObjectBank<X> getLineIteratorObjectBank(Collection files, Function<String,X> op, String encoding) {
  139. ReaderIteratorFactory rif = new ReaderIteratorFactory(files, encoding);
  140. IteratorFromReaderFactory<X> ifrf = LineIterator.getFactory(op);
  141. return new ObjectBank<X>(rif, ifrf);
  142. }
  143. public static ObjectBank<String> getLineIteratorObjectBank(String filename, String encoding) {
  144. return getLineIteratorObjectBank(Collections.singleton(filename), new IdentityFunction<String>(), encoding);
  145. }
  146. public static ObjectBank<String> getLineIteratorObjectBank(String filename) {
  147. return getLineIteratorObjectBank(filename, "utf-8");
  148. }
  149. public Iterator<E> iterator() {
  150. // basically concatenates Iterator's made from
  151. // each java.io.Reader.
  152. if (keepInMemory) {
  153. if (contents == null) {
  154. contents = new ArrayList<E>();
  155. Iterator<E> iter = new OBIterator();
  156. while(iter.hasNext()) {
  157. contents.add(iter.next());
  158. }
  159. }
  160. return contents.iterator();
  161. }
  162. return new OBIterator();
  163. }
  164. private boolean keepInMemory; // = false;
  165. /**
  166. * Tells the ObjectBank to store all of
  167. * its contents in memory so that it doesn't
  168. * have to be recomputed each time you iterate
  169. * through it. This is useful when the data
  170. * is small enough that it can be kept in
  171. * memory, but reading/processing it
  172. * is expensive/slow. Defaults to false.
  173. */
  174. public void keepInMemory(boolean keep) {
  175. keepInMemory = keep;
  176. }
  177. /**
  178. * If you are keeping the contents in memory,
  179. * this will clear hte memory, and they will be
  180. * recomputed the next time iterator() is
  181. * called.
  182. */
  183. public void clearMemory(){
  184. contents = null;
  185. }
  186. public boolean isEmpty() {
  187. return !iterator().hasNext();
  188. }
  189. /**
  190. * Can be slow. Usage not recommended.
  191. */
  192. public boolean contains(Object o) {
  193. Iterator<E> iter = iterator();
  194. while (iter.hasNext()) {
  195. if (iter.next() == o) {
  196. return true;
  197. }
  198. }
  199. return false;
  200. }
  201. /**
  202. * Can be slow. Usage not recommended.
  203. */
  204. public boolean containsAll(Collection<?> c) {
  205. for (Object obj : c) {
  206. if ( ! contains(obj)) {
  207. return false;
  208. }
  209. }
  210. return true;
  211. }
  212. /**
  213. * Can be slow. Usage not recommended.
  214. */
  215. public int size() {
  216. Iterator<E> iter = iterator();
  217. int size = 0;
  218. while (iter.hasNext()) {
  219. size++;
  220. iter.next();
  221. }
  222. return size;
  223. }
  224. public void clear() {
  225. rif = new ReaderIteratorFactory();
  226. }
  227. /**
  228. * Can be slow. Usage not recommended.
  229. */
  230. public Object[] toArray() {
  231. Iterator<E> iter = iterator();
  232. ArrayList<Object> al = new ArrayList<Object>();
  233. while (iter.hasNext()) {
  234. al.add(iter.next());
  235. }
  236. return al.toArray();
  237. }
  238. /**
  239. * Can be slow. Usage not recommended.
  240. */
  241. //TODO: Not sure if this is right. It used to have <E>, but that was shadowing the
  242. // class's parametrized type...
  243. public <T> T[] toArray(T[] o) {
  244. Iterator<E> iter = iterator();
  245. ArrayList<E> al = new ArrayList<E>();
  246. while (iter.hasNext()) {
  247. al.add(iter.next());
  248. }
  249. return al.toArray(o);
  250. }
  251. /**
  252. * Unsupported Operation. If you wish to add a new data source,
  253. * do so in the underlying ReaderIteratorFactory
  254. */
  255. public boolean add(E o) {
  256. throw new UnsupportedOperationException();
  257. }
  258. /**
  259. * Unsupported Operation. If you wish to remove a data source,
  260. * do so in the underlying ReaderIteratorFactory
  261. */
  262. public boolean remove(Object o) {
  263. throw new UnsupportedOperationException();
  264. }
  265. /**
  266. * Unsupported Operation. If you wish to add new data sources,
  267. * do so in the underlying ReaderIteratorFactory
  268. */
  269. public boolean addAll(Collection<? extends E> c) {
  270. throw new UnsupportedOperationException();
  271. }
  272. /**
  273. * Unsupported Operation. If you wish to remove data sources,
  274. * remove, do so in the underlying ReaderIteratorFactory
  275. */
  276. public boolean removeAll(Collection<?> c) {
  277. throw new UnsupportedOperationException();
  278. }
  279. /**
  280. * Unsupported Operation. If you wish to retain only certian data
  281. * sources, do so in the underlying ReaderIteratorFactory
  282. */
  283. public boolean retainAll(Collection<?> c) {
  284. throw new UnsupportedOperationException();
  285. }
  286. /**
  287. * Iterator of Objects
  288. */
  289. class OBIterator extends AbstractIterator<E> {
  290. Iterator<Reader> readerIterator;
  291. Iterator<E> tok;
  292. E nextObject;
  293. Reader currReader = null;
  294. public OBIterator() {
  295. readerIterator = rif.iterator();
  296. currReader = readerIterator.next();
  297. tok = ifrf.getIterator(currReader);
  298. setNextObject();
  299. }
  300. private void setNextObject() {
  301. if (tok.hasNext()) {
  302. nextObject = tok.next();
  303. return;
  304. }
  305. while (true) {
  306. if (readerIterator.hasNext()) {
  307. try {
  308. currReader.close();
  309. } catch (IOException e) {
  310. throw new RuntimeException(e);
  311. }
  312. currReader = readerIterator.next();
  313. tok = ifrf.getIterator(currReader);
  314. } else {
  315. nextObject = null;
  316. return;
  317. }
  318. if (tok.hasNext()) {
  319. nextObject = tok.next();
  320. return;
  321. }
  322. }
  323. }
  324. @Override
  325. public boolean hasNext() {
  326. return nextObject != null;
  327. }
  328. @Override
  329. public E next() {
  330. if (nextObject == null) {
  331. throw new NoSuchElementException();
  332. }
  333. E tmp = nextObject;
  334. setNextObject();
  335. return tmp;
  336. }
  337. }
  338. private static final long serialVersionUID = -4030295596701541770L;
  339. }