PageRenderTime 73ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 1ms

/projects/itext-5.0.3/core/com/itextpdf/text/pdf/hyphenation/HyphenationTree.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 456 lines | 250 code | 36 blank | 170 comment | 71 complexity | c20ed233ee95d8e58c596314bde9c4b1 MD5 | raw file
  1. /*
  2. * Copyright 1999-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /* $Id: HyphenationTree.java 4242 2010-01-02 23:22:20Z xlv $ */
  17. package com.itextpdf.text.pdf.hyphenation;
  18. import java.io.InputStream;
  19. import java.util.ArrayList;
  20. import java.util.HashMap;
  21. /**
  22. * This tree structure stores the hyphenation patterns in an efficient
  23. * way for fast lookup. It also provides the method to
  24. * hyphenate a word.
  25. *
  26. * @author Carlos Villegas <cav@uniscope.co.jp>
  27. */
  28. public class HyphenationTree extends TernaryTree
  29. implements PatternConsumer {
  30. private static final long serialVersionUID = -7763254239309429432L;
  31. /**
  32. * value space: stores the interletter values
  33. */
  34. protected ByteVector vspace;
  35. /**
  36. * This map stores hyphenation exceptions
  37. */
  38. protected HashMap<String, ArrayList<Object>> stoplist;
  39. /**
  40. * This map stores the character classes
  41. */
  42. protected TernaryTree classmap;
  43. /**
  44. * Temporary map to store interletter values on pattern loading.
  45. */
  46. private transient TernaryTree ivalues;
  /**
   * Creates an empty hyphenation tree: no patterns, no character classes
   * and no hyphenation exceptions are loaded yet.
   */
  public HyphenationTree() {
    vspace = new ByteVector();
    vspace.alloc(1); // reserve index 0, which is never used for packed values
    classmap = new TernaryTree();
    // the exception table is usually small
    stoplist = new HashMap<String, ArrayList<Object>>(23);
  }
  53. /**
  54. * Packs the values by storing them in 4 bits, two values into a byte
  55. * Values range is from 0 to 9. We use zero as terminator,
  56. * so we'll add 1 to the value.
  57. * @param values a string of digits from '0' to '9' representing the
  58. * interletter values.
  59. * @return the index into the vspace array where the packed values
  60. * are stored.
  61. */
  62. protected int packValues(String values) {
  63. int i, n = values.length();
  64. int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
  65. int offset = vspace.alloc(m);
  66. byte[] va = vspace.getArray();
  67. for (i = 0; i < n; i++) {
  68. int j = i >> 1;
  69. byte v = (byte)(values.charAt(i) - '0' + 1 & 0x0f);
  70. if ((i & 1) == 1) {
  71. va[j + offset] = (byte)(va[j + offset] | v);
  72. } else {
  73. va[j + offset] = (byte)(v << 4); // big endian
  74. }
  75. }
  76. va[m - 1 + offset] = 0; // terminator
  77. return offset;
  78. }
  79. protected String unpackValues(int k) {
  80. StringBuffer buf = new StringBuffer();
  81. byte v = vspace.get(k++);
  82. while (v != 0) {
  83. char c = (char)((v >>> 4) - 1 + '0');
  84. buf.append(c);
  85. c = (char)(v & 0x0f);
  86. if (c == 0) {
  87. break;
  88. }
  89. c = (char)(c - 1 + '0');
  90. buf.append(c);
  91. v = vspace.get(k++);
  92. }
  93. return buf.toString();
  94. }
  95. public void loadSimplePatterns(InputStream stream) {
  96. SimplePatternParser pp = new SimplePatternParser();
  97. ivalues = new TernaryTree();
  98. pp.parse(stream, this);
  99. // patterns/values should be now in the tree
  100. // let's optimize a bit
  101. trimToSize();
  102. vspace.trimToSize();
  103. classmap.trimToSize();
  104. // get rid of the auxiliary map
  105. ivalues = null;
  106. }
  107. public String findPattern(String pat) {
  108. int k = super.find(pat);
  109. if (k >= 0) {
  110. return unpackValues(k);
  111. }
  112. return "";
  113. }
  114. /**
  115. * String compare, returns 0 if equal or
  116. * t is a substring of s
  117. */
  118. protected int hstrcmp(char[] s, int si, char[] t, int ti) {
  119. for (; s[si] == t[ti]; si++, ti++) {
  120. if (s[si] == 0) {
  121. return 0;
  122. }
  123. }
  124. if (t[ti] == 0) {
  125. return 0;
  126. }
  127. return s[si] - t[ti];
  128. }
  129. protected byte[] getValues(int k) {
  130. StringBuffer buf = new StringBuffer();
  131. byte v = vspace.get(k++);
  132. while (v != 0) {
  133. char c = (char)((v >>> 4) - 1);
  134. buf.append(c);
  135. c = (char)(v & 0x0f);
  136. if (c == 0) {
  137. break;
  138. }
  139. c = (char)(c - 1);
  140. buf.append(c);
  141. v = vspace.get(k++);
  142. }
  143. byte[] res = new byte[buf.length()];
  144. for (int i = 0; i < res.length; i++) {
  145. res[i] = (byte)buf.charAt(i);
  146. }
  147. return res;
  148. }
  149. /**
  150. * <p>Search for all possible partial matches of word starting
  151. * at index an update interletter values. In other words, it
  152. * does something like:</p>
  153. * <code>
  154. * for(i=0; i<patterns.length; i++) {
  155. * if ( word.substring(index).startsWidth(patterns[i]) )
  156. * update_interletter_values(patterns[i]);
  157. * }
  158. * </code>
  159. * <p>But it is done in an efficient way since the patterns are
  160. * stored in a ternary tree. In fact, this is the whole purpose
  161. * of having the tree: doing this search without having to test
  162. * every single pattern. The number of patterns for languages
  163. * such as English range from 4000 to 10000. Thus, doing thousands
  164. * of string comparisons for each word to hyphenate would be
  165. * really slow without the tree. The tradeoff is memory, but
  166. * using a ternary tree instead of a trie, almost halves the
  167. * the memory used by Lout or TeX. It's also faster than using
  168. * a hash table</p>
  169. * @param word null terminated word to match
  170. * @param index start index from word
  171. * @param il interletter values array to update
  172. */
  173. protected void searchPatterns(char[] word, int index, byte[] il) {
  174. byte[] values;
  175. int i = index;
  176. char p, q;
  177. char sp = word[i];
  178. p = root;
  179. while (p > 0 && p < sc.length) {
  180. if (sc[p] == 0xFFFF) {
  181. if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
  182. values = getValues(eq[p]); // data pointer is in eq[]
  183. int j = index;
  184. for (byte value : values) {
  185. if (j < il.length && value > il[j]) {
  186. il[j] = value;
  187. }
  188. j++;
  189. }
  190. }
  191. return;
  192. }
  193. int d = sp - sc[p];
  194. if (d == 0) {
  195. if (sp == 0) {
  196. break;
  197. }
  198. sp = word[++i];
  199. p = eq[p];
  200. q = p;
  201. // look for a pattern ending at this position by searching for
  202. // the null char ( splitchar == 0 )
  203. while (q > 0 && q < sc.length) {
  204. if (sc[q] == 0xFFFF) { // stop at compressed branch
  205. break;
  206. }
  207. if (sc[q] == 0) {
  208. values = getValues(eq[q]);
  209. int j = index;
  210. for (byte value : values) {
  211. if (j < il.length && value > il[j]) {
  212. il[j] = value;
  213. }
  214. j++;
  215. }
  216. break;
  217. } else {
  218. q = lo[q];
  219. /**
  220. * actually the code should be:
  221. * q = sc[q] < 0 ? hi[q] : lo[q];
  222. * but java chars are unsigned
  223. */
  224. }
  225. }
  226. } else {
  227. p = d < 0 ? lo[p] : hi[p];
  228. }
  229. }
  230. }
  231. /**
  232. * Hyphenate word and return a Hyphenation object.
  233. * @param word the word to be hyphenated
  234. * @param remainCharCount Minimum number of characters allowed
  235. * before the hyphenation point.
  236. * @param pushCharCount Minimum number of characters allowed after
  237. * the hyphenation point.
  238. * @return a {@link Hyphenation Hyphenation} object representing
  239. * the hyphenated word or null if word is not hyphenated.
  240. */
  241. public Hyphenation hyphenate(String word, int remainCharCount,
  242. int pushCharCount) {
  243. char[] w = word.toCharArray();
  244. return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
  245. }
  246. /**
  247. * w = "****nnllllllnnn*****",
  248. * where n is a non-letter, l is a letter,
  249. * all n may be absent, the first n is at offset,
  250. * the first l is at offset + iIgnoreAtBeginning;
  251. * word = ".llllll.'\0'***",
  252. * where all l in w are copied into word.
  253. * In the first part of the routine len = w.length,
  254. * in the second part of the routine len = word.length.
  255. * Three indices are used:
  256. * index(w), the index in w,
  257. * index(word), the index in word,
  258. * letterindex(word), the index in the letter part of word.
  259. * The following relations exist:
  260. * index(w) = offset + i - 1
  261. * index(word) = i - iIgnoreAtBeginning
  262. * letterindex(word) = index(word) - 1
  263. * (see first loop).
  264. * It follows that:
  265. * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
  266. * index(w) = letterindex(word) + offset + iIgnoreAtBeginning
  267. */
  268. /**
  269. * Hyphenate word and return an array of hyphenation points.
  270. * @param w char array that contains the word
  271. * @param offset Offset to first character in word
  272. * @param len Length of word
  273. * @param remainCharCount Minimum number of characters allowed
  274. * before the hyphenation point.
  275. * @param pushCharCount Minimum number of characters allowed after
  276. * the hyphenation point.
  277. * @return a {@link Hyphenation Hyphenation} object representing
  278. * the hyphenated word or null if word is not hyphenated.
  279. */
  280. public Hyphenation hyphenate(char[] w, int offset, int len,
  281. int remainCharCount, int pushCharCount) {
  282. int i;
  283. char[] word = new char[len + 3];
  284. // normalize word
  285. char[] c = new char[2];
  286. int iIgnoreAtBeginning = 0;
  287. int iLength = len;
  288. boolean bEndOfLetters = false;
  289. for (i = 1; i <= len; i++) {
  290. c[0] = w[offset + i - 1];
  291. int nc = classmap.find(c, 0);
  292. if (nc < 0) { // found a non-letter character ...
  293. if (i == 1 + iIgnoreAtBeginning) {
  294. // ... before any letter character
  295. iIgnoreAtBeginning ++;
  296. } else {
  297. // ... after a letter character
  298. bEndOfLetters = true;
  299. }
  300. iLength --;
  301. } else {
  302. if (!bEndOfLetters) {
  303. word[i - iIgnoreAtBeginning] = (char)nc;
  304. } else {
  305. return null;
  306. }
  307. }
  308. }
  309. len = iLength;
  310. if (len < remainCharCount + pushCharCount) {
  311. // word is too short to be hyphenated
  312. return null;
  313. }
  314. int[] result = new int[len + 1];
  315. int k = 0;
  316. // check exception list first
  317. String sw = new String(word, 1, len);
  318. if (stoplist.containsKey(sw)) {
  319. // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
  320. ArrayList<Object> hw = stoplist.get(sw);
  321. int j = 0;
  322. for (i = 0; i < hw.size(); i++) {
  323. Object o = hw.get(i);
  324. // j = index(sw) = letterindex(word)?
  325. // result[k] = corresponding index(w)
  326. if (o instanceof String) {
  327. j += ((String)o).length();
  328. if (j >= remainCharCount && j < len - pushCharCount) {
  329. result[k++] = j + iIgnoreAtBeginning;
  330. }
  331. }
  332. }
  333. } else {
  334. // use algorithm to get hyphenation points
  335. word[0] = '.'; // word start marker
  336. word[len + 1] = '.'; // word end marker
  337. word[len + 2] = 0; // null terminated
  338. byte[] il = new byte[len + 3]; // initialized to zero
  339. for (i = 0; i < len + 1; i++) {
  340. searchPatterns(word, i, il);
  341. }
  342. // hyphenation points are located where interletter value is odd
  343. // i is letterindex(word),
  344. // i + 1 is index(word),
  345. // result[k] = corresponding index(w)
  346. for (i = 0; i < len; i++) {
  347. if ((il[i + 1] & 1) == 1 && i >= remainCharCount
  348. && i <= len - pushCharCount) {
  349. result[k++] = i + iIgnoreAtBeginning;
  350. }
  351. }
  352. }
  353. if (k > 0) {
  354. // trim result array
  355. int[] res = new int[k];
  356. System.arraycopy(result, 0, res, 0, k);
  357. return new Hyphenation(new String(w, offset, len), res);
  358. } else {
  359. return null;
  360. }
  361. }
  362. /**
  363. * Add a character class to the tree. It is used by
  364. * {@link SimplePatternParser SimplePatternParser} as callback to
  365. * add character classes. Character classes define the
  366. * valid word characters for hyphenation. If a word contains
  367. * a character not defined in any of the classes, it is not hyphenated.
  368. * It also defines a way to normalize the characters in order
  369. * to compare them with the stored patterns. Usually pattern
  370. * files use only lower case characters, in this case a class
  371. * for letter 'a', for example, should be defined as "aA", the first
  372. * character being the normalization char.
  373. */
  374. public void addClass(String chargroup) {
  375. if (chargroup.length() > 0) {
  376. char equivChar = chargroup.charAt(0);
  377. char[] key = new char[2];
  378. key[1] = 0;
  379. for (int i = 0; i < chargroup.length(); i++) {
  380. key[0] = chargroup.charAt(i);
  381. classmap.insert(key, 0, equivChar);
  382. }
  383. }
  384. }
  385. /**
  386. * Add an exception to the tree. It is used by
  387. * {@link SimplePatternParser SimplePatternParser} class as callback to
  388. * store the hyphenation exceptions.
  389. * @param word normalized word
  390. * @param hyphenatedword a vector of alternating strings and
  391. * {@link Hyphen hyphen} objects.
  392. */
  393. public void addException(String word, ArrayList<Object> hyphenatedword) {
  394. stoplist.put(word, hyphenatedword);
  395. }
  396. /**
  397. * Add a pattern to the tree. Mainly, to be used by
  398. * {@link SimplePatternParser SimplePatternParser} class as callback to
  399. * add a pattern to the tree.
  400. * @param pattern the hyphenation pattern
  401. * @param ivalue interletter weight values indicating the
  402. * desirability and priority of hyphenating at a given point
  403. * within the pattern. It should contain only digit characters.
  404. * (i.e. '0' to '9').
  405. */
  406. public void addPattern(String pattern, String ivalue) {
  407. int k = ivalues.find(ivalue);
  408. if (k <= 0) {
  409. k = packValues(ivalue);
  410. ivalues.insert(ivalue, (char)k);
  411. }
  412. insert(pattern, (char)k);
  413. }
  414. @Override
  415. public void printStats() {
  416. System.out.println("Value space size = "
  417. + Integer.toString(vspace.length()));
  418. super.printStats();
  419. }
  420. }