/ZipPla/iTextSharp/iTextSharp/text/pdf/hyphenation/TernaryTree.cs
C# | 631 lines | 387 code | 75 blank | 169 comment | 89 complexity | 680d22f2ccfc5cffed063cfeb9a429a2 MD5 | raw file
- using System;
- using System.Collections;
- using System.Text;
- /*
- * $Id$
- * Copyright (C) 2001 The Apache Software Foundation. All rights reserved.
- * For details on use and redistribution please refer to the
- * LICENSE file included with these sources.
- */
- namespace iTextSharp.text.pdf.hyphenation {
- /**
- * <h2>Ternary Search Tree</h2>
- *
- * <p>A ternary search tree is a hibrid between a binary tree and
- * a digital search tree (trie). Keys are limited to strings.
- * A data value of type char is stored in each leaf node.
- * It can be used as an index (or pointer) to the data.
- * Branches that only contain one key are compressed to one node
- * by storing a pointer to the trailer substring of the key.
- * This class is intended to serve as base class or helper class
- * to implement Dictionary collections or the like. Ternary trees
- * have some nice properties as the following: the tree can be
- * traversed in sorted order, partial matches (wildcard) can be
- * implemented, retrieval of all keys within a given distance
- * from the target, etc. The storage requirements are higher than
- * a binary tree but a lot less than a trie. Performance is
- * comparable with a hash table, sometimes it outperforms a hash
- * function (most of the time can determine a miss faster than a hash).</p>
- *
- * <p>The main purpose of this java port is to serve as a base for
- * implementing TeX's hyphenation algorithm (see The TeXBook,
- * appendix H). Each language requires from 5000 to 15000 hyphenation
- * patterns which will be keys in this tree. The strings patterns
- * are usually small (from 2 to 5 characters), but each char in the
- * tree is stored in a node. Thus memory usage is the main concern.
- * We will sacrify 'elegance' to keep memory requirenments to the
- * minimum. Using java's char type as pointer (yes, I know pointer
- * it is a forbidden word in java) we can keep the size of the node
- * to be just 8 bytes (3 pointers and the data char). This gives
- * room for about 65000 nodes. In my tests the english patterns
- * took 7694 nodes and the german patterns 10055 nodes,
- * so I think we are safe.</p>
- *
- * <p>All said, this is a map with strings as keys and char as value.
- * Pretty limited!. It can be extended to a general map by
- * using the string representation of an object and using the
- * char value as an index to an array that contains the object
- * values.</p>
- *
- * @author cav@uniscope.co.jp
- */
- public class TernaryTree : ICloneable {
- /**
- * We use 4 arrays to represent a node. I guess I should have created
- * a proper node class, but somehow Knuth's pascal code made me forget
- * we now have a portable language with memory management and
- * automatic garbage collection! And now is kind of late, furthermore,
- * if it ain't broken, don't fix it.
- */
- /**
- * Pointer to low branch and to rest of the key when it is
- * stored directly in this node, we don't have unions in java!
- */
- protected char[] lo;
- /**
- * Pointer to high branch.
- */
- protected char[] hi;
- /**
- * Pointer to equal branch and to data when this node is a string terminator.
- */
- protected char[] eq;
- /**
- * <P>The character stored in this node: splitchar
- * Two special values are reserved:</P>
- * <ul><li>0x0000 as string terminator</li>
- * <li>0xFFFF to indicate that the branch starting at
- * this node is compressed</li></ul>
- * <p>This shouldn't be a problem if we give the usual semantics to
- * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
- */
- protected char[] sc;
- /**
- * This vector holds the trailing of the keys when the branch is compressed.
- */
- protected CharVector kv;
- protected char root;
- protected char freenode;
- protected int length; // number of items in tree
- protected static int BLOCK_SIZE = 2048; // allocation size for arrays
- internal TernaryTree() {
- Init();
- }
- virtual protected void Init() {
- root = (char)0;
- freenode = (char)1;
- length = 0;
- lo = new char[BLOCK_SIZE];
- hi = new char[BLOCK_SIZE];
- eq = new char[BLOCK_SIZE];
- sc = new char[BLOCK_SIZE];
- kv = new CharVector();
- }
- /**
- * Branches are initially compressed, needing
- * one node per key plus the size of the string
- * key. They are decompressed as needed when
- * another key with same prefix
- * is inserted. This saves a lot of space,
- * specially for long keys.
- */
- virtual public void Insert(string key, char val) {
- // make sure we have enough room in the arrays
- int len = key.Length
- + 1; // maximum number of nodes that may be generated
- if (freenode + len > eq.Length)
- RedimNodeArrays(eq.Length + BLOCK_SIZE);
- char[] strkey = new char[len--];
- key.CopyTo(0, strkey, 0, len);
- strkey[len] = (char)0;
- root = Insert(root, strkey, 0, val);
- }
- virtual public void Insert(char[] key, int start, char val) {
- int len = Strlen(key) + 1;
- if (freenode + len > eq.Length)
- RedimNodeArrays(eq.Length + BLOCK_SIZE);
- root = Insert(root, key, start, val);
- }
- /**
- * The actual insertion function, recursive version.
- */
- private char Insert(char p, char[] key, int start, char val) {
- int len = Strlen(key, start);
- if (p == 0) {
- // this means there is no branch, this node will start a new branch.
- // Instead of doing that, we store the key somewhere else and create
- // only one node with a pointer to the key
- p = freenode++;
- eq[p] = val; // holds data
- length++;
- hi[p] = (char)0;
- if (len > 0) {
- sc[p] = (char)0xFFFF; // indicates branch is compressed
- lo[p] = (char)kv.Alloc(len
- + 1); // use 'lo' to hold pointer to key
- Strcpy(kv.Arr, lo[p], key, start);
- } else {
- sc[p] = (char)0;
- lo[p] = (char)0;
- }
- return p;
- }
- if (sc[p] == 0xFFFF) {
- // branch is compressed: need to decompress
- // this will generate garbage in the external key array
- // but we can do some garbage collection later
- char pp = freenode++;
- lo[pp] = lo[p]; // previous pointer to key
- eq[pp] = eq[p]; // previous pointer to data
- lo[p] = (char)0;
- if (len > 0) {
- sc[p] = kv[lo[pp]];
- eq[p] = pp;
- lo[pp]++;
- if (kv[lo[pp]] == 0) {
- // key completly decompressed leaving garbage in key array
- lo[pp] = (char)0;
- sc[pp] = (char)0;
- hi[pp] = (char)0;
- } else
- sc[pp] =
- (char)0xFFFF; // we only got first char of key, rest is still there
- } else {
- // In this case we can save a node by swapping the new node
- // with the compressed node
- sc[pp] = (char)0xFFFF;
- hi[p] = pp;
- sc[p] = (char)0;
- eq[p] = val;
- length++;
- return p;
- }
- }
- char s = key[start];
- if (s < sc[p])
- lo[p] = Insert(lo[p], key, start, val);
- else if (s == sc[p]) {
- if (s != 0)
- eq[p] = Insert(eq[p], key, start + 1, val);
- else {
- // key already in tree, overwrite data
- eq[p] = val;
- }
- } else
- hi[p] = Insert(hi[p], key, start, val);
- return p;
- }
- /**
- * Compares 2 null terminated char arrays
- */
- public static int Strcmp(char[] a, int startA, char[] b, int startB) {
- for (; a[startA] == b[startB]; startA++, startB++)
- if (a[startA] == 0)
- return 0;
- return a[startA] - b[startB];
- }
- /**
- * Compares a string with null terminated char array
- */
- public static int Strcmp(string str, char[] a, int start) {
- int i, d, len = str.Length;
- for (i = 0; i < len; i++) {
- d = (int)str[i] - a[start + i];
- if (d != 0)
- return d;
- if (a[start + i] == 0)
- return d;
- }
- if (a[start + i] != 0)
- return (int)-a[start + i];
- return 0;
- }
- public static void Strcpy(char[] dst, int di, char[] src, int si) {
- while (src[si] != 0)
- dst[di++] = src[si++];
- dst[di] = (char)0;
- }
- public static int Strlen(char[] a, int start) {
- int len = 0;
- for (int i = start; i < a.Length && a[i] != 0; i++)
- len++;
- return len;
- }
- public static int Strlen(char[] a) {
- return Strlen(a, 0);
- }
- virtual public int Find(string key) {
- int len = key.Length;
- char[] strkey = new char[len + 1];
- key.CopyTo(0, strkey, 0, len);
- strkey[len] = (char)0;
- return Find(strkey, 0);
- }
- virtual public int Find(char[] key, int start) {
- int d;
- char p = root;
- int i = start;
- char c;
- while (p != 0) {
- if (sc[p] == 0xFFFF) {
- if (Strcmp(key, i, kv.Arr, lo[p]) == 0)
- return eq[p];
- else
- return -1;
- }
- c = key[i];
- d = c - sc[p];
- if (d == 0) {
- if (c == 0)
- return eq[p];
- i++;
- p = eq[p];
- } else if (d < 0)
- p = lo[p];
- else
- p = hi[p];
- }
- return -1;
- }
- virtual public bool Knows(string key) {
- return (Find(key) >= 0);
- }
- // redimension the arrays
- private void RedimNodeArrays(int newsize) {
- int len = newsize < lo.Length ? newsize : lo.Length;
- char[] na = new char[newsize];
- Array.Copy(lo, 0, na, 0, len);
- lo = na;
- na = new char[newsize];
- Array.Copy(hi, 0, na, 0, len);
- hi = na;
- na = new char[newsize];
- Array.Copy(eq, 0, na, 0, len);
- eq = na;
- na = new char[newsize];
- Array.Copy(sc, 0, na, 0, len);
- sc = na;
- }
- virtual public int Size {
- get {
- return length;
- }
- }
- virtual public Object Clone() {
- TernaryTree t = new TernaryTree();
- t.lo = (char[])this.lo.Clone();
- t.hi = (char[])this.hi.Clone();
- t.eq = (char[])this.eq.Clone();
- t.sc = (char[])this.sc.Clone();
- t.kv = (CharVector)this.kv.Clone();
- t.root = this.root;
- t.freenode = this.freenode;
- t.length = this.length;
- return t;
- }
- /**
- * Recursively insert the median first and then the median of the
- * lower and upper halves, and so on in order to get a balanced
- * tree. The array of keys is assumed to be sorted in ascending
- * order.
- */
- virtual protected void InsertBalanced(string[] k, char[] v, int offset, int n) {
- int m;
- if (n < 1)
- return;
- m = n >> 1;
- Insert(k[m + offset], v[m + offset]);
- InsertBalanced(k, v, offset, m);
- InsertBalanced(k, v, offset + m + 1, n - m - 1);
- }
- /**
- * Balance the tree for best search performance
- */
- virtual public void Balance() {
- // System.out.Print("Before root splitchar = "); System.out.Println(sc[root]);
- int i = 0, n = length;
- string[] k = new string[n];
- char[] v = new char[n];
- Iterator iter = new Iterator(this);
- while (iter.HasMoreElements()) {
- v[i] = iter.Value;
- k[i++] = (string)iter.NextElement();
- }
- Init();
- InsertBalanced(k, v, 0, n);
- // With uniform letter distribution sc[root] should be around 'm'
- // System.out.Print("After root splitchar = "); System.out.Println(sc[root]);
- }
- /**
- * Each node stores a character (splitchar) which is part of
- * some Key(s). In a compressed branch (one that only contain
- * a single string key) the trailer of the key which is not
- * already in nodes is stored externally in the kv array.
- * As items are inserted, key substrings decrease.
- * Some substrings may completely disappear when the whole
- * branch is totally decompressed.
- * The tree is traversed to find the key substrings actually
- * used. In addition, duplicate substrings are removed using
- * a map (implemented with a TernaryTree!).
- *
- */
- virtual public void TrimToSize() {
- // first balance the tree for best performance
- Balance();
- // redimension the node arrays
- RedimNodeArrays(freenode);
- // ok, compact kv array
- CharVector kx = new CharVector();
- kx.Alloc(1);
- TernaryTree map = new TernaryTree();
- Compact(kx, map, root);
- kv = kx;
- kv.TrimToSize();
- }
- private void Compact(CharVector kx, TernaryTree map, char p) {
- int k;
- if (p == 0)
- return;
- if (sc[p] == 0xFFFF) {
- k = map.Find(kv.Arr, lo[p]);
- if (k < 0) {
- k = kx.Alloc(Strlen(kv.Arr, lo[p]) + 1);
- Strcpy(kx.Arr, k, kv.Arr, lo[p]);
- map.Insert(kx.Arr, k, (char)k);
- }
- lo[p] = (char)k;
- } else {
- Compact(kx, map, lo[p]);
- if (sc[p] != 0)
- Compact(kx, map, eq[p]);
- Compact(kx, map, hi[p]);
- }
- }
- virtual public Iterator Keys {
- get {
- return new Iterator(this);
- }
- }
- public class Iterator {
- /**
- * current node index
- */
- int cur;
- /**
- * current key
- */
- string curkey;
- /**
- * TernaryTree parent
- */
- TernaryTree parent;
- private class Item : ICloneable {
- internal char parent;
- internal char child;
- public Item() {
- parent = (char)0;
- child = (char)0;
- }
- public Item(char p, char c) {
- parent = p;
- child = c;
- }
- virtual public Object Clone() {
- return new Item(parent, child);
- }
- }
- /**
- * Node stack
- */
- Stack ns;
- /**
- * key stack implemented with a StringBuilder
- */
- StringBuilder ks;
- public Iterator(TernaryTree parent) {
- this.parent = parent;
- cur = -1;
- ns = new Stack();
- ks = new StringBuilder();
- Rewind();
- }
- virtual public void Rewind() {
- ns.Clear();
- ks.Length = 0;
- cur = parent.root;
- Run();
- }
- virtual public Object NextElement() {
- string res = curkey;
- cur = Up();
- Run();
- return res;
- }
- virtual public char Value {
- get {
- if (cur >= 0)
- return this.parent.eq[cur];
- return (char)0;
- }
- }
- virtual public bool HasMoreElements() {
- return (cur != -1);
- }
- /**
- * traverse upwards
- */
- private int Up() {
- Item i = new Item();
- int res = 0;
- if (ns.Count == 0)
- return -1;
- if (cur != 0 && parent.sc[cur] == 0)
- return parent.lo[cur];
- bool climb = true;
- while (climb) {
- i = (Item)ns.Pop();
- i.child++;
- switch (i.child) {
- case (char)1:
- if (parent.sc[i.parent] != 0) {
- res = parent.eq[i.parent];
- ns.Push(i.Clone());
- ks.Append(parent.sc[i.parent]);
- } else {
- i.child++;
- ns.Push(i.Clone());
- res = parent.hi[i.parent];
- }
- climb = false;
- break;
- case (char)2:
- res = parent.hi[i.parent];
- ns.Push(i.Clone());
- if (ks.Length > 0)
- ks.Length = ks.Length - 1; // pop
- climb = false;
- break;
- default:
- if (ns.Count == 0)
- return -1;
- climb = true;
- break;
- }
- }
- return res;
- }
- /**
- * traverse the tree to find next key
- */
- private int Run() {
- if (cur == -1)
- return -1;
- bool leaf = false;
- for (; ; ) {
- // first go down on low branch until leaf or compressed branch
- while (cur != 0) {
- if (parent.sc[cur] == 0xFFFF) {
- leaf = true;
- break;
- }
- ns.Push(new Item((char)cur, '\u0000'));
- if (parent.sc[cur] == 0) {
- leaf = true;
- break;
- }
- cur = parent.lo[cur];
- }
- if (leaf)
- break;
- // nothing found, go up one node and try again
- cur = Up();
- if (cur == -1) {
- return -1;
- }
- }
- // The current node should be a data node and
- // the key should be in the key stack (at least partially)
- StringBuilder buf = new StringBuilder(ks.ToString());
- if (parent.sc[cur] == 0xFFFF) {
- int p = parent.lo[cur];
- while (parent.kv[p] != 0)
- buf.Append(parent.kv[p++]);
- }
- curkey = buf.ToString();
- return 0;
- }
- }
- public virtual void PrintStats() {
- Console.Error.WriteLine("Number of keys = " + length.ToString());
- Console.Error.WriteLine("Node count = " + freenode.ToString());
- // Console.Error.WriteLine("Array length = " + int.ToString(eq.Length));
- Console.Error.WriteLine("Key Array length = "
- + kv.Length.ToString());
- /*
- * for (int i=0; i<kv.Length; i++)
- * if ( kv[i] != 0 )
- * System.out.Print(kv[i]);
- * else
- * System.out.Println("");
- * System.out.Println("Keys:");
- * for (Enumeration enum = Keys(); enum.HasMoreElements(); )
- * System.out.Println(enum.NextElement());
- */
- }
- }
- }