/tools/map-converter/src/main/java/crosby/binary/StringTable.java

https://bitbucket.org/leemur/logistica · Java · 138 lines · 58 code · 17 blank · 63 comment · 5 complexity · 442cd2e0a387e3c7151a1cf6032a61e9 MD5 · raw file

  1. /** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
  2. This program is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Lesser General Public License as
  4. published by the Free Software Foundation, either version 3 of the
  5. License, or (at your option) any later version.
  6. This program is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU General Public License for more details.
  10. You should have received a copy of the GNU General Public License
  11. along with this program. If not, see <http://www.gnu.org/licenses/>.
  12. */
  13. package crosby.binary;
  14. import java.util.Arrays;
  15. import java.util.Comparator;
  16. import java.util.HashMap;
  17. import com.google.protobuf.ByteString;
  18. /**
  19. * Class for mapping a set of strings to integers, giving frequently occuring
  20. * strings small integers.
  21. */
  22. public class StringTable {
  23. public StringTable() {
  24. clear();
  25. }
  26. private HashMap<String, Integer> counts;
  27. private HashMap<String, Integer> stringmap;
  28. private String set[];
  29. public void incr(String s) {
  30. if (counts.containsKey(s)) {
  31. counts.put(s, new Integer(counts.get(s).intValue() + 1));
  32. } else {
  33. counts.put(s, new Integer(1));
  34. }
  35. }
  36. /** After the stringtable has been built, return the offset of a string in it.
  37. *
  38. * Note, value '0' is reserved for use as a delimiter and will not be returned.
  39. * @param s
  40. * @return
  41. */
  42. public int getIndex(String s) {
  43. return stringmap.get(s).intValue();
  44. }
  45. public void finish() {
  46. Comparator<String> comparator = new Comparator<String>() {
  47. @Override
  48. public int compare(final String s1, String s2) {
  49. int diff = counts.get(s2) - counts.get(s1);
  50. return diff;
  51. }
  52. };
  53. /* Sort the stringtable */
  54. /*
  55. When a string is referenced, strings in the stringtable with indices:
  56. 0 : Is reserved (used as a delimiter in tags
  57. A: 1 to 127 : Uses can be represented with 1 byte
  58. B: 128 to 128**2-1 : Uses can be represented with 2 bytes,
  59. C: 128*128 to X : Uses can be represented with 3 bytes in the unlikely case we have >16k strings in a block. No block will contain enough strings that we'll need 4 bytes.
  60. There are goals that will improve compression:
  61. 1. I want to use 1 bytes for the most frequently occurring strings, then 2 bytes, then 3 bytes.
  62. 2. I want to use low integers as frequently as possible (for better
  63. entropy encoding out of deflate)
  64. 3. I want the stringtable to compress as small as possible.
  65. Condition 1 is obvious. Condition 2 makes deflate compress stringtable references more effectively.
  66. When compressing entities, delta coding causes small positive integers to occur more frequently
  67. than larger integers. Even though a stringtable references to indices of 1 and 127 both use one
  68. byte in a decompressed file, the small integer bias causes deflate to use fewer bits to represent
  69. the smaller index when compressed. Condition 3 is most effective when adjacent strings in the
  70. stringtable have a lot of common substrings.
  71. So, when I decide on the master stringtable to use, I put the 127 most frequently occurring
  72. strings into A (accomplishing goal 1), and sort them by frequency (to accomplish goal 2), but
  73. for B and C, which contain the less progressively less frequently encountered strings, I sort
  74. them lexiconographically, to maximize goal 3 and ignoring goal 2.
  75. Goal 1 is the most important. Goal 2 helped enough to be worth it, and goal 3 was pretty minor,
  76. but all should be re-benchmarked.
  77. */
  78. set = counts.keySet().toArray(new String[0]);
  79. if (set.length > 0) {
  80. // Sort based on the frequency.
  81. Arrays.sort(set, comparator);
  82. // Each group of keys that serializes to the same number of bytes is
  83. // sorted lexiconographically.
  84. // to maximize deflate compression.
  85. // Don't sort the first array. There's not likely to be much benefit, and we want frequent values to be small.
  86. //Arrays.sort(set, Math.min(0, set.length-1), Math.min(1 << 7, set.length-1));
  87. Arrays.sort(set, Math.min(1 << 7, set.length-1), Math.min(1 << 14,
  88. set.length-1));
  89. Arrays.sort(set, Math.min(1 << 14, set.length-1), Math.min(1 << 21,
  90. set.length-1), comparator);
  91. }
  92. stringmap = new HashMap<String, Integer>(2 * set.length);
  93. for (int i = 0; i < set.length; i++) {
  94. stringmap.put(set[i], new Integer(i+1)); // Index 0 is reserved for use as a delimiter.
  95. }
  96. counts = null;
  97. }
  98. public void clear() {
  99. counts = new HashMap<String, Integer>(100);
  100. stringmap = null;
  101. set = null;
  102. }
  103. public Osmformat.StringTable.Builder serialize() {
  104. Osmformat.StringTable.Builder builder = Osmformat.StringTable
  105. .newBuilder();
  106. builder.addS(ByteString.copyFromUtf8("")); // Add a unused string at offset 0 which is used as a delimiter.
  107. for (int i = 0; i < set.length; i++)
  108. builder.addS(ByteString.copyFromUtf8(set[i]));
  109. return builder;
  110. }
  111. }