PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/src/edu/umn/cs/spatialHadoop/io/TextSerializerHelper.java

https://github.com/skyswind/spatialhadoop2
Java | 612 lines | 452 code | 59 blank | 101 comment | 163 complexity | afdaeb574b6eb53fc36db52de0951e73 MD5 | raw file
Possible License(s): Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
  3. * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
  4. * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
  5. * except in compliance with the License. You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software distributed under the License is
  10. * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. * See the License for the specific language governing permissions and limitations under the License.
  12. */
  13. package edu.umn.cs.spatialHadoop.io;
  14. import java.nio.ByteBuffer;
  15. import java.util.Arrays;
  16. import java.util.Map;
  17. import org.apache.hadoop.io.Text;
  18. import com.esri.core.geometry.ogc.OGCGeometry;
  19. import com.vividsolutions.jts.geom.Geometry;
  20. import com.vividsolutions.jts.io.ParseException;
  21. import com.vividsolutions.jts.io.WKBReader;
  22. import com.vividsolutions.jts.io.WKBWriter;
  23. import com.vividsolutions.jts.io.WKTReader;
  24. public final class TextSerializerHelper {
  /**
   * Characters used to render a single digit as text. Index {@code d} holds the
   * character for digit value {@code d}; the table covers values 0 through 35.
   */
  final static byte[] digits = {
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
    'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
    'u', 'v', 'w', 'x', 'y', 'z'
  };

  /**
   * Membership table with 256 entries: an entry is true for '0'-'9', 'a'-'f',
   * 'A'-'F' and the minus sign.
   */
  final static boolean[] HexadecimalChars = new boolean[256];

  /**
   * Membership table with 256 entries: an entry is true for '0'-'9' and the
   * minus sign.
   */
  final static boolean[] DecimalChars = new boolean[256];

  /** 64 space characters, used as filler when reserving room at the end of a text. */
  final static byte[] ToAppend = new byte[64];

  static {
    // Decimal digits are valid in both radixes
    for (int c = '0'; c <= '9'; c++) {
      DecimalChars[c] = true;
      HexadecimalChars[c] = true;
    }
    // Six hexadecimal letters, in both cases
    for (int k = 0; k < 6; k++) {
      HexadecimalChars['a' + k] = true;
      HexadecimalChars['A' + k] = true;
    }
    // A sign character is accepted as part of a number
    DecimalChars['-'] = true;
    HexadecimalChars['-'] = true;
    Arrays.fill(ToAppend, (byte) ' ');
  }
  55. /**
  56. * Appends hex representation of the given number to the given string.
  57. * If append is set to true, a comma is also appended to the text.
  58. * @param i
  59. * @param t
  60. * @param appendComma
  61. */
  62. public static void serializeHexLong(long i, Text t, char toAppend) {
  63. // Calculate number of bytes needed to serialize the given long
  64. int bytes_needed = 0;
  65. long temp;
  66. if (i < 0) {
  67. bytes_needed++; // An additional
  68. temp = -i;
  69. } else {
  70. temp = i;
  71. }
  72. do {
  73. bytes_needed += 1;
  74. temp >>>= 4;
  75. } while (temp != 0);
  76. if (toAppend != '\0')
  77. bytes_needed++;
  78. // Reserve the bytes needed in the text
  79. t.append(ToAppend, 0, bytes_needed);
  80. // Extract the underlying buffer array and fill it directly
  81. byte[] buffer = t.getBytes();
  82. // Position of the next character to write in the text
  83. int position = t.getLength() - 1;
  84. if (toAppend != '\0')
  85. buffer[position--] = (byte) toAppend;
  86. final int shift = 4;
  87. final int radix = 1 << shift;
  88. final long mask = radix - 1;
  89. // Negative sign is prepended separately for negative numbers
  90. boolean negative = false;
  91. if (i < 0) {
  92. i = -i;
  93. negative = true;
  94. }
  95. do {
  96. buffer[position--] = digits[(int)(i & mask)];
  97. i >>>= shift;
  98. } while (i != 0);
  99. if (negative)
  100. buffer[position--] = '-';
  101. }
  102. /**
  103. * Parses only long from the given byte array (string). The long starts at
  104. * offset and is len characters long.
  105. * @param buf
  106. * @param offset
  107. * @param len
  108. * @return
  109. */
  110. public static long deserializeHexLong(byte[] buf, int offset, int len) {
  111. boolean negative = false;
  112. if (buf[offset] == '-') {
  113. negative = true;
  114. offset++;
  115. len--;
  116. }
  117. long i = 0;
  118. while (len-- > 0) {
  119. i <<= 4;
  120. if (buf[offset] <= '9')
  121. i |= buf[offset++] - '0';
  122. else
  123. i |= buf[offset++] - 'a' + 10;
  124. }
  125. return negative ? -i : i;
  126. }
  127. /**
  128. * Deserializes and consumes a long from the given text. Consuming means all
  129. * characters read for deserialization are removed from the given text.
  130. * If separator is non-zero, a long is read and consumed up to the first
  131. * occurrence of this separator. The separator is also consumed.
  132. * @param text
  133. * @param separator
  134. * @return
  135. */
  136. public static long consumeHexLong(Text text, char separator) {
  137. int i = 0;
  138. byte[] bytes = text.getBytes();
  139. // Skip until the separator or end of text
  140. while (i < text.getLength() && HexadecimalChars[bytes[i]])
  141. i++;
  142. long l = deserializeHexLong(bytes, 0, i);
  143. // If the first char after the long is the separator, skip it
  144. if (i < text.getLength() && bytes[i] == separator)
  145. i++;
  146. // Shift bytes after the long
  147. System.arraycopy(bytes, i, bytes, 0, text.getLength() - i);
  148. text.set(bytes, 0, text.getLength() - i);
  149. return l;
  150. }
  151. /**
  152. * Deserializes and consumes a double from the given text. Consuming means all
  153. * characters read for deserialization are removed from the given text.
  154. * If separator is non-zero, a double is read and consumed up to the first
  155. * occurrence of this separator. The separator is also consumed.
  156. * @param text
  157. * @param separator
  158. * @return
  159. */
  160. public static double consumeDouble(Text text, char separator) {
  161. int i = 0;
  162. byte[] bytes = text.getBytes();
  163. // Skip until the separator or end of text
  164. while (i < text.getLength()
  165. && ((bytes[i] >= '0' && bytes[i] <= '9') || bytes[i] == 'e'
  166. || bytes[i] == 'E' || bytes[i] == '-' || bytes[i] == '+' || bytes[i] == '.'))
  167. i++;
  168. double d = Double.parseDouble(new String(bytes, 0, i));
  169. if (i < text.getLength() && bytes[i] == separator)
  170. i++;
  171. System.arraycopy(bytes, i, bytes, 0, text.getLength() - i);
  172. text.set(bytes, 0, text.getLength() - i);
  173. return d;
  174. }
  175. /**
  176. * Appends hex representation of the given number to the given string.
  177. * If append is set to true, a comma is also appended to the text.
  178. * @param i
  179. * @param t
  180. * @param appendComma
  181. */
  182. public static void serializeDouble(double d, Text t, char toAppend) {
  183. byte[] bytes = Double.toString(d).getBytes();
  184. t.append(bytes, 0, bytes.length);
  185. if (toAppend != '\0') {
  186. t.append(new byte[] {(byte)toAppend}, 0, 1);
  187. }
  188. }
  189. public static void serializeLong(long i, Text t, char toAppend) {
  190. // Calculate number of bytes needed to serialize the given long
  191. int bytes_needed = 0;
  192. long temp;
  193. if (i < 0) {
  194. bytes_needed++; // An additional
  195. temp = -i;
  196. } else {
  197. temp = i;
  198. }
  199. do {
  200. bytes_needed += 1;
  201. temp /= 10;
  202. } while (temp != 0);
  203. if (toAppend != '\0')
  204. bytes_needed++;
  205. // Reserve the bytes needed in the text
  206. t.append(ToAppend, 0, bytes_needed);
  207. // Extract the underlying buffer array and fill it directly
  208. byte[] buffer = t.getBytes();
  209. // Position of the next character to write in the text
  210. int position = t.getLength() - 1;
  211. if (toAppend != '\0')
  212. buffer[position--] = (byte) toAppend;
  213. // Negative sign is prepended separately for negative numbers
  214. boolean negative = false;
  215. if (i < 0) {
  216. i = -i;
  217. negative = true;
  218. }
  219. do {
  220. int digit = (int) (i % 10);
  221. buffer[position--] = digits[digit];
  222. i /= 10;
  223. } while (i != 0);
  224. if (negative)
  225. buffer[position--] = '-';
  226. }
  227. public static long deserializeLong(byte[] buf, int offset, int len) {
  228. boolean negative = false;
  229. if (buf[offset] == '-') {
  230. negative = true;
  231. offset++;
  232. len--;
  233. }
  234. long i = 0;
  235. while (len-- > 0) {
  236. i *= 10;
  237. i += buf[offset++] - '0';
  238. }
  239. return negative ? -i : i;
  240. }
  241. public static long consumeLong(Text text, char separator) {
  242. int i = 0;
  243. byte[] bytes = text.getBytes();
  244. // Skip until the separator or end of text
  245. while (i < text.getLength() && DecimalChars[bytes[i]])
  246. i++;
  247. long l = deserializeLong(bytes, 0, i);
  248. // If the first char after the long is the separator, skip it
  249. if (i < text.getLength() && bytes[i] == separator)
  250. i++;
  251. // Shift bytes after the long
  252. System.arraycopy(bytes, i, bytes, 0, text.getLength() - i);
  253. text.set(bytes, 0, text.getLength() - i);
  254. return l;
  255. }
  256. public static void serializeInt(int i, Text t, char toAppend) {
  257. // Calculate number of bytes needed to serialize the given long
  258. int bytes_needed = 0;
  259. int temp;
  260. if (i < 0) {
  261. bytes_needed++; // An additional
  262. temp = -i;
  263. } else {
  264. temp = i;
  265. }
  266. do {
  267. bytes_needed += 1;
  268. temp /= 10;
  269. } while (temp != 0);
  270. if (toAppend != '\0')
  271. bytes_needed++;
  272. // Reserve the bytes needed in the text
  273. t.append(ToAppend, 0, bytes_needed);
  274. // Extract the underlying buffer array and fill it directly
  275. byte[] buffer = t.getBytes();
  276. // Position of the next character to write in the text
  277. int position = t.getLength() - 1;
  278. if (toAppend != '\0')
  279. buffer[position--] = (byte) toAppend;
  280. // Negative sign is prepended separately for negative numbers
  281. boolean negative = false;
  282. if (i < 0) {
  283. i = -i;
  284. negative = true;
  285. }
  286. do {
  287. int digit = i % 10;
  288. buffer[position--] = digits[digit];
  289. i /= 10;
  290. } while (i != 0);
  291. if (negative)
  292. buffer[position--] = '-';
  293. }
  294. public static int deserializeInt(byte[] buf, int offset, int len) {
  295. boolean negative = false;
  296. if (buf[offset] == '-') {
  297. negative = true;
  298. offset++;
  299. len--;
  300. }
  301. int i = 0;
  302. while (len-- > 0) {
  303. i *= 10;
  304. i += buf[offset++] - '0';
  305. }
  306. return negative ? -i : i;
  307. }
  308. public static int consumeInt(Text text, char separator) {
  309. int i = 0;
  310. byte[] bytes = text.getBytes();
  311. // Skip until the separator or end of text
  312. while (i < text.getLength() && DecimalChars[bytes[i]])
  313. i++;
  314. int l = deserializeInt(bytes, 0, i);
  315. // If the first char after the long is the separator, skip it
  316. if (i < text.getLength() && bytes[i] == separator)
  317. i++;
  318. // Shift bytes after the long
  319. System.arraycopy(bytes, i, bytes, 0, text.getLength() - i);
  320. text.set(bytes, 0, text.getLength() - i);
  321. return l;
  322. }
  323. private static final byte[] Separators = {'[', '#', ',', ']'};
  324. private static final int MapStart = 0, KeyValueSeparator = 1,
  325. FieldSeparator = 2, MapEnd = 3;
  326. public static void consumeMap(Text text, Map<String, String> tags) {
  327. tags.clear();
  328. if (text.getLength() > 0) {
  329. byte[] tagsBytes = text.getBytes();
  330. if (tagsBytes[0] != Separators[MapStart])
  331. return;
  332. int i1 = 1;
  333. while (i1 < text.getLength() && tagsBytes[i1] != Separators[MapEnd]) {
  334. int i2 = i1 + 1;
  335. while (i2 < text.getLength() && tagsBytes[i2] != Separators[KeyValueSeparator])
  336. i2++;
  337. String key = new String(tagsBytes, i1, i2 - i1);
  338. i1 = i2 + 1;
  339. i2 = i1 + 1;
  340. while (i2 < text.getLength() && tagsBytes[i2] != Separators[FieldSeparator] && tagsBytes[i2] != Separators[MapEnd])
  341. i2++;
  342. String value = new String(tagsBytes, i1, i2 - i1);
  343. tags.put(key, value);
  344. i1 = i2;
  345. if (i1 < text.getLength() && tagsBytes[i1] == Separators[FieldSeparator])
  346. i1++;
  347. }
  348. text.set(tagsBytes, i1, text.getLength() - i1);
  349. }
  350. }
  351. public static Text serializeMap(Text text, Map<String, String> tags) {
  352. if (!tags.isEmpty()) {
  353. boolean first = true;
  354. text.append(Separators, MapStart, 1);
  355. for (Map.Entry<String, String> entry : tags.entrySet()) {
  356. if (first) {
  357. first = false;
  358. } else {
  359. first = true;
  360. text.append(Separators, FieldSeparator, 1);
  361. }
  362. byte[] k = entry.getKey().getBytes();
  363. text.append(k, 0, k.length);
  364. text.append(Separators, KeyValueSeparator, 1);
  365. byte[] v = entry.getValue().getBytes();
  366. text.append(v, 0, v.length);
  367. }
  368. text.append(Separators, MapEnd, 1);
  369. }
  370. return text;
  371. }
  372. private static final byte[][] ShapeNames = { "LINESTRING".getBytes(),
  373. "POINT".getBytes(), "POLYGON".getBytes(), "MULTIPOINT".getBytes(),
  374. "MULTILINESTRING".getBytes(), "MULTIPOLYGON".getBytes(),
  375. "GEOMETRYCOLLECTION".getBytes() };
  376. public static OGCGeometry consumeGeometryESRI(Text text, char separator) {
  377. // Check whether this text is a Well Known Text (WKT) or a hexed string
  378. boolean wkt = false;
  379. byte[] bytes = text.getBytes();
  380. int length = text.getLength();
  381. int i_shape = 0;
  382. while (!wkt && i_shape < ShapeNames.length) {
  383. byte[] shapeName = ShapeNames[i_shape];
  384. if (length > shapeName.length) {
  385. int i = 0;
  386. while (i < shapeName.length && shapeName[i] == bytes[i])
  387. i++;
  388. if (i == shapeName.length) {
  389. wkt = true;
  390. break;
  391. }
  392. }
  393. i_shape++;
  394. }
  395. // Look for the terminator of the shape text
  396. int i1 = 0;
  397. if (bytes[i1] == '\'' || bytes[i1] == '\"') {
  398. separator = (char) bytes[i1++];
  399. }
  400. int i2 = i1;
  401. while (i2 < length && bytes[i2] != separator)
  402. i2++;
  403. String str = new String(bytes, i1, i2-i1);
  404. // Remove consumed bytes from the text
  405. text.set(bytes, i2, text.getLength() - i2);
  406. OGCGeometry geom = parseText(str);
  407. return geom;
  408. }
  409. public static OGCGeometry parseText(String str) {
  410. OGCGeometry geom = null;
  411. try {
  412. // Parse string as well known text (WKT)
  413. geom = OGCGeometry.fromText(str);
  414. } catch (IllegalArgumentException e) {
  415. try {
  416. // Error parsing from WKT, try hex string instead
  417. byte[] binary = hexToBytes(str);
  418. geom = OGCGeometry.fromBinary(ByteBuffer.wrap(binary));
  419. } catch (RuntimeException e1) {
  420. throw new RuntimeException("Cannot parse the shape: "+str, e1);
  421. }
  422. }
  423. return geom;
  424. }
  425. /**
  426. * Convert a string containing a hex string to a byte array of binary.
  427. * For example, the string "AABB" is converted to the byte array {0xAA, 0XBB}
  428. * @param hex
  429. * @return
  430. */
  431. public static byte[] hexToBytes(String hex) {
  432. byte[] bytes = new byte[(hex.length() + 1) / 2];
  433. for (int i = 0; i < hex.length(); i++) {
  434. byte x = (byte) hex.charAt(i);
  435. if (x >= '0' && x <= '9')
  436. x -= '0';
  437. else if (x >= 'a' && x <= 'f')
  438. x = (byte) ((x - 'a') + 0xa);
  439. else if (x >= 'A' && x <= 'F')
  440. x = (byte) ((x - 'A') + 0xA);
  441. else
  442. throw new RuntimeException("Invalid hex char "+x+" at position "+i);
  443. if (i % 2 == 0)
  444. x <<= 4;
  445. bytes[i / 2] |= x;
  446. }
  447. return bytes;
  448. }
  449. public static void serializeGeometry(Text text, OGCGeometry geom, char toAppend) {
  450. String str = bytesToHex(geom.asBinary().array());
  451. byte[] str_b = str.getBytes();
  452. text.append(str_b, 0, str_b.length);
  453. if (toAppend != '\0')
  454. text.append(new byte[] {(byte) toAppend}, 0, 1);
  455. }
  456. private static final WKTReader wktReader = new WKTReader();
  457. private static final WKBWriter wkbWriter = new WKBWriter();
  458. private static final WKBReader wkbReader = new WKBReader();
  459. public static void serializeGeometry(Text text, Geometry geom, char toAppend) {
  460. String wkt = geom.toText();
  461. byte[] wkt_b = wkt.getBytes();
  462. text.append(wkt_b, 0, wkt_b.length);
  463. if (toAppend != '\0')
  464. text.append(new byte[] {(byte) toAppend}, 0, 1);
  465. }
  466. public static Geometry consumeGeometryJTS(Text text, char separator) {
  467. // Check whether this text is a Well Known Text (WKT) or a hexed string
  468. boolean wkt = false;
  469. byte[] bytes = text.getBytes();
  470. int i_shape = 0;
  471. while (!wkt && i_shape < ShapeNames.length) {
  472. byte[] shapeName = ShapeNames[i_shape];
  473. if (text.getLength() > shapeName.length) {
  474. int i = 0;
  475. while (i < shapeName.length && shapeName[i] == bytes[i])
  476. i++;
  477. if (i == shapeName.length) {
  478. wkt = true;
  479. break;
  480. }
  481. }
  482. i_shape++;
  483. }
  484. int i_end;
  485. Geometry geom;
  486. if (i_shape < ShapeNames.length) {
  487. // Look for the terminator of the shape text
  488. i_end = 0;
  489. while (i_end < text.getLength() && bytes[i_end] != '(')
  490. i_end++;
  491. int nesting = 1;
  492. while (i_end < text.getLength() && nesting > 0) {
  493. if (bytes[i_end] == '(')
  494. nesting++;
  495. else if (bytes[i_end] == ')')
  496. nesting--;
  497. i_end++;
  498. }
  499. String wkt_text = new String(bytes, 0, i_end);
  500. try {
  501. geom = wktReader.read(wkt_text);
  502. } catch (ParseException e) {
  503. throw new RuntimeException("Error parsing WKT '"+wkt_text+"'", e);
  504. }
  505. } else {
  506. i_end = 0;
  507. while (i_end < text.getLength() && IsHex[bytes[i_end]])
  508. i_end++;
  509. String hex_string = new String(bytes, 0, i_end);
  510. byte[] binary = hexToBytes(hex_string);
  511. try {
  512. geom = wkbReader.read(binary);
  513. } catch (ParseException e) {
  514. throw new RuntimeException("Error parsing Hex seting '"+hex_string+"'", e);
  515. }
  516. }
  517. // Remove consumed bytes from the text
  518. if (i_end < text.getLength() && bytes[i_end] == separator)
  519. i_end++;
  520. if (i_end >= text.getLength())
  521. text.clear();
  522. else
  523. text.set(bytes, i_end, text.getLength() - i_end);
  524. return geom;
  525. }
  /**
   * Membership table with 256 entries: an entry is true for '0'-'9', 'A'-'F'
   * and 'a'-'f'.
   */
  private static final boolean[] IsHex = new boolean[256];

  /** Uppercase character for each of the sixteen hexadecimal digit values. */
  private static final byte[] HexLookupTable = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
  };

  static {
    for (int digit = '0'; digit <= '9'; digit++)
      IsHex[digit] = true;
    // Six hexadecimal letters, in both cases
    for (int k = 0; k < 6; k++) {
      IsHex['A' + k] = true;
      IsHex['a' + k] = true;
    }
  }

  /**
   * Converts a byte array to a string of uppercase hexadecimal digits, two
   * digits per byte with the high half of each byte first.
   * @param binary the bytes to encode
   * @return the hexadecimal string, twice as long as the input
   */
  public static String bytesToHex(byte[] binary) {
    final byte[] encoded = new byte[binary.length * 2];
    int out = 0;
    for (byte b : binary) {
      final int unsigned = b & 0xFF;
      encoded[out++] = HexLookupTable[unsigned >>> 4];
      encoded[out++] = HexLookupTable[unsigned & 0xF];
    }
    return new String(encoded);
  }
  553. }