PageRenderTime 39ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/lang/php/lib/avro/data_file.php

http://github.com/apache/avro
PHP | 551 lines | 253 code | 72 blank | 226 comment | 26 complexity | 8d4f23363a6c3a8d20627e02e8bee5c6 MD5 | raw file
Possible License(s): Apache-2.0, JSON, BSD-3-Clause
  1. <?php
  2. /**
  3. * Licensed to the Apache Software Foundation (ASF) under one
  4. * or more contributor license agreements. See the NOTICE file
  5. * distributed with this work for additional information
  6. * regarding copyright ownership. The ASF licenses this file
  7. * to you under the Apache License, Version 2.0 (the
  8. * "License"); you may not use this file except in compliance
  9. * with the License. You may obtain a copy of the License at
  10. *
  11. * https://www.apache.org/licenses/LICENSE-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS,
  15. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. * See the License for the specific language governing permissions and
  17. * limitations under the License.
  18. */
  19. /**
  20. * Classes handling reading and writing from and to AvroIO objects
  21. * @package Avro
  22. */
  /**
   * Exception thrown by the AvroDataIO container classes in this file
   * (AvroDataIO, AvroDataIOReader and AvroDataIOWriter).
   * @package Avro
   */
  class AvroDataIOException extends AvroException
  {
  }
  /**
   * Constants and factory helpers shared by the Avro object container
   * file reader (AvroDataIOReader) and writer (AvroDataIOWriter).
   * @package Avro
   */
  class AvroDataIO
  {
    /**
     * @var int used in file header
     */
    const VERSION = 1;

    /**
     * @var int count of bytes in synchronization marker
     */
    const SYNC_SIZE = 16;

    /**
     * @var int threshold compared against the writer's buffer length();
     *          once the buffer reaches it, AvroDataIOWriter::append()
     *          writes out a block. Arbitrarily set to 4000 * SYNC_SIZE.
     * @todo make this value configurable
     */
    const SYNC_INTERVAL = 64000;

    /**
     * @var string map key for datafile metadata codec value
     */
    const METADATA_CODEC_ATTR = 'avro.codec';

    /**
     * @var string map key for datafile metadata schema value
     */
    const METADATA_SCHEMA_ATTR = 'avro.schema';

    /**
     * @var string JSON for datafile metadata schema
     */
    const METADATA_SCHEMA_JSON = '{"type":"map","values":"bytes"}';

    /**
     * @var string codec value for NULL codec
     */
    const NULL_CODEC = 'null';

    /**
     * @var string codec value for deflate codec
     */
    const DEFLATE_CODEC = 'deflate';

    /**
     * @var array array of valid codec names
     */
    private static $valid_codecs = array(self::NULL_CODEC, self::DEFLATE_CODEC);

    /**
     * @var AvroSchema cached version of metadata schema object
     */
    private static $metadata_schema;

    /**
     * @returns string the initial "magic" segment of an Avro container
     *                 file header: the bytes 'Obj' followed by VERSION
     *                 packed as a single byte.
     */
    public static function magic()
    {
      return 'Obj' . pack('c', self::VERSION);
    }

    /**
     * @returns int count of bytes in the initial "magic" segment of the
     *              Avro container file header
     */
    public static function magic_size()
    {
      return strlen(self::magic());
    }

    /**
     * Lazily parses METADATA_SCHEMA_JSON and caches the result for
     * subsequent calls.
     * @returns AvroSchema object of Avro container file metadata.
     */
    public static function metadata_schema()
    {
      if (is_null(self::$metadata_schema))
        self::$metadata_schema = AvroSchema::parse(self::METADATA_SCHEMA_JSON);

      return self::$metadata_schema;
    }

    /**
     * Opens the file at $file_path and wraps it in a container reader or
     * writer, depending on $mode.
     *
     * @param string $file_path file_path of file to open
     * @param string $mode one of AvroFile::READ_MODE or AvroFile::WRITE_MODE
     * @param string $schema_json JSON of the schema; required in write
     *                            mode, optional in read mode
     * @param string $codec compression codec (only used in write mode)
     * @returns AvroDataIOWriter|AvroDataIOReader a writer in write mode,
     *                                            a reader in read mode
     *
     * @throws AvroDataIOException if $schema_json is not provided in
     *                             write mode or if an invalid $mode is given.
     */
    public static function open_file($file_path, $mode=AvroFile::READ_MODE,
                                     $schema_json=null, $codec=self::NULL_CODEC)
    {
      $schema = is_null($schema_json) ? null : AvroSchema::parse($schema_json);

      switch ($mode)
      {
        case AvroFile::WRITE_MODE:
          if (is_null($schema))
            throw new AvroDataIOException('Writing an Avro file requires a schema.');
          $file = new AvroFile($file_path, AvroFile::WRITE_MODE);
          return self::open_writer($file, $schema, $codec);

        case AvroFile::READ_MODE:
          $file = new AvroFile($file_path, AvroFile::READ_MODE);
          return self::open_reader($file, $schema);

        default:
          throw new AvroDataIOException(
            sprintf("Only modes '%s' and '%s' allowed. You gave '%s'.",
                    AvroFile::READ_MODE, AvroFile::WRITE_MODE, $mode));
      }
    }

    /**
     * @returns array array of valid codecs
     */
    public static function valid_codecs()
    {
      return self::$valid_codecs;
    }

    /**
     * @param string $codec
     * @returns boolean true if $codec is a valid codec value and false otherwise
     */
    public static function is_valid_codec($codec)
    {
      // Strict comparison: with loose matching a non-string such as
      // boolean true (or integer 0 on PHP < 8) compares equal to a codec
      // name and would be reported as a valid codec.
      return in_array($codec, self::valid_codecs(), true);
    }

    /**
     * @param AvroIO $io
     * @param AvroSchema $schema
     * @param string $codec
     * @returns AvroDataIOWriter
     */
    protected static function open_writer($io, $schema, $codec=self::NULL_CODEC)
    {
      $writer = new AvroIODatumWriter($schema);
      return new AvroDataIOWriter($io, $writer, $schema, $codec);
    }

    /**
     * @param AvroIO $io
     * @param AvroSchema $schema passed to the datum reader as its second
     *                           constructor argument
     * @returns AvroDataIOReader
     */
    protected static function open_reader($io, $schema)
    {
      $reader = new AvroIODatumReader(null, $schema);
      return new AvroDataIOReader($io, $reader);
    }
  }
  /**
   * Reads Avro data from an AvroIO source using an AvroSchema.
   * @package Avro
   */
  class AvroDataIOReader
  {
    /**
     * @var AvroIO
     */
    private $io;

    /**
     * @var AvroIOBinaryDecoder decoder reading directly from $io
     */
    private $decoder;

    /**
     * @var AvroIODatumReader
     */
    private $datum_reader;

    /**
     * @var string sync marker read from the container header
     */
    public $sync_marker;

    /**
     * @var array object container metadata
     */
    public $metadata;

    /**
     * @var int count of items remaining in the current block
     */
    private $block_count;

    /**
     * @var string compression codec named in the container metadata
     *             (may be unset/empty when the header carries no codec)
     */
    private $codec;

    /**
     * @param AvroIO $io source from which to read
     * @param AvroIODatumReader $datum_reader reader that understands
     *                                        the data schema
     * @throws AvroDataIOException if $io is not an instance of AvroIO
     *                             or the codec specified in the header
     *                             is not supported
     * @uses read_header()
     */
    public function __construct($io, $datum_reader)
    {
      if (!($io instanceof AvroIO))
        throw new AvroDataIOException('io must be instance of AvroIO');

      $this->io = $io;
      $this->decoder = new AvroIOBinaryDecoder($this->io);
      $this->datum_reader = $datum_reader;
      $this->read_header();

      $codec = AvroUtil::array_value($this->metadata,
                                     AvroDataIO::METADATA_CODEC_ATTR);
      if ($codec && !AvroDataIO::is_valid_codec($codec))
        throw new AvroDataIOException(sprintf('Unknown codec: %s', $codec));
      $this->codec = $codec;

      $this->block_count = 0;

      // FIXME: Seems unsanitary to set writers_schema here.
      // Can't constructor take it as an argument?
      $this->datum_reader->set_writers_schema(
        AvroSchema::parse($this->metadata[AvroDataIO::METADATA_SCHEMA_ATTR]));
    }

    /**
     * Reads header of object container: the magic bytes, the metadata
     * map and the sync marker.
     * @throws AvroDataIOException if the file is not an Avro data file.
     */
    private function read_header()
    {
      $this->seek(0, AvroIO::SEEK_SET);

      $magic = $this->read(AvroDataIO::magic_size());

      if (strlen($magic) < AvroDataIO::magic_size())
        throw new AvroDataIOException(
          'Not an Avro data file: shorter than the Avro magic block');

      // Strict comparison: these are raw bytes, so PHP's loose string
      // comparison (numeric-string juggling) must not be applied.
      if (AvroDataIO::magic() !== $magic)
        throw new AvroDataIOException(
          sprintf('Not an Avro data file: %s does not match %s',
                  $magic, AvroDataIO::magic()));

      $this->metadata = $this->datum_reader->read_data(AvroDataIO::metadata_schema(),
                                                       AvroDataIO::metadata_schema(),
                                                       $this->decoder);
      $this->sync_marker = $this->read(AvroDataIO::SYNC_SIZE);
    }

    /**
     * Reads every remaining datum from the container, block by block.
     * @internal Would be nice to implement data() as an iterator, I think
     * @returns array of data from object container.
     * @throws AvroDataIOException if a deflate-compressed block cannot
     *                             be inflated.
     */
    public function data()
    {
      $data = array();
      while (true)
      {
        if (0 == $this->block_count)
        {
          // Current block is exhausted: move on to the next one, if any.
          if ($this->is_eof())
            break;

          if ($this->skip_sync())
            if ($this->is_eof())
              break;

          $length = $this->read_block_header();
          $decoder = $this->decoder;
          if (AvroDataIO::DEFLATE_CODEC === $this->codec)
          {
            $compressed = $decoder->read($length);
            $datum = gzinflate($compressed);
            // gzinflate() returns false on corrupt input; fail loudly
            // here instead of handing a non-string to AvroStringIO.
            if (false === $datum)
              throw new AvroDataIOException(
                'Unable to inflate deflate-compressed Avro data block');
            // Items of a compressed block are decoded from the inflated
            // bytes rather than from the underlying AvroIO.
            $decoder = new AvroIOBinaryDecoder(new AvroStringIO($datum));
          }
        }
        $data []= $this->datum_reader->read($decoder);
        $this->block_count -= 1;
      }
      return $data;
    }

    /**
     * Closes this reader (and its AvroIO object.)
     * @uses AvroIO::close()
     */
    public function close() { return $this->io->close(); }

    /**
     * @param int $offset
     * @param int $whence
     * @uses AvroIO::seek()
     */
    private function seek($offset, $whence)
    {
      return $this->io->seek($offset, $whence);
    }

    /**
     * @param int $len count of bytes to read
     * @uses AvroIO::read()
     */
    private function read($len) { return $this->io->read($len); }

    /**
     * @uses AvroIO::is_eof()
     */
    private function is_eof() { return $this->io->is_eof(); }

    /**
     * Reads SYNC_SIZE bytes and checks them against the sync marker from
     * the header; if they differ, seeks back by SYNC_SIZE bytes.
     * @returns boolean true if a sync marker was consumed, false otherwise
     */
    private function skip_sync()
    {
      $proposed_sync_marker = $this->read(AvroDataIO::SYNC_SIZE);
      // Strict comparison: the markers are arbitrary binary strings and
      // must match byte for byte.
      if ($proposed_sync_marker !== $this->sync_marker)
      {
        $this->seek(-AvroDataIO::SYNC_SIZE, AvroIO::SEEK_CUR);
        return false;
      }
      return true;
    }

    /**
     * Reads the block header (which includes the count of items in the block
     * and the length in bytes of the block)
     * @returns int length in bytes of the block.
     */
    private function read_block_header()
    {
      $this->block_count = $this->decoder->read_long();
      return $this->decoder->read_long();
    }
  }
  /**
   * Writes Avro data to an AvroIO source using an AvroSchema
   * @package Avro
   */
  class AvroDataIOWriter
  {
    /**
     * @returns string a new 16-byte sync marker built from mt_rand() values.
     */
    private static function generate_sync_marker()
    {
      // From https://php.net/manual/en/function.mt-rand.php comments
      return pack('S8',
                  mt_rand(0, 0xffff), mt_rand(0, 0xffff),
                  mt_rand(0, 0xffff),
                  mt_rand(0, 0xffff) | 0x4000,
                  mt_rand(0, 0xffff) | 0x8000,
                  mt_rand(0, 0xffff), mt_rand(0, 0xffff), mt_rand(0, 0xffff));
    }

    /**
     * @var AvroIO object container where data is written
     */
    private $io;

    /**
     * @var AvroIOBinaryEncoder encoder for object container
     */
    private $encoder;

    /**
     * @var AvroDatumWriter
     */
    private $datum_writer;

    /**
     * @var AvroStringIO buffer for writing
     */
    private $buffer;

    /**
     * @var AvroIOBinaryEncoder encoder for buffer
     */
    private $buffer_encoder;

    /**
     * @var int count of items written to block
     */
    private $block_count;

    /**
     * @var array map of object container metadata
     */
    private $metadata;

    /**
     * @var string compression codec
     */
    private $codec;

    /**
     * @var string sync marker written after the header and after each
     *             block. Declared explicitly because it was previously
     *             created as a dynamic property (deprecated as of
     *             PHP 8.2); kept public to preserve that visibility.
     */
    public $sync_marker;

    /**
     * Creates a writer. When $writers_schema is given a new container
     * header is written to $io; otherwise the existing header is read
     * from $io and the writer positions itself at the end for appending.
     *
     * @param AvroIO $io
     * @param AvroIODatumWriter $datum_writer
     * @param AvroSchema $writers_schema
     * @param string $codec
     * @throws AvroDataIOException if $io is not an instance of AvroIO or
     *                             $codec is not supported
     */
    public function __construct($io, $datum_writer, $writers_schema=null, $codec=AvroDataIO::NULL_CODEC)
    {
      if (!($io instanceof AvroIO))
        throw new AvroDataIOException('io must be instance of AvroIO');

      $this->io = $io;
      $this->encoder = new AvroIOBinaryEncoder($this->io);
      $this->datum_writer = $datum_writer;
      $this->buffer = new AvroStringIO();
      $this->buffer_encoder = new AvroIOBinaryEncoder($this->buffer);
      $this->block_count = 0;
      $this->metadata = array();

      if ($writers_schema)
      {
        if (!AvroDataIO::is_valid_codec($codec))
          throw new AvroDataIOException(
            sprintf('codec %s is not supported', $codec));

        $this->sync_marker = self::generate_sync_marker();
        $this->metadata[AvroDataIO::METADATA_CODEC_ATTR] = $this->codec = $codec;
        $this->metadata[AvroDataIO::METADATA_SCHEMA_ATTR] = (string) $writers_schema;
        $this->write_header();
      }
      else
      {
        // Append mode: reuse sync marker, codec and schema of the
        // container already present in $io.
        $dfr = new AvroDataIOReader($this->io, new AvroIODatumReader());
        $this->sync_marker = $dfr->sync_marker;

        // The codec entry is optional in container metadata; treat a
        // missing entry as the null codec instead of reading an
        // undefined array key.
        $codec_from_file = isset($dfr->metadata[AvroDataIO::METADATA_CODEC_ATTR])
          ? $dfr->metadata[AvroDataIO::METADATA_CODEC_ATTR]
          : AvroDataIO::NULL_CODEC;
        $this->metadata[AvroDataIO::METADATA_CODEC_ATTR] = $this->codec
          = $codec_from_file;

        $schema_from_file = $dfr->metadata[AvroDataIO::METADATA_SCHEMA_ATTR];
        $this->metadata[AvroDataIO::METADATA_SCHEMA_ATTR] = $schema_from_file;
        $this->datum_writer->writers_schema = AvroSchema::parse($schema_from_file);
        $this->seek(0, SEEK_END);
      }
    }

    /**
     * Encodes $datum into the buffer and writes out a block once the
     * buffer length reaches AvroDataIO::SYNC_INTERVAL.
     * @param mixed $datum
     */
    public function append($datum)
    {
      $this->datum_writer->write($datum, $this->buffer_encoder);
      $this->block_count++;

      if ($this->buffer->length() >= AvroDataIO::SYNC_INTERVAL)
        $this->write_block();
    }

    /**
     * Flushes buffer to AvroIO object container and closes it.
     * @return mixed value of $io->close()
     * @see AvroIO::close()
     */
    public function close()
    {
      $this->flush();
      return $this->io->close();
    }

    /**
     * Flushes buffer to AvroIO object container.
     * @returns mixed value of $io->flush()
     * @see AvroIO::flush()
     */
    private function flush()
    {
      $this->write_block();
      return $this->io->flush();
    }

    /**
     * Writes a block of data to the AvroIO object container: item count,
     * byte length, the (possibly deflated) buffer contents and the sync
     * marker. Does nothing when no items are buffered.
     * @throws AvroDataIOException if deflate compression of the block fails
     */
    private function write_block()
    {
      if ($this->block_count > 0)
      {
        $this->encoder->write_long($this->block_count);
        $to_write = strval($this->buffer);

        if ($this->codec == AvroDataIO::DEFLATE_CODEC)
        {
          $to_write = gzdeflate($to_write);
          // gzdeflate() returns false on failure; writing that out would
          // silently produce an empty, corrupt block.
          if (false === $to_write)
            throw new AvroDataIOException(
              'Unable to deflate-compress Avro data block');
        }

        $this->encoder->write_long(strlen($to_write));
        $this->write($to_write);
        $this->write($this->sync_marker);
        $this->buffer->truncate();
        $this->block_count = 0;
      }
    }

    /**
     * Writes the header of the AvroIO object container: magic bytes,
     * metadata map and sync marker.
     */
    private function write_header()
    {
      $this->write(AvroDataIO::magic());
      $this->datum_writer->write_data(AvroDataIO::metadata_schema(),
                                      $this->metadata, $this->encoder);
      $this->write($this->sync_marker);
    }

    /**
     * @param string $bytes
     * @uses AvroIO::write()
     */
    private function write($bytes) { return $this->io->write($bytes); }

    /**
     * @param int $offset
     * @param int $whence
     * @uses AvroIO::seek()
     */
    private function seek($offset, $whence)
    {
      return $this->io->seek($offset, $whence);
    }
  }
  479. }