/usr.bin/gzip/unpack.c

https://bitbucket.org/freebsd/freebsd-head/ · C · 329 lines · 153 code · 42 blank · 134 comment · 44 complexity · c50c1aea90b7d7d2346c9055f9c3aaee MD5 · raw file

  1. /*-
  2. * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24. * SUCH DAMAGE.
  25. *
  26. * $FreeBSD$
  27. */
  28. /* This file is #included by gzip.c */
  29. /*
  30. * pack(1) file format:
  31. *
  32. * The first 7 bytes are the header:
  33. * 00, 01 - Signature (US, RS), we already validated it earlier.
  34. * 02..05 - Uncompressed size
  35. * 06 - Level for the huffman tree (<=24)
  36. *
  37. * pack(1) will then store symbols (leaf) nodes count in each huffman
  38. * tree levels, each level would consume 1 byte (See [1]).
  39. *
  40. * After the symbol count table, there is the symbol table, storing
  41. * symbols represented by corresponding leaf node. EOB is not being
  42. * explicitly transmitted (not necessary anyway) in the symbol table.
  43. *
  44. * Compressed data goes after the symbol table.
  45. *
  46. * NOTES
  47. *
  48. * [1] If we count EOB into the symbols, that would mean that we will
  49. * have at most 256 symbols in the huffman tree. pack(1) rejects empty
  50. * files and files that just repeat one character, which means that we
  51. * will have at least 2 symbols. Therefore, pack(1) would reduce the
  52. * last level symbol count by 2 which makes it a number in
  53. * range [0..254], so all levels' symbol count would fit into 1 byte.
  54. */
  55. #define PACK_HEADER_LENGTH 7 /* Size in bytes of the fixed pack(1) header: signature, uncompressed size, tree level count */
  56. #define HTREE_MAXLEVEL 24 /* Largest huffman tree level count the header parser accepts */
  57. /*
  58. * unpack descriptor
  59. *
  60. * Represent the huffman tree in a similar way that pack(1) would
  61. * store in a packed file. We store all symbols in a linear table,
  62. * and store pointers to each level's first symbol. In addition to
  63. * that, maintain two counts for each level: inner nodes count and
  64. * leaf nodes count.
  65. */
  66. typedef struct {
  67. int symbol_size; /* Size of the symbol table */
  68. int treelevels; /* Levels for the huffman tree */
  69. int *symbolsin; /* Table of leaf symbols count in
  70. each level */
  71. int *inodesin; /* Table of internal nodes count in
  72. each level */
  73. char *symbol; /* The symbol table */
  74. char *symbol_eob; /* Pointer to the EOB symbol */
  75. char **tree; /* Decoding huffman tree (pointers to
  76. first symbol of each tree level */
  77. off_t uncompressed_size; /* Uncompressed size */
  78. FILE *fpIn; /* Input stream */
  79. FILE *fpOut; /* Output stream */
  80. } unpack_descriptor_t;
  81. /*
  82. * Release resource allocated to an unpack descriptor.
  83. *
  84. * Caller is responsible to make sure that all of these pointers are
  85. * initialized (in our case, they all point to valid memory block).
  86. * We don't zero out pointers here because nobody else would ever
  87. * reference the memory block without scrubbing them.
  88. */
  89. static void
  90. unpack_descriptor_fini(unpack_descriptor_t *unpackd)
  91. {
  92. free(unpackd->symbolsin);
  93. free(unpackd->inodesin);
  94. free(unpackd->symbol);
  95. free(unpackd->tree);
  96. fclose(unpackd->fpIn);
  97. fclose(unpackd->fpOut);
  98. }
  99. /*
  100. * Recursively fill the internal node count table
  101. */
  102. static void
  103. unpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level)
  104. {
  105. /*
  106. * The internal nodes would be 1/2 of total internal nodes and
  107. * leaf nodes in the next level. For the last level there
  108. * would be no internal node by definition.
  109. */
  110. if (level < unpackd->treelevels) {
  111. unpackd_fill_inodesin(unpackd, level + 1);
  112. unpackd->inodesin[level] = (unpackd->inodesin[level + 1] +
  113. unpackd->symbolsin[level + 1]) / 2;
  114. } else
  115. unpackd->inodesin[level] = 0;
  116. }
  117. /*
  118. * Update counter for accepted bytes
  119. */
  120. static void
  121. accepted_bytes(off_t *bytes_in, off_t newbytes)
  122. {
  123. if (bytes_in != NULL)
  124. (*bytes_in) += newbytes;
  125. }
  126. /*
  127. * Read file header and construct the tree. Also, prepare the buffered I/O
  128. * for decode routine.
  129. *
  130. * Return value is uncompressed size.
  131. */
  132. static void
  133. unpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in,
  134. unpack_descriptor_t *unpackd)
  135. {
  136. unsigned char hdr[PACK_HEADER_LENGTH]; /* buffer for header */
  137. ssize_t bytesread; /* Bytes read from the file */
  138. int i, j, thisbyte;
  139. /* Prepend the header buffer if we already read some data */
  140. if (prelen != 0)
  141. memcpy(hdr, pre, prelen);
  142. /* Read in and fill the rest bytes of header */
  143. bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen);
  144. if (bytesread < 0)
  145. maybe_err("Error reading pack header");
  146. accepted_bytes(bytes_in, PACK_HEADER_LENGTH);
  147. /* Obtain uncompressed length (bytes 2,3,4,5)*/
  148. unpackd->uncompressed_size = 0;
  149. for (i = 2; i <= 5; i++) {
  150. unpackd->uncompressed_size <<= 8;
  151. unpackd->uncompressed_size |= hdr[i];
  152. }
  153. /* Get the levels of the tree */
  154. unpackd->treelevels = hdr[6];
  155. if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1)
  156. maybe_errx("Huffman tree has insane levels");
  157. /* Let libc take care for buffering from now on */
  158. if ((unpackd->fpIn = fdopen(in, "r")) == NULL)
  159. maybe_err("Can not fdopen() input stream");
  160. if ((unpackd->fpOut = fdopen(out, "w")) == NULL)
  161. maybe_err("Can not fdopen() output stream");
  162. /* Allocate for the tables of bounds and the tree itself */
  163. unpackd->inodesin =
  164. calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin)));
  165. unpackd->symbolsin =
  166. calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin)));
  167. unpackd->tree =
  168. calloc(unpackd->treelevels, (sizeof (*(unpackd->tree))));
  169. if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL ||
  170. unpackd->tree == NULL)
  171. maybe_err("calloc");
  172. /* We count from 0 so adjust to match array upper bound */
  173. unpackd->treelevels--;
  174. /* Read the levels symbol count table and calculate total */
  175. unpackd->symbol_size = 1; /* EOB */
  176. for (i = 0; i <= unpackd->treelevels; i++) {
  177. if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
  178. maybe_err("File appears to be truncated");
  179. unpackd->symbolsin[i] = (unsigned char)thisbyte;
  180. unpackd->symbol_size += unpackd->symbolsin[i];
  181. }
  182. accepted_bytes(bytes_in, unpackd->treelevels);
  183. if (unpackd->symbol_size > 256)
  184. maybe_errx("Bad symbol table");
  185. /* Allocate for the symbol table, point symbol_eob at the beginning */
  186. unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size);
  187. if (unpackd->symbol == NULL)
  188. maybe_err("calloc");
  189. /*
  190. * Read in the symbol table, which contain [2, 256] symbols.
  191. * In order to fit the count in one byte, pack(1) would offset
  192. * it by reducing 2 from the actual number from the last level.
  193. *
  194. * We adjust the last level's symbol count by 1 here, because
  195. * the EOB symbol is not being transmitted explicitly. Another
  196. * adjustment would be done later afterward.
  197. */
  198. unpackd->symbolsin[unpackd->treelevels]++;
  199. for (i = 0; i <= unpackd->treelevels; i++) {
  200. unpackd->tree[i] = unpackd->symbol_eob;
  201. for (j = 0; j < unpackd->symbolsin[i]; j++) {
  202. if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
  203. maybe_errx("Symbol table truncated");
  204. *unpackd->symbol_eob++ = (char)thisbyte;
  205. }
  206. accepted_bytes(bytes_in, unpackd->symbolsin[i]);
  207. }
  208. /* Now, take account for the EOB symbol as well */
  209. unpackd->symbolsin[unpackd->treelevels]++;
  210. /*
  211. * The symbolsin table has been constructed now.
  212. * Calculate the internal nodes count table based on it.
  213. */
  214. unpackd_fill_inodesin(unpackd, 0);
  215. }
  216. /*
  217. * Decode huffman stream, based on the huffman tree.
  218. */
  219. static void
  220. unpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in)
  221. {
  222. int thislevel, thiscode, thisbyte, inlevelindex;
  223. int i;
  224. off_t bytes_out = 0;
  225. const char *thissymbol; /* The symbol pointer decoded from stream */
  226. /*
  227. * Decode huffman. Fetch every bytes from the file, get it
  228. * into 'thiscode' bit-by-bit, then output the symbol we got
  229. * when one has been found.
  230. *
  231. * Assumption: sizeof(int) > ((max tree levels + 1) / 8).
  232. * bad things could happen if not.
  233. */
  234. thislevel = 0;
  235. thiscode = thisbyte = 0;
  236. while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) {
  237. accepted_bytes(bytes_in, 1);
  238. /*
  239. * Split one bit from thisbyte, from highest to lowest,
  240. * feed the bit into thiscode, until we got a symbol from
  241. * the tree.
  242. */
  243. for (i = 7; i >= 0; i--) {
  244. thiscode = (thiscode << 1) | ((thisbyte >> i) & 1);
  245. /* Did we got a symbol? (referencing leaf node) */
  246. if (thiscode >= unpackd->inodesin[thislevel]) {
  247. inlevelindex =
  248. thiscode - unpackd->inodesin[thislevel];
  249. if (inlevelindex > unpackd->symbolsin[thislevel])
  250. maybe_errx("File corrupt");
  251. thissymbol =
  252. &(unpackd->tree[thislevel][inlevelindex]);
  253. if ((thissymbol == unpackd->symbol_eob) &&
  254. (bytes_out == unpackd->uncompressed_size))
  255. goto finished;
  256. fputc((*thissymbol), unpackd->fpOut);
  257. bytes_out++;
  258. /* Prepare for next input */
  259. thislevel = 0; thiscode = 0;
  260. } else {
  261. thislevel++;
  262. if (thislevel > unpackd->treelevels)
  263. maybe_errx("File corrupt");
  264. }
  265. }
  266. }
  267. finished:
  268. if (bytes_out != unpackd->uncompressed_size)
  269. maybe_errx("Premature EOF");
  270. }
  271. /* Handler for pack(1)'ed file */
  272. static off_t
  273. unpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in)
  274. {
  275. unpack_descriptor_t unpackd;
  276. in = dup(in);
  277. if (in == -1)
  278. maybe_err("dup");
  279. out = dup(out);
  280. if (out == -1)
  281. maybe_err("dup");
  282. unpack_parse_header(in, out, pre, prelen, bytes_in, &unpackd);
  283. unpack_decode(&unpackd, bytes_in);
  284. unpack_descriptor_fini(&unpackd);
  285. /* If we reached here, the unpack was successful */
  286. return (unpackd.uncompressed_size);
  287. }