/PRILOHY/PRILOHY_DO_REPOZITARE/WARC-TOOLS/warc-tools-read-only/app/wrecordbody.c

http://github.com/MartinProkop/MojeWebarchivBakalarskaPrace · C · 255 lines · 167 code · 62 blank · 26 comment · 38 complexity · 07991b2f3b40a7deba0eae60b7a85078 MD5 · raw file

  1. /* ------------------------------------------------------------------- */
  2. /* Copyright (c) 2007-2008 Hanzo Archives Limited. */
  3. /* */
  4. /* Licensed under the Apache License, Version 2.0 (the "License"); */
  5. /* you may not use this file except in compliance with the License. */
  6. /* You may obtain a copy of the License at */
  7. /* */
  8. /* http://www.apache.org/licenses/LICENSE-2.0 */
  9. /* */
  10. /* Unless required by applicable law or agreed to in writing, software */
  11. /* distributed under the License is distributed on an "AS IS" BASIS, */
  12. /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
  13. /* implied. */
  14. /* See the License for the specific language governing permissions and */
  15. /* limitations under the License. */
  16. /* */
  17. /* You may find more information about Hanzo Archives at */
  18. /* */
  19. /* http://www.hanzoarchives.com/ */
  20. /* */
  21. /* You may find more information about the WARC Tools project at */
  22. /* */
  23. /* http://code.google.com/p/warc-tools/ */
  24. /* ------------------------------------------------------------------- */
  25. #include <stdio.h>
  26. #include <string.h>
  27. #include <warc.h>
  28. #include <wmktmp.h>
  /*
   * Build-time tunables; both may be overridden from the compiler command
   * line (e.g. -DWARC_CHUNK_SIZE=4096) because they are guarded by #ifndef.
   */
  #ifndef WARC_MAX_SIZE
  /* 16 GiB by default (16 * 2^30 bytes); handed to the WFile constructor in main */
  #define WARC_MAX_SIZE 17179869184ULL
  #endif
  #ifndef WARC_CHUNK_SIZE
  /* number of bytes copied per read/write iteration when dumping the block */
  #define WARC_CHUNK_SIZE 512
  #endif
  /* view any string pointer as a pointer to warc_u8_t */
  #define uS(s) ((warc_u8_t *) (s))
  /*
   * Expands to TWO comma-separated arguments: the string and its length.
   * Note: `s` is evaluated twice, so it must not have side effects.
   */
  #define makeS(s) uS(s), w_strlen (uS(s))
  38. void printOrdinaryDate (warc_u8_t * field)
  39. {
  40. warc_u32_t i = 0;
  41. warc_u32_t size = w_strlen (field);
  42. fprintf (stderr, "\t");
  43. while (i < size)
  44. {
  45. if (field[i] != ':' && field[i] != '-' && field[i] != 'Z' && field[i] != 'T' && field[i] != 't' && field[i] != 'z')
  46. fprintf (stderr, "%c", field [i]);
  47. i++;
  48. }
  49. }
  50. int main (int argc, const char ** argv)
  51. {
  52. warc_u8_t * off = NIL;
  53. warc_u8_t * fname = NIL;
  54. warc_u8_t * field = NIL;
  55. warc_u8_t * wdir = uS(".");
  56. warc_u64_t offset = 0;
  57. warc_u8_t * flags = uS ("f:o:t:e");
  58. warc_u8_t buffer [WARC_CHUNK_SIZE+1];
  59. warc_u8_t http_code [4];
  60. warc_i32_t c;
  61. wfile_comp_t cmode = WARC_FILE_DETECT_COMPRESSION;
  62. warc_bool_t with_http = WARC_FALSE;
  63. warc_u32_t rsize = 0;
  64. warc_bool_t stop = WARC_FALSE;
  65. void * r = NIL;
  66. void * w = NIL;
  67. void * p = NIL;
  68. void * tfile = NIL;
  69. FILE * otfile = NIL;
  70. if (argc < 3 || argc > 7)
  71. {
  72. fprintf (stderr, "Extract WARC's content block only\n");
  73. fprintf (stderr, "Usage: %s -f <file.warc> <-o offset> [-e] [-t <working_dir>]\n", argv [0]);
  74. fprintf (stderr, "\t-f : valid WARC file name\n");
  75. fprintf (stderr, "\t-o : the offset of the record\n");
  76. fprintf (stderr, "\t[-t] : temporary working directory (default \".\")\n");
  77. fprintf (stderr, "\t[-e] : whether the payload content is after the HTTP response (default \"no\")\n");
  78. return (2);
  79. }
  80. p = bless (WGetOpt, makeS (flags) );
  81. assert (p);
  82. /* parse command line parameters */
  83. while ( (c = WGetOpt_parse (p, argc, argv) ) != -1)
  84. {
  85. switch (c)
  86. {
  87. case 'f' :
  88. if (w_index (flags, c) [1] == ':')
  89. fname = uS(WGetOpt_argument (p));
  90. break;
  91. case 'o' :
  92. if (w_index (flags, c) [1] == ':')
  93. off = (warc_u8_t *) WGetOpt_argument (p);
  94. if (w_atou (off, w_strlen(off), & offset))
  95. {
  96. fprintf (stderr, "invalid offset number: %s\n", off);
  97. destroy (p);
  98. return (3);
  99. }
  100. break;
  101. case 't' :
  102. if (w_index (flags, c) [1] == ':')
  103. wdir = uS(WGetOpt_argument (p));
  104. break;
  105. case 'e' :
  106. with_http = WARC_TRUE;
  107. break;
  108. case '?' : /* illegal option or missing argument */
  109. destroy (p);
  110. return (1);
  111. }
  112. }
  113. unless (fname)
  114. {
  115. fprintf (stderr, "missing WARC file name. Use -f option\n");
  116. return (4);
  117. }
  118. w = bless (WFile, fname, WARC_MAX_SIZE,
  119. WARC_FILE_READER, cmode, wdir);
  120. unless (w)
  121. {
  122. fprintf (stderr, "Could not open WARC file %s\n", fname);
  123. return (5);
  124. }
  125. destroy (p);
  126. if (WFile_seek (w, offset))
  127. {
  128. fprintf (stderr, "Could not reach the offset %llu\n", (unsigned long long) offset);
  129. destroy (w);
  130. return (9);
  131. }
  132. unless (WFile_hasMoreRecords (w))
  133. {
  134. fprintf (stderr, "End of file reached\n");
  135. destroy (w);
  136. return (6);
  137. }
  138. r = WFile_nextRecord (w);
  139. unless (r)
  140. {
  141. fprintf (stderr, "No valid record at this offset \n");
  142. destroy (w);
  143. return (7);
  144. }
  145. field = uS(WRecord_getTargetUri (r));
  146. unless (field)
  147. field = uS("unknown");
  148. else unless (w_strcmp (field, uS("")))
  149. field = uS("unknown");
  150. fprintf (stderr, "\"%s\"", field);
  151. field = NIL;
  152. field = uS(WRecord_getDate (r));
  153. printOrdinaryDate (field);
  154. field = NIL;
  155. field = uS(WRecord_getPayloadType (r));
  156. unless (field || (field != NIL && !w_strcmp (field, uS(""))))
  157. {
  158. field = uS(WRecord_getContentType (r));
  159. unless (field || (field != NIL && !w_strcmp (field, uS(""))))
  160. field = uS("unknown");
  161. }
  162. fprintf (stderr, "\t%s", field);
  163. field = NIL;
  164. otfile = WRecord_getBloc (r, w, with_http, http_code);
  165. unless (otfile)
  166. {
  167. fprintf (stderr, "A problem appeared while reading data \n");
  168. destroy (r);
  169. destroy (w);
  170. return (8);
  171. }
  172. fprintf (stderr, "\t%s", http_code);
  173. fprintf (stderr, "\t%d\n", (const warc_u32_t) (WRecord_getCompressedSize (r) + offset));
  174. tfile = WTempFile_handle (otfile);
  175. w_fseek_start (tfile);
  176. while (! stop)
  177. {
  178. rsize = w_fread (buffer, 1, WARC_CHUNK_SIZE, tfile);
  179. if (fwrite( buffer, 1, rsize, stdout) != rsize || ferror(stdout))
  180. {
  181. destroy (r);
  182. destroy (w);
  183. destroy (otfile);
  184. return (10);
  185. }
  186. if (rsize < WARC_CHUNK_SIZE)
  187. stop = WARC_TRUE;
  188. }
  189. destroy (w);
  190. destroy (otfile);
  191. destroy (r);
  192. return (0);
  193. }