PageRenderTime 39ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/phase3/includes/media/JpegMetadataExtractor.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 241 lines | 137 code | 25 blank | 79 comment | 34 complexity | 6a13dfd44946d02a67915b7d27b10135 MD5 | raw file
  1. <?php
  2. /**
  3. * Class for reading jpegs and extracting metadata.
  4. * see also BitmapMetadataHandler.
  5. *
  6. * Based somewhat on GIFMetadataExtractor.
  7. */
  8. class JpegMetadataExtractor {
  9. const MAX_JPEG_SEGMENTS = 200;
  10. // the max segment is a sanity check.
  11. // A jpeg file should never even remotely have
  12. // that many segments. Your average file has about 10.
  13. /** Function to extract metadata segments of interest from jpeg files
  14. * based on GIFMetadataExtractor.
  15. *
  16. * we can almost use getimagesize to do this
  17. * but gis doesn't support having multiple app1 segments
  18. * and those can't extract xmp on files containing both exif and xmp data
  19. *
  20. * @param String $filename name of jpeg file
  21. * @return Array of interesting segments.
  22. * @throws MWException if given invalid file.
  23. */
  24. static function segmentSplitter ( $filename ) {
  25. $showXMP = function_exists( 'xml_parser_create_ns' );
  26. $segmentCount = 0;
  27. $segments = array(
  28. 'XMP_ext' => array(),
  29. 'COM' => array(),
  30. );
  31. if ( !$filename ) {
  32. throw new MWException( "No filename specified for " . __METHOD__ );
  33. }
  34. if ( !file_exists( $filename ) || is_dir( $filename ) ) {
  35. throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
  36. }
  37. $fh = fopen( $filename, "rb" );
  38. if ( !$fh ) {
  39. throw new MWException( "Could not open file $filename" );
  40. }
  41. $buffer = fread( $fh, 2 );
  42. if ( $buffer !== "\xFF\xD8" ) {
  43. throw new MWException( "Not a jpeg, no SOI" );
  44. }
  45. while ( !feof( $fh ) ) {
  46. $buffer = fread( $fh, 1 );
  47. $segmentCount++;
  48. if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
  49. // this is just a sanity check
  50. throw new MWException( 'Too many jpeg segments. Aborting' );
  51. }
  52. if ( $buffer !== "\xFF" ) {
  53. throw new MWException( "Error reading jpeg file marker" );
  54. }
  55. $buffer = fread( $fh, 1 );
  56. if ( $buffer === "\xFE" ) {
  57. // COM section -- file comment
  58. // First see if valid utf-8,
  59. // if not try to convert it to windows-1252.
  60. $com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
  61. UtfNormal::quickIsNFCVerify( $com );
  62. // turns $com to valid utf-8.
  63. // thus if no change, its utf-8, otherwise its something else.
  64. if ( $com !== $oldCom ) {
  65. wfSuppressWarnings();
  66. $com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
  67. wfRestoreWarnings();
  68. }
  69. // Try it again, if its still not a valid string, then probably
  70. // binary junk or some really weird encoding, so don't extract.
  71. UtfNormal::quickIsNFCVerify( $com );
  72. if ( $com === $oldCom ) {
  73. $segments["COM"][] = $oldCom;
  74. } else {
  75. wfDebug( __METHOD__ . ' Ignoring JPEG comment as is garbage.' );
  76. }
  77. } elseif ( $buffer === "\xE1" ) {
  78. // APP1 section (Exif, XMP, and XMP extended)
  79. // only extract if XMP is enabled.
  80. $temp = self::jpegExtractMarker( $fh );
  81. // check what type of app segment this is.
  82. if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
  83. $segments["XMP"] = substr( $temp, 29 );
  84. } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
  85. $segments["XMP_ext"][] = substr( $temp, 35 );
  86. } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
  87. // Some images (especially flickr images) seem to have this.
  88. // I really have no idea what the deal is with them, but
  89. // whatever...
  90. $segments["XMP"] = substr( $temp, 29 );
  91. wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
  92. . "Using anyways.\n" );
  93. } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
  94. // Just need to find out what the byte order is.
  95. // because php's exif plugin sucks...
  96. // This is a II for little Endian, MM for big. Not a unicode BOM.
  97. $byteOrderMarker = substr( $temp, 6, 2 );
  98. if ( $byteOrderMarker === 'MM' ) {
  99. $segments['byteOrder'] = 'BE';
  100. } elseif ( $byteOrderMarker === 'II' ) {
  101. $segments['byteOrder'] = 'LE';
  102. } else {
  103. wfDebug( __METHOD__ . ' Invalid byte ordering?!' );
  104. }
  105. }
  106. } elseif ( $buffer === "\xED" ) {
  107. // APP13 - PSIR. IPTC and some photoshop stuff
  108. $temp = self::jpegExtractMarker( $fh );
  109. if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
  110. $segments["PSIR"] = $temp;
  111. }
  112. } elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
  113. // EOI - end of image or SOS - start of scan. either way we're past any interesting segments
  114. return $segments;
  115. } else {
  116. // segment we don't care about, so skip
  117. $size = unpack( "nint", fread( $fh, 2 ) );
  118. if ( $size['int'] <= 2 ) throw new MWException( "invalid marker size in jpeg" );
  119. fseek( $fh, $size['int'] - 2, SEEK_CUR );
  120. }
  121. }
  122. // shouldn't get here.
  123. throw new MWException( "Reached end of jpeg file unexpectedly" );
  124. }
  125. /**
  126. * Helper function for jpegSegmentSplitter
  127. * @param &$fh FileHandle for jpeg file
  128. * @return data content of segment.
  129. */
  130. private static function jpegExtractMarker( &$fh ) {
  131. $size = unpack( "nint", fread( $fh, 2 ) );
  132. if ( $size['int'] <= 2 ) throw new MWException( "invalid marker size in jpeg" );
  133. return fread( $fh, $size['int'] - 2 );
  134. }
  135. /**
  136. * This reads the photoshop image resource.
  137. * Currently it only compares the iptc/iim hash
  138. * with the stored hash, which is used to determine the precedence
  139. * of the iptc data. In future it may extract some other info, like
  140. * url of copyright license.
  141. *
  142. * This should generally be called by BitmapMetadataHandler::doApp13()
  143. *
  144. * @param String $app13 photoshop psir app13 block from jpg.
  145. * @return String if the iptc hash is good or not.
  146. */
  147. public static function doPSIR ( $app13 ) {
  148. if ( !$app13 ) {
  149. return;
  150. }
  151. // First compare hash with real thing
  152. // 0x404 contains IPTC, 0x425 has hash
  153. // This is used to determine if the iptc is newer than
  154. // the xmp data, as xmp programs update the hash,
  155. // where non-xmp programs don't.
  156. $offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
  157. $appLen = strlen( $app13 );
  158. $realHash = "";
  159. $recordedHash = "";
  160. // the +12 is the length of an empty item.
  161. while ( $offset + 12 <= $appLen ) {
  162. $valid = true;
  163. if ( substr( $app13, $offset, 4 ) !== '8BIM' ) {
  164. // its supposed to be 8BIM
  165. // but apparently sometimes isn't esp. in
  166. // really old jpg's
  167. $valid = false;
  168. }
  169. $offset += 4;
  170. $id = substr( $app13, $offset, 2 );
  171. // id is a 2 byte id number which identifies
  172. // the piece of info this record contains.
  173. $offset += 2;
  174. // some record types can contain a name, which
  175. // is a pascal string 0-padded to be an even
  176. // number of bytes. Most times (and any time
  177. // we care) this is empty, making it two null bytes.
  178. $lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
  179. // we never use the name so skip it. +1 for length byte
  180. if ( $lenName % 2 == 1 ) {
  181. $lenName++;
  182. } // pad to even.
  183. $offset += $lenName;
  184. // now length of data (unsigned long big endian)
  185. $lenData = unpack( 'Nlen', substr( $app13, $offset, 4 ) );
  186. $offset += 4; // 4bytes length field;
  187. // this should not happen, but check.
  188. if ( $lenData['len'] + $offset > $appLen ) {
  189. wfDebug( __METHOD__ . " PSIR data too long.\n" );
  190. return 'iptc-no-hash';
  191. }
  192. if ( $valid ) {
  193. switch ( $id ) {
  194. case "\x04\x04":
  195. // IPTC block
  196. $realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
  197. break;
  198. case "\x04\x25":
  199. $recordedHash = substr( $app13, $offset, $lenData['len'] );
  200. break;
  201. }
  202. }
  203. // if odd, add 1 to length to account for
  204. // null pad byte.
  205. if ( $lenData['len'] % 2 == 1 ) $lenData['len']++;
  206. $offset += $lenData['len'];
  207. }
  208. if ( !$realHash || !$recordedHash ) {
  209. return 'iptc-no-hash';
  210. } elseif ( $realHash === $recordedHash ) {
  211. return 'iptc-good-hash';
  212. } else { /*$realHash !== $recordedHash */
  213. return 'iptc-bad-hash';
  214. }
  215. }
  216. }