PageRenderTime 118ms CodeModel.GetById 17ms app.highlight 84ms RepoModel.GetById 1ms app.codeStats 1ms

/xbmc/visualizations/XBMCProjectM/libprojectM/stb_image_aug.c

http://github.com/xbmc/xbmc
C | 3163 lines | 2541 code | 280 blank | 342 comment | 812 complexity | 0a67e6320b93c9e60a9dfaff421fd43a MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/* stbi-1.03 - public domain JPEG/PNG reader - http://nothings.org/stb_image.c
   2                      when you control the images you're loading
   3
   4   QUICK NOTES:
   5      Primarily of interest to game developers and other people who can
   6          avoid problematic images and only need the trivial interface
   7
   8      JPEG baseline (no JPEG progressive, no oddball channel decimations)
   9      PNG non-interlaced
  10      BMP non-1bpp, non-RLE
  11      TGA (not sure what subset, if a subset)
  12      HDR (radiance rgbE format)
  13      writes BMP,TGA (define STBI_NO_WRITE to remove code)
  14      decoded from memory or through stdio FILE (define STBI_NO_STDIO to remove code)
  15
  16   TODO:
  17      stbi_info_*
  18      PSD loader
  19
  20   history:
  21      1.03   bugfixes to STBI_NO_STDIO, STBI_NO_HDR
  22      1.02   support for (subset of) HDR files, float interface for preferred access to them
  23      1.01   fix bug: possible bug in handling right-side up bmps... not sure
  24             fix bug: the stbi_bmp_load() and stbi_tga_load() functions didn't work at all
  25      1.00   interface to zlib that skips zlib header
  26      0.99   correct handling of alpha in palette
  27      0.98   TGA loader by lonesock; dynamically add loaders (untested)
  28      0.97   jpeg errors on too large a file; also catch another malloc failure
  29      0.96   fix detection of invalid v value - particleman@mollyrocket forum
  30      0.95   during header scan, seek to markers in case of padding
  31      0.94   STBI_NO_STDIO to disable stdio usage; rename all #defines the same
  32      0.93   handle jpegtran output; verbose errors
  33      0.92   read 4,8,16,24,32-bit BMP files of several formats
  34      0.91   output 24-bit Windows 3.0 BMP files
  35      0.90   fix a few more warnings; bump version number to approach 1.0
  36      0.61   bugfixes due to Marc LeBlanc, Christopher Lloyd
  37      0.60   fix compiling as c++
  38      0.59   fix warnings: merge Dave Moore's -Wall fixes
  39      0.58   fix bug: zlib uncompressed mode len/nlen was wrong endian
  40      0.57   fix bug: jpg last huffman symbol before marker was >9 bits but less
  41                      than 16 available
  42      0.56   fix bug: zlib uncompressed mode len vs. nlen
  43      0.55   fix bug: restart_interval not initialized to 0
  44      0.54   allow NULL for 'int *comp'
  45      0.53   fix bug in png 3->4; speedup png decoding
  46      0.52   png handles req_comp=3,4 directly; minor cleanup; jpeg comments
  47      0.51   obey req_comp requests, 1-component jpegs return as 1-component,
  48             on 'test' only check type, not whether we support this variant
  49*/
  50
  51#include "stb_image_aug.h"
  52
  53#ifndef STBI_NO_STDIO
  54#include <stdio.h>
  55#endif
  56#include <stdlib.h>
  57#include <memory.h>
  58#include <assert.h>
  59#include <stdarg.h>
  60
  61#ifndef _MSC_VER
  62#define __forceinline
  63#endif
  64
  65// implementation:
  66typedef unsigned char uint8;
  67typedef unsigned short uint16;
  68typedef   signed short  int16;
  69typedef unsigned int   uint32;
  70typedef   signed int    int32;
  71typedef unsigned int   uint;
  72
  73// should produce compiler error if size is wrong
  74typedef unsigned char validate_uint32[sizeof(uint32)==4];
  75
  76#if defined(STBI_NO_STDIO) && !defined(STBI_NO_WRITE)
  77#define STBI_NO_WRITE
  78#endif
  79
  80#ifndef STBI_NO_DDS
  81#include "stbi_DDS_aug.h"
  82#endif
  83
  84//	I (JLD) want full messages for SOIL
  85#define STBI_FAILURE_USERMSG 1
  86
  87//////////////////////////////////////////////////////////////////////////////
  88//
  89// Generic API that works on all image types
  90//
  91
  92static char *failure_reason;
  93
  94char *stbi_failure_reason(void)
  95{
  96   return failure_reason;
  97}
  98
  99static int e(char *str)
 100{
 101   failure_reason = str;
 102   return 0;
 103}
 104
// e(x,y) takes two messages, x and y, and selects which one (if any) is
// recorded via the single-argument e() function:
//   STBI_NO_FAILURE_STRINGS  -> nothing is recorded, the expression is just 0
//   STBI_FAILURE_USERMSG     -> the second message (y) is recorded
//   otherwise                -> the first message (x) is recorded
#ifdef STBI_NO_FAILURE_STRINGS
   #define e(x,y)  0
#elif defined(STBI_FAILURE_USERMSG)
   #define e(x,y)  e(y)
#else
   #define e(x,y)  e(x)
#endif

// ep(x,y) records the failure like e(x,y) but always evaluates to NULL, so
// pointer-returning functions can write `return ep(...)`.
#define ep(x,y)   (e(x,y)?NULL:NULL)
 114
 115void stbi_image_free(unsigned char *retval_from_stbi_load)
 116{
 117   free(retval_from_stbi_load);
 118}
 119
 120#define MAX_LOADERS  32
 121stbi_loader *loaders[MAX_LOADERS];
 122static int max_loaders = 0;
 123
 124int stbi_register_loader(stbi_loader *loader)
 125{
 126   int i;
 127   for (i=0; i < MAX_LOADERS; ++i) {
 128      // already present?
 129      if (loaders[i] == loader)
 130         return 1;
 131      // end of the list?
 132      if (loaders[i] == NULL) {
 133         loaders[i] = loader;
 134         max_loaders = i+1;
 135         return 1;
 136      }
 137   }
 138   // no room for it
 139   return 0;
 140}
 141
 142#ifndef STBI_NO_HDR
 143static float   *ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 144static stbi_uc *hdr_to_ldr(float   *data, int x, int y, int comp);
 145#endif
 146
 147#ifndef STBI_NO_STDIO
 148unsigned char *stbi_load(char *filename, int *x, int *y, int *comp, int req_comp)
 149{
 150   FILE *f = fopen(filename, "rb");
 151   unsigned char *result;
 152   if (!f) return ep("can't fopen", "Unable to open file");
 153   result = stbi_load_from_file(f,x,y,comp,req_comp);
 154   fclose(f);
 155   return result;
 156}
 157
 158unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 159{
 160   int i;
 161   if (stbi_jpeg_test_file(f))
 162      return stbi_jpeg_load_from_file(f,x,y,comp,req_comp);
 163   if (stbi_png_test_file(f))
 164      return stbi_png_load_from_file(f,x,y,comp,req_comp);
 165   if (stbi_bmp_test_file(f))
 166      return stbi_bmp_load_from_file(f,x,y,comp,req_comp);
 167   #ifndef STBI_NO_DDS
 168   if (stbi_dds_test_file(f))
 169      return stbi_dds_load_from_file(f,x,y,comp,req_comp);
 170   #endif
 171   #ifndef STBI_NO_HDR
 172   if (stbi_hdr_test_file(f)) {
 173      float *hdr = stbi_hdr_load_from_file(f, x,y,comp,req_comp);
 174      return hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
 175   }
 176   #endif
 177   for (i=0; i < max_loaders; ++i)
 178      if (loaders[i]->test_file(f))
 179         return loaders[i]->load_from_file(f,x,y,comp,req_comp);
 180   // test tga last because it's a crappy test!
 181   if (stbi_tga_test_file(f))
 182      return stbi_tga_load_from_file(f,x,y,comp,req_comp);
 183   return ep("unknown image type", "Image not of any known type, or corrupt");
 184}
 185#endif
 186
 187unsigned char *stbi_load_from_memory(stbi_uc *buffer, int len, int *x, int *y, int *comp, int req_comp)
 188{
 189   int i;
 190   if (stbi_jpeg_test_memory(buffer,len))
 191      return stbi_jpeg_load_from_memory(buffer,len,x,y,comp,req_comp);
 192   if (stbi_png_test_memory(buffer,len))
 193      return stbi_png_load_from_memory(buffer,len,x,y,comp,req_comp);
 194   if (stbi_bmp_test_memory(buffer,len))
 195      return stbi_bmp_load_from_memory(buffer,len,x,y,comp,req_comp);
 196   #ifndef STBI_NO_DDS
 197   if (stbi_dds_test_memory(buffer,len))
 198      return stbi_dds_load_from_memory(buffer,len,x,y,comp,req_comp);
 199   #endif
 200   #ifndef STBI_NO_HDR
 201   if (stbi_hdr_test_memory(buffer, len)) {
 202      float *hdr = stbi_hdr_load_from_memory(buffer, len,x,y,comp,req_comp);
 203      return hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
 204   }
 205   #endif
 206   for (i=0; i < max_loaders; ++i)
 207      if (loaders[i]->test_memory(buffer,len))
 208         return loaders[i]->load_from_memory(buffer,len,x,y,comp,req_comp);
 209   // test tga last because it's a crappy test!
 210   if (stbi_tga_test_memory(buffer,len))
 211      return stbi_tga_load_from_memory(buffer,len,x,y,comp,req_comp);
 212   return ep("unknown image type", "Image not of any known type, or corrupt");
 213}
 214
 215#ifndef STBI_NO_HDR
 216
 217#ifndef STBI_NO_STDIO
 218float *stbi_loadf(char *filename, int *x, int *y, int *comp, int req_comp)
 219{
 220   FILE *f = fopen(filename, "rb");
 221   float *result;
 222   if (!f) return ep("can't fopen", "Unable to open file");
 223   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
 224   fclose(f);
 225   return result;
 226}
 227
 228float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 229{
 230   unsigned char *data;
 231   #ifndef STBI_NO_HDR
 232   if (stbi_hdr_test_file(f))
 233      return stbi_hdr_load_from_file(f,x,y,comp,req_comp);
 234   #endif
 235   data = stbi_load_from_file(f, x, y, comp, req_comp);
 236   if (data)
 237      return ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
 238   return ep("unknown image type", "Image not of any known type, or corrupt");
 239}
 240#endif
 241
 242float *stbi_loadf_from_memory(stbi_uc *buffer, int len, int *x, int *y, int *comp, int req_comp)
 243{
 244   stbi_uc *data;
 245   #ifndef STBI_NO_HDR
 246   if (stbi_hdr_test_memory(buffer, len))
 247      return stbi_hdr_load_from_memory(buffer, len,x,y,comp,req_comp);
 248   #endif
 249   data = stbi_load_from_memory(buffer, len, x, y, comp, req_comp);
 250   if (data)
 251      return ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
 252   return ep("unknown image type", "Image not of any known type, or corrupt");
 253}
 254#endif
 255
// these is-hdr-or-not functions are defined regardless of whether
// STBI_NO_HDR is defined, for API simplicity; if STBI_NO_HDR is defined,
// they always report false!
 259
 260extern int      stbi_is_hdr_from_memory(stbi_uc *buffer, int len)
 261{
 262   #ifndef STBI_NO_HDR
 263   return stbi_hdr_test_memory(buffer, len);
 264   #else
 265   return 0;
 266   #endif
 267}
 268
 269#ifndef STBI_NO_STDIO
 270extern int      stbi_is_hdr          (char *filename)
 271{
 272   FILE *f = fopen(filename, "rb");
 273   int result=0;
 274   if (f) {
 275      result = stbi_is_hdr_from_file(f);
 276      fclose(f);
 277   }
 278   return result;
 279}
 280
 281extern int      stbi_is_hdr_from_file(FILE *f)
 282{
 283   #ifndef STBI_NO_HDR
 284   return stbi_hdr_test_file(f);
 285   #else
 286   return 0;
 287   #endif
 288}
 289
 290#endif
 291
 292// @TODO: get image dimensions & components without fully decoding
 293#ifndef STBI_NO_STDIO
 294extern int      stbi_info            (char *filename,           int *x, int *y, int *comp);
 295extern int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
 296#endif
 297extern int      stbi_info_from_memory(stbi_uc *buffer, int len, int *x, int *y, int *comp);
 298
 299#ifndef STBI_NO_HDR
 300static float h2l_gamma_i=1.0f/2.2f, h2l_scale_i=1.0f;
 301static float l2h_gamma=2.2f, l2h_scale=1.0f;
 302
 303void   stbi_hdr_to_ldr_gamma(float gamma) { h2l_gamma_i = 1/gamma; }
 304void   stbi_hdr_to_ldr_scale(float scale) { h2l_scale_i = 1/scale; }
 305
 306void   stbi_ldr_to_hdr_gamma(float gamma) { l2h_gamma = gamma; }
 307void   stbi_ldr_to_hdr_scale(float scale) { l2h_scale = scale; }
 308#endif
 309
 310
 311//////////////////////////////////////////////////////////////////////////////
 312//
 313// Common code used by all image loaders
 314//
 315
 316// image width, height, # components
 317static uint32 img_x, img_y;
 318static int img_n, img_out_n;
 319
 320enum
 321{
 322   SCAN_load=0,
 323   SCAN_type,
 324   SCAN_header,
 325};
 326
 327// An API for reading either from memory or file.
 328#ifndef STBI_NO_STDIO
 329static FILE  *img_file;
 330#endif
 331static uint8 *img_buffer, *img_buffer_end;
 332
 333#ifndef STBI_NO_STDIO
 334static void start_file(FILE *f)
 335{
 336   img_file = f;
 337}
 338#endif
 339
 340static void start_mem(uint8 *buffer, int len)
 341{
 342#ifndef STBI_NO_STDIO
 343   img_file = NULL;
 344#endif
 345   img_buffer = buffer;
 346   img_buffer_end = buffer+len;
 347}
 348
 349static int get8(void)
 350{
 351#ifndef STBI_NO_STDIO
 352   if (img_file) {
 353      int c = fgetc(img_file);
 354      return c == EOF ? 0 : c;
 355   }
 356#endif
 357   if (img_buffer < img_buffer_end)
 358      return *img_buffer++;
 359   return 0;
 360}
 361
 362static int at_eof(void)
 363{
 364#ifndef STBI_NO_STDIO
 365   if (img_file)
 366      return feof(img_file);
 367#endif
 368   return img_buffer >= img_buffer_end;
 369}
 370
 371static uint8 get8u(void)
 372{
 373   return (uint8) get8();
 374}
 375
 376static void skip(int n)
 377{
 378#ifndef STBI_NO_STDIO
 379   if (img_file)
 380      fseek(img_file, n, SEEK_CUR);
 381   else
 382#endif
 383      img_buffer += n;
 384}
 385
 386static int get16(void)
 387{
 388   int z = get8();
 389   return (z << 8) + get8();
 390}
 391
 392static uint32 get32(void)
 393{
 394   uint32 z = get16();
 395   return (z << 16) + get16();
 396}
 397
 398static int get16le(void)
 399{
 400   int z = get8();
 401   return z + (get8() << 8);
 402}
 403
 404static uint32 get32le(void)
 405{
 406   uint32 z = get16le();
 407   return z + (get16le() << 16);
 408}
 409
 410static void getn(stbi_uc *buffer, int n)
 411{
 412#ifndef STBI_NO_STDIO
 413   if (img_file) {
 414      fread(buffer, 1, n, img_file);
 415      return;
 416   }
 417#endif
 418   memcpy(buffer, img_buffer, n);
 419   img_buffer += n;
 420}
 421
 422//////////////////////////////////////////////////////////////////////////////
 423//
 424//  generic converter from built-in img_n to req_comp
 425//    individual types do this automatically as much as possible (e.g. jpeg
 426//    does all cases internally since it needs to colorspace convert anyway,
 427//    and it never has alpha, so very few cases ). png can automatically
 428//    interleave an alpha=255 channel, but falls back to this for other cases
 429//
 430//  assume data buffer is malloced, so malloc a new one and free that one
 431//  only failure mode is malloc failing
 432
 433static uint8 compute_y(int r, int g, int b)
 434{
 435   return (uint8) (((r*77) + (g*150) +  (29*b)) >> 8);
 436}
 437
 438static unsigned char *convert_format(unsigned char *data, int img_n, int req_comp)
 439{
 440   uint i,j;
 441   unsigned char *good;
 442
 443   if (req_comp == img_n) return data;
 444   assert(req_comp >= 1 && req_comp <= 4);
 445
 446   good = (unsigned char *) malloc(req_comp * img_x * img_y);
 447   if (good == NULL) {
 448      free(data);
 449      return ep("outofmem", "Out of memory");
 450   }
 451
 452   for (j=0; j < img_y; ++j) {
 453      unsigned char *src  = data + j * img_x * img_n   ;
 454      unsigned char *dest = good + j * img_x * req_comp;
 455
 456      #define COMBO(a,b)  ((a)*8+(b))
 457      #define CASE(a,b)   case COMBO(a,b): for(i=0; i < img_x; ++i, src += a, dest += b)
 458
 459      // convert source image with img_n components to one with req_comp components;
 460      // avoid switch per pixel, so use switch per scanline and massive macros
 461      switch(COMBO(img_n, req_comp)) {
 462         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
 463         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
 464         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
 465         CASE(2,1) dest[0]=src[0]; break;
 466         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
 467         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
 468         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
 469         CASE(3,1) dest[0]=compute_y(src[0],src[1],src[2]); break;
 470         CASE(3,2) dest[0]=compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
 471         CASE(4,1) dest[0]=compute_y(src[0],src[1],src[2]); break;
 472         CASE(4,2) dest[0]=compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
 473         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
 474         default: assert(0);
 475      }
 476      #undef CASE
 477   }
 478
 479   free(data);
 480   img_out_n = req_comp;
 481   return good;
 482}
 483
 484#ifndef STBI_NO_HDR
 485static float   *ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 486{
 487   int i,k,n;
 488   float *output = (float *) malloc(x * y * comp * sizeof(float));
 489   if (output == NULL) { free(data); return ep("outofmem", "Out of memory"); }
 490   // compute number of non-alpha components
 491   if (comp & 1) n = comp; else n = comp-1;
 492   for (i=0; i < x*y; ++i) {
 493      for (k=0; k < n; ++k) {
 494         output[i*comp + k] = (float) pow(data[i*comp+k]/255.0, l2h_gamma) * l2h_scale;
 495      }
 496      if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f;
 497   }
 498   free(data);
 499   return output;
 500}
 501
 502#define float2int(x)   ((int) (x))
 503static stbi_uc *hdr_to_ldr(float   *data, int x, int y, int comp)
 504{
 505   int i,k,n;
 506   stbi_uc *output = (stbi_uc *) malloc(x * y * comp);
 507   if (output == NULL) { free(data); return ep("outofmem", "Out of memory"); }
 508   // compute number of non-alpha components
 509   if (comp & 1) n = comp; else n = comp-1;
 510   for (i=0; i < x*y; ++i) {
 511      for (k=0; k < n; ++k) {
 512         float z = (float) pow(data[i*comp+k]*h2l_scale_i, h2l_gamma_i) * 255 + 0.5f;
 513         if (z < 0) z = 0;
 514         if (z > 255) z = 255;
 515         output[i*comp + k] = float2int(z);
 516      }
 517      if (k < comp) {
 518         float z = data[i*comp+k] * 255 + 0.5f;
 519         if (z < 0) z = 0;
 520         if (z > 255) z = 255;
 521         output[i*comp + k] = float2int(z);
 522      }
 523   }
 524   free(data);
 525   return output;
 526}
 527#endif
 528
 529//////////////////////////////////////////////////////////////////////////////
 530//
 531//  "baseline" JPEG/JFIF decoder (not actually fully baseline implementation)
 532//
 533//    simple implementation
 534//      - channel subsampling of at most 2 in each dimension
 535//      - doesn't support delayed output of y-dimension
 536//      - simple interface (only one output format: 8-bit interleaved RGB)
 537//      - doesn't try to recover corrupt jpegs
 538//      - doesn't allow partial loading, loading multiple at once
 539//      - still fast on x86 (copying globals into locals doesn't help x86)
 540//      - allocates lots of intermediate memory (full size of all components)
 541//        - non-interleaved case requires this anyway
 542//        - allows good upsampling (see next)
 543//    high-quality
 544//      - upsampled channels are bilinearly interpolated, even across blocks
 545//      - quality integer IDCT derived from IJG's 'slow'
 546//    performance
 547//      - fast huffman; reasonable integer IDCT
 548//      - uses a lot of intermediate memory, could cache poorly
 549//      - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4
 550//          stb_jpeg:   1.34 seconds (MSVC6, default release build)
 551//          stb_jpeg:   1.06 seconds (MSVC6, processor = Pentium Pro)
 552//          IJL11.dll:  1.08 seconds (compiled by intel)
 553//          IJG 1998:   0.98 seconds (MSVC6, makefile provided by IJG)
 554//          IJG 1998:   0.95 seconds (MSVC6, makefile + proc=PPro)
 555
 556int stbi_jpeg_dc_only;
 557
 558// huffman decoding acceleration
 559#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
 560
// One JPEG huffman table in decoder-friendly form; the size/code/maxcode/
// delta/fast members are filled in by build_huffman().
typedef struct
{
   // maps the next FAST_BITS bits of input to a symbol index; 255 means the
   // code is longer than FAST_BITS and the slow path must be used
   uint8  fast[1 << FAST_BITS];
   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   uint16 code[256];       // huffman code assigned to each symbol index
   uint8  values[256];     // decoded value for each symbol index (not set by build_huffman)
   uint8  size[257];       // code length in bits per symbol index, 0-terminated
   unsigned int maxcode[18]; // per length 1..16: largest code + 1, preshifted to 16 bits; [17] is a sentinel
   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
} huffman;
 571
 572static huffman huff_dc[4];  // baseline is 2 tables, extended is 4
 573static huffman huff_ac[4];
 574static uint8 dequant[4][64];
 575
 576static int build_huffman(huffman *h, int *count)
 577{
 578   int i,j,k=0,code;
 579   // build size list for each symbol (from JPEG spec)
 580   for (i=0; i < 16; ++i)
 581      for (j=0; j < count[i]; ++j)
 582         h->size[k++] = (uint8) (i+1);
 583   h->size[k] = 0;
 584
 585   // compute actual symbols (from jpeg spec)
 586   code = 0;
 587   k = 0;
 588   for(j=1; j <= 16; ++j) {
 589      // compute delta to add to code to compute symbol id
 590      h->delta[j] = k - code;
 591      if (h->size[k] == j) {
 592         while (h->size[k] == j)
 593            h->code[k++] = (uint16) (code++);
 594         if (code-1 >= (1 << j)) return e("bad code lengths","Corrupt JPEG");
 595      }
 596      // compute largest code + 1 for this size, preshifted as needed later
 597      h->maxcode[j] = code << (16-j);
 598      code <<= 1;
 599   }
 600   h->maxcode[j] = 0xffffffff;
 601
 602   // build non-spec acceleration table; 255 is flag for not-accelerated
 603   memset(h->fast, 255, 1 << FAST_BITS);
 604   for (i=0; i < k; ++i) {
 605      int s = h->size[i];
 606      if (s <= FAST_BITS) {
 607         int c = h->code[i] << (FAST_BITS-s);
 608         int m = 1 << (FAST_BITS-s);
 609         for (j=0; j < m; ++j) {
 610            h->fast[c+j] = (uint8) i;
 611         }
 612      }
 613   }
 614   return 1;
 615}
 616
 617// sizes for components, interleaved MCUs
 618static int img_h_max, img_v_max;
 619static int img_mcu_x, img_mcu_y;
 620static int img_mcu_w, img_mcu_h;
 621
 622// definition of jpeg image component
// definition of jpeg image component
// (up to four components; field meanings below are inferred from how the
// visible decoder code uses them -- fields marked TODO are set elsewhere)
static struct
{
   int id;          // TODO confirm: presumably the component id from the frame header
   int h,v;         // TODO confirm: presumably horizontal/vertical sampling factors
   int tq;          // index into dequant[] used when IDCTing this component
   int hd,ha;       // indices into huff_dc[] / huff_ac[] for this component
   int dc_pred;     // running DC predictor, updated per block and zeroed by reset()

   int x,y,w2,h2;   // x,y: component size used to count 8x8 blocks; w2: row stride of data (h2: TODO confirm)
   uint8 *data;     // decoded sample plane that idct_block() writes into
} img_comp[4];
 634
 635static unsigned long  code_buffer; // jpeg entropy-coded buffer
 636static int            code_bits;   // number of valid bits
 637static unsigned char  marker;      // marker seen while filling entropy buffer
 638static int            nomore;      // flag if we saw a marker so must stop
 639
 640static void grow_buffer_unsafe(void)
 641{
 642   do {
 643      int b = nomore ? 0 : get8();
 644      if (b == 0xff) {
 645         int c = get8();
 646         if (c != 0) {
 647            marker = (unsigned char) c;
 648            nomore = 1;
 649            return;
 650         }
 651      }
 652      code_buffer = (code_buffer << 8) | b;
 653      code_bits += 8;
 654   } while (code_bits <= 24);
 655}
 656
 657// (1 << n) - 1
 658static unsigned long bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 659
 660// decode a jpeg huffman value from the bitstream
// Decodes one huffman symbol from the entropy bit buffer using table `h`.
// Returns the decoded value (h->values[...]) and consumes the code's bits,
// or returns -1 if no code matches or fewer bits are buffered than the code
// needs.
__forceinline static int decode(huffman *h)
{
   unsigned int temp;
   int c,k;

   if (code_bits < 16) grow_buffer_unsafe();
   // NOTE(review): grow_buffer_unsafe() can return early on a marker without
   // adding bits, so code_bits may still be below FAST_BITS here, making the
   // shift amount below negative -- confirm whether callers rule this out.

   // look at the top FAST_BITS and determine what symbol ID it is,
   // if the code is <= FAST_BITS
   c = (code_buffer >> (code_bits - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h->fast[c];
   if (k < 255) {
      // fast hit, but the code must not extend past the bits we really have
      if (h->size[k] > code_bits)
         return -1;
      code_bits -= h->size[k];
      return h->values[k];
   }

   // naive test is to shift the code_buffer down so k bits are
   // valid, then test against maxcode. To speed this up, we've
   // preshifted maxcode left so that it has (16-k) 0s at the
   // end; in other words, regardless of the number of bits, it
   // wants to be compared against something shifted to have 16;
   // that way we don't need to shift inside the loop.
   if (code_bits < 16)
      temp = (code_buffer << (16 - code_bits)) & 0xffff;
   else
      temp = (code_buffer >> (code_bits - 16)) & 0xffff;
   // the maxcode[17] sentinel (0xffffffff) guarantees this loop terminates
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h->maxcode[k])
         break;
   if (k == 17) {
      // error! code not found
      code_bits -= 16;
      return -1;
   }

   if (k > code_bits)
      return -1;

   // convert the huffman code to the symbol id
   c = ((code_buffer >> (code_bits - k)) & bmask[k]) + h->delta[k];
   assert((((code_buffer) >> (code_bits - h->size[c])) & bmask[h->size[c]]) == h->code[c]);

   // convert the id to a symbol
   code_bits -= k;
   return h->values[c];
}
 709
 710// combined JPEG 'receive' and JPEG 'extend', since baseline
 711// always extends everything it receives.
 712__forceinline static int extend_receive(int n)
 713{
 714   unsigned int m = 1 << (n-1);
 715   unsigned int k;
 716   if (code_bits < n) grow_buffer_unsafe();
 717   k = (code_buffer >> (code_bits - n)) & bmask[n];
 718   code_bits -= n;
 719   // the following test is probably a random branch that won't
 720   // predict well. I tried to table accelerate it but failed.
 721   // maybe it's compiling as a conditional move?
 722   if (k < m)
 723      return (-1 << n) + k + 1;
 724   else
 725      return k;
 726}
 727
 728// given a value that's at position X in the zigzag stream,
 729// where does it appear in the 8x8 matrix coded as row-major?
 730static uint8 dezigzag[64+15] =
 731{
 732    0,  1,  8, 16,  9,  2,  3, 10,
 733   17, 24, 32, 25, 18, 11,  4,  5,
 734   12, 19, 26, 33, 40, 48, 41, 34,
 735   27, 20, 13,  6,  7, 14, 21, 28,
 736   35, 42, 49, 56, 57, 50, 43, 36,
 737   29, 22, 15, 23, 30, 37, 44, 51,
 738   58, 59, 52, 45, 38, 31, 39, 46,
 739   53, 60, 61, 54, 47, 55, 62, 63,
 740   // let corrupt input sample past end
 741   63, 63, 63, 63, 63, 63, 63, 63,
 742   63, 63, 63, 63, 63, 63, 63
 743};
 744
 745// decode one 64-entry block--
 746static int decode_block(short data[64], huffman *hdc, huffman *hac, int b)
 747{
 748   int diff,dc,k;
 749   int t = decode(hdc);
 750   if (t < 0) return e("bad huffman code","Corrupt JPEG");
 751
 752   // 0 all the ac values now so we can do it 32-bits at a time
 753   memset(data,0,64*sizeof(data[0]));
 754
 755   diff = t ? extend_receive(t) : 0;
 756   dc = img_comp[b].dc_pred + diff;
 757   img_comp[b].dc_pred = dc;
 758   data[0] = (short) dc;
 759
 760   // decode AC components, see JPEG spec
 761   k = 1;
 762   do {
 763      int r,s;
 764      int rs = decode(hac);
 765      if (rs < 0) return e("bad huffman code","Corrupt JPEG");
 766      s = rs & 15;
 767      r = rs >> 4;
 768      if (s == 0) {
 769         if (rs != 0xf0) break; // end block
 770         k += 16;
 771      } else {
 772         k += r;
 773         // decode into unzigzag'd location
 774         data[dezigzag[k++]] = (short) extend_receive(s);
 775      }
 776   } while (k < 64);
 777   return 1;
 778}
 779
 780// take a -128..127 value and clamp it and convert to 0..255
 781__forceinline static uint8 clamp(int x)
 782{
 783   x += 128;
 784   // trick to use a single test to catch both cases
 785   if ((unsigned int) x > 255) {
 786      if (x < 0) return 0;
 787      if (x > 255) return 255;
 788   }
 789   return (uint8) x;
 790}
 791
// f2f: converts a floating-point constant to fixed point with 12 fractional
// bits (rounded); fsh: scales an integer expression up by the same 12 bits.
#define f2f(x)  (int) (((x) * 4096 + 0.5))
#define fsh(x)  ((x) << 12)

// derived from jidctint -- DCT_ISLOW
// One 8-point inverse DCT pass over inputs s0..s7. The macro DECLARES the
// locals t0..t3, p1..p5 and x0..x3, so it can be expanded only once per
// scope. On exit the caller combines x0..x3 (even part) with t0..t3 (odd
// part) as x0+-t3, x1+-t2, x2+-t1, x3+-t0 to form the eight outputs, which
// are still scaled up by 1<<12.
#define IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7)       \
   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
   p2 = s2;                                    \
   p3 = s6;                                    \
   p1 = (p2+p3) * f2f(0.5411961f);             \
   t2 = p1 + p3*f2f(-1.847759065f);            \
   t3 = p1 + p2*f2f( 0.765366865f);            \
   p2 = s0;                                    \
   p3 = s4;                                    \
   t0 = fsh(p2+p3);                            \
   t1 = fsh(p2-p3);                            \
   x0 = t0+t3;                                 \
   x3 = t0-t3;                                 \
   x1 = t1+t2;                                 \
   x2 = t1-t2;                                 \
   t0 = s7;                                    \
   t1 = s5;                                    \
   t2 = s3;                                    \
   t3 = s1;                                    \
   p3 = t0+t2;                                 \
   p4 = t1+t3;                                 \
   p1 = t0+t3;                                 \
   p2 = t1+t2;                                 \
   p5 = (p3+p4)*f2f( 1.175875602f);            \
   t0 = t0*f2f( 0.298631336f);                 \
   t1 = t1*f2f( 2.053119869f);                 \
   t2 = t2*f2f( 3.072711026f);                 \
   t3 = t3*f2f( 1.501321110f);                 \
   p1 = p5 + p1*f2f(-0.899976223f);            \
   p2 = p5 + p2*f2f(-2.562915447f);            \
   p3 = p3*f2f(-1.961570560f);                 \
   p4 = p4*f2f(-0.390180644f);                 \
   t3 += p1+p4;                                \
   t2 += p2+p3;                                \
   t1 += p2+p4;                                \
   t0 += p1+p3;
 832
 833// .344 seconds on 3*anemones.jpg
// Dequantizes and inverse-DCTs one 8x8 coefficient block.
//   out         destination for the 8x8 block of output samples
//   out_stride  distance in bytes between successive output rows
//   data        64 coefficients in row-major order
//   dequantize  64 dequantization multipliers, indexed like data
// When the global stbi_jpeg_dc_only is set, only the DC term is used and the
// whole block is filled with a single value.
static void idct_block(uint8 *out, int out_stride, short data[64], uint8 *dequantize)
{
   int i,val[64],*v=val;
   uint8 *o,*dq = dequantize;
   short *d = data;

   if (stbi_jpeg_dc_only) {
      // ok, I don't really know why this is right, but it seems to be:
      int z = 128 + ((d[0] * dq[0]) >> 3);
      for (i=0; i < 8; ++i) {
         out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = z;
         out += out_stride;
      }
      return;
   }

   // columns
   // first pass: one 1D IDCT per column, results stored in val[] with two
   // extra fractional bits
   for (i=0; i < 8; ++i,++d,++dq, ++v) {
      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
           && d[40]==0 && d[48]==0 && d[56]==0) {
         //    no shortcut                 0     seconds
         //    (1|2|3|4|5|6|7)==0          0     seconds
         //    all separate               -0.047 seconds
         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
         int dcterm = d[0] * dq[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      } else {
         IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24],
                 d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56])
         // constants scaled things up by 1<<12; let's bring them back
         // down, but keep 2 extra bits of precision
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   // second pass: one 1D IDCT per row of val[], clamped straight into out
   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
      // no fast case since the first 1D IDCT spread components out
      IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
      // constants scaled things up by 1<<12, plus we had 1<<2 from first
      // loop, plus horizontal and vertical each scale by sqrt(8) so together
      // we've got an extra 1<<3, so 1<<17 total we need to remove.
      x0 += 65536; x1 += 65536; x2 += 65536; x3 += 65536;
      o[0] = clamp((x0+t3) >> 17);
      o[7] = clamp((x0-t3) >> 17);
      o[1] = clamp((x1+t2) >> 17);
      o[6] = clamp((x1-t2) >> 17);
      o[2] = clamp((x2+t1) >> 17);
      o[5] = clamp((x2-t1) >> 17);
      o[3] = clamp((x3+t0) >> 17);
      o[4] = clamp((x3-t0) >> 17);
   }
}
 895
 896#define MARKER_none  0xff
 897// if there's a pending marker from the entropy stream, return that
 898// otherwise, fetch from the stream and get a marker. if there's no
 899// marker, return 0xff, which is never a valid marker value
 900static uint8 get_marker(void)
 901{
 902   uint8 x;
 903   if (marker != MARKER_none) { x = marker; marker = MARKER_none; return x; }
 904   x = get8u();
 905   if (x != 0xff) return MARKER_none;
 906   while (x == 0xff)
 907      x = get8u();
 908   return x;
 909}
 910
 911// in each scan, we'll have scan_n components, and the order
 912// of the components is specified by order[]
 913static int scan_n, order[4];
 914static int restart_interval, todo;
 915#define RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
 916
 917// after a restart interval, reset the entropy decoder and
 918// the dc prediction
 919static void reset(void)
 920{
 921   code_bits = 0;
 922   code_buffer = 0;
 923   nomore = 0;
 924   img_comp[0].dc_pred = img_comp[1].dc_pred = img_comp[2].dc_pred = 0;
 925   marker = MARKER_none;
 926   todo = restart_interval ? restart_interval : 0x7fffffff;
 927   // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
 928   // since we don't even allow 1<<30 pixels
 929}
 930
 931static int parse_entropy_coded_data(void)
 932{
 933   reset();
 934   if (scan_n == 1) {
 935      int i,j;
 936      short data[64];
 937      int n = order[0];
 938      // non-interleaved data, we just need to process one block at a time,
 939      // in trivial scanline order
 940      // number of blocks to do just depends on how many actual "pixels" this
 941      // component has, independent of interleaved MCU blocking and such
 942      int w = (img_comp[n].x+7) >> 3;
 943      int h = (img_comp[n].y+7) >> 3;
 944      for (j=0; j < h; ++j) {
 945         for (i=0; i < w; ++i) {
 946            if (!decode_block(data, huff_dc+img_comp[n].hd, huff_ac+img_comp[n].ha, n)) return 0;
 947            idct_block(img_comp[n].data+img_comp[n].w2*j*8+i*8, img_comp[n].w2, data, dequant[img_comp[n].tq]);
 948            // every data block is an MCU, so countdown the restart interval
 949            if (--todo <= 0) {
 950               if (code_bits < 24) grow_buffer_unsafe();
 951               // if it's NOT a restart, then just bail, so we get corrupt data
 952               // rather than no data
 953               if (!RESTART(marker)) return 1;
 954               reset();
 955            }
 956         }
 957      }
 958   } else { // interleaved!
 959      int i,j,k,x,y;
 960      short data[64];
 961      for (j=0; j < img_mcu_y; ++j) {
 962         for (i=0; i < img_mcu_x; ++i) {
 963            // scan an interleaved mcu... process scan_n components in order
 964            for (k=0; k < scan_n; ++k) {
 965               int n = order[k];
 966               // scan out an mcu's worth of this component; that's just determined
 967               // by the basic H and V specified for the component
 968               for (y=0; y < img_comp[n].v; ++y) {
 969                  for (x=0; x < img_comp[n].h; ++x) {
 970                     int x2 = (i*img_comp[n].h + x)*8;
 971                     int y2 = (j*img_comp[n].v + y)*8;
 972                     if (!decode_block(data, huff_dc+img_comp[n].hd, huff_ac+img_comp[n].ha, n)) return 0;
 973                     idct_block(img_comp[n].data+img_comp[n].w2*y2+x2, img_comp[n].w2, data, dequant[img_comp[n].tq]);
 974                  }
 975               }
 976            }
 977            // after all interleaved components, that's an interleaved MCU,
 978            // so now count down the restart interval
 979            if (--todo <= 0) {
 980               if (code_bits < 24) grow_buffer_unsafe();
 981               // if it's NOT a restart, then just bail, so we get corrupt data
 982               // rather than no data
 983               if (!RESTART(marker)) return 1;
 984               reset();
 985            }
 986         }
 987      }
 988   }
 989   return 1;
 990}
 991
 992static int process_marker(int m)
 993{
 994   int L;
 995   switch (m) {
 996      case MARKER_none: // no marker found
 997         return e("expected marker","Corrupt JPEG");
 998
 999      case 0xC2: // SOF - progressive
1000         return e("progressive jpeg","JPEG format not supported (progressive)");
1001
1002      case 0xDD: // DRI - specify restart interval
1003         if (get16() != 4) return e("bad DRI len","Corrupt JPEG");
1004         restart_interval = get16();
1005         return 1;
1006
1007      case 0xDB: // DQT - define quantization table
1008         L = get16()-2;
1009         while (L > 0) {
1010            int z = get8();
1011            int p = z >> 4;
1012            int t = z & 15,i;
1013            if (p != 0) return e("bad DQT type","Corrupt JPEG");
1014            if (t > 3) return e("bad DQT table","Corrupt JPEG");
1015            for (i=0; i < 64; ++i)
1016               dequant[t][dezigzag[i]] = get8u();
1017            L -= 65;
1018         }
1019         return L==0;
1020
1021      case 0xC4: // DHT - define huffman table
1022         L = get16()-2;
1023         while (L > 0) {
1024            uint8 *v;
1025            int sizes[16],i,m=0;
1026            int z = get8();
1027            int tc = z >> 4;
1028            int th = z & 15;
1029            if (tc > 1 || th > 3) return e("bad DHT header","Corrupt JPEG");
1030            for (i=0; i < 16; ++i) {
1031               sizes[i] = get8();
1032               m += sizes[i];
1033            }
1034            L -= 17;
1035            if (tc == 0) {
1036               if (!build_huffman(huff_dc+th, sizes)) return 0;
1037               v = huff_dc[th].values;
1038            } else {
1039               if (!build_huffman(huff_ac+th, sizes)) return 0;
1040               v = huff_ac[th].values;
1041            }
1042            for (i=0; i < m; ++i)
1043               v[i] = get8u();
1044            L -= m;
1045         }
1046         return L==0;
1047   }
1048   // check for comment block or APP blocks
1049   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
1050      skip(get16()-2);
1051      return 1;
1052   }
1053   return 0;
1054}
1055
1056// after we see SOS
1057static int process_scan_header(void)
1058{
1059   int i;
1060   int Ls = get16();
1061   scan_n = get8();
1062   if (scan_n < 1 || scan_n > 4 || scan_n > (int) img_n) return e("bad SOS component count","Corrupt JPEG");
1063   if (Ls != 6+2*scan_n) return e("bad SOS len","Corrupt JPEG");
1064   for (i=0; i < scan_n; ++i) {
1065      int id = get8(), which;
1066      int z = get8();
1067      for (which = 0; which < img_n; ++which)
1068         if (img_comp[which].id == id)
1069            break;
1070      if (which == img_n) return 0;
1071      img_comp[which].hd = z >> 4;   if (img_comp[which].hd > 3) return e("bad DC huff","Corrupt JPEG");
1072      img_comp[which].ha = z & 15;   if (img_comp[which].ha > 3) return e("bad AC huff","Corrupt JPEG");
1073      order[i] = which;
1074   }
1075   if (get8() != 0) return e("bad SOS","Corrupt JPEG");
1076   get8(); // should be 63, but might be 0
1077   if (get8() != 0) return e("bad SOS","Corrupt JPEG");
1078
1079   return 1;
1080}
1081
1082static int process_frame_header(int scan)
1083{
1084   int Lf,p,i,z, h_max=1,v_max=1;
1085   Lf = get16();         if (Lf < 11) return e("bad SOF len","Corrupt JPEG"); // JPEG
1086   p  = get8();          if (p != 8) return e("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
1087   img_y = get16();      if (img_y == 0) return e("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
1088   img_x = get16();      if (img_x == 0) return e("0 width","Corrupt JPEG"); // JPEG requires
1089   img_n = get8();
1090   if (img_n != 3 && img_n != 1) return e("bad component count","Corrupt JPEG");    // JFIF requires
1091
1092   if (Lf != 8+3*img_n) return e("bad SOF len","Corrupt JPEG");
1093
1094   for (i=0; i < img_n; ++i) {
1095      img_comp[i].id = get8();
1096      if (img_comp[i].id != i+1)   // JFIF requires
1097         if (img_comp[i].id != i)  // jpegtran outputs non-JFIF-compliant files!
1098            return e("bad component ID","Corrupt JPEG");
1099      z = get8();
1100      img_comp[i].h = (z >> 4);  if (!img_comp[i].h || img_comp[i].h > 4) return e("bad H","Corrupt JPEG");
1101      img_comp[i].v = z & 15;    if (!img_comp[i].v || img_comp[i].v > 4) return e("bad V","Corrupt JPEG");
1102      img_comp[i].tq = get8();   if (img_comp[i].tq > 3) return e("bad TQ","Corrupt JPEG");
1103   }
1104
1105   if (scan != SCAN_load) return 1;
1106
1107   if ((1 << 30) / img_x / img_n < img_y) return e("too large", "Image too large to decode");
1108
1109   for (i=0; i < img_n; ++i) {
1110      if (img_comp[i].h > h_max) h_max = img_comp[i].h;
1111      if (img_comp[i].v > v_max) v_max = img_comp[i].v;
1112   }
1113
1114   // compute interleaved mcu info
1115   img_h_max = h_max;
1116   img_v_max = v_max;
1117   img_mcu_w = h_max * 8;
1118   img_mcu_h = v_max * 8;
1119   img_mcu_x = (img_x + img_mcu_w-1) / img_mcu_w;
1120   img_mcu_y = (img_y + img_mcu_h-1) / img_mcu_h;
1121
1122   for (i=0; i < img_n; ++i) {
1123      // number of effective pixels (e.g. for non-interleaved MCU)
1124      img_comp[i].x = (img_x * img_comp[i].h + h_max-1) / h_max;
1125      img_comp[i].y = (img_y * img_comp[i].v + v_max-1) / v_max;
1126      // to simplify generation, we'll allocate enough memory to decode
1127      // the bogus oversized data from using interleaved MCUs and their
1128      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1129      // discard the extra data until colorspace conversion
1130      img_comp[i].w2 = img_mcu_x * img_comp[i].h * 8;
1131      img_comp[i].h2 = img_mcu_y * img_comp[i].v * 8;
1132      img_comp[i].data = (uint8 *) malloc(img_comp[i].w2 * img_comp[i].h2);
1133      if (img_comp[i].data == NULL) {
1134         for(--i; i >= 0; --i)
1135            free(img_comp[i].data);
1136         return e("outofmem", "Out of memory");
1137      }
1138   }
1139
1140   return 1;
1141}
1142
1143// use comparisons since in some cases we handle more than one case (e.g. SOF)
1144#define DNL(x)         ((x) == 0xdc)
1145#define SOI(x)         ((x) == 0xd8)
1146#define EOI(x)         ((x) == 0xd9)
1147#define SOF(x)         ((x) == 0xc0 || (x) == 0xc1)
1148#define SOS(x)         ((x) == 0xda)
1149
1150static int decode_jpeg_header(int scan)
1151{
1152   int m;
1153   marker = MARKER_none; // initialize cached marker to empty
1154   m = get_marker();
1155   if (!SOI(m)) return e("no SOI","Corrupt JPEG");
1156   if (scan == SCAN_type) return 1;
1157   m = get_marker();
1158   while (!SOF(m)) {
1159      if (!process_marker(m)) return 0;
1160      m = get_marker();
1161      while (m == MARKER_none) {
1162         // some files have extra padding after their blocks, so ok, we'll scan
1163         if (at_eof()) return e("no SOF", "Corrupt JPEG");
1164         m = get_marker();
1165      }
1166   }
1167   if (!process_frame_header(scan)) return 0;
1168   return 1;
1169}
1170
1171static int decode_jpeg_image(void)
1172{
1173   int m;
1174   restart_interval = 0;
1175   if (!decode_jpeg_header(SCAN_load)) return 0;
1176   m = get_marker();
1177   while (!EOI(m)) {
1178      if (SOS(m)) {
1179         if (!process_scan_header()) return 0;
1180         if (!parse_entropy_coded_data()) return 0;
1181      } else {
1182         if (!process_marker(m)) return 0;
1183      }
1184      m = get_marker();
1185   }
1186   return 1;
1187}
1188
1189// static jfif-centered resampling with cross-block smoothing
1190// here by cross-block smoothing what I mean is that the resampling
1191// is bilerp and crosses blocks; I dunno what IJG means
1192
1193#define div4(x) ((uint8) ((x) >> 2))
1194
1195static void resample_v_2(uint8 *out1, uint8 *input, int w, int h, int s)
1196{
1197   // need to generate two samples vertically for every one in input
1198   uint8 *above;
1199   uint8 *below;
1200   uint8 *source;
1201   uint8 *out2;
1202   int i,j;
1203   source = input;
1204   out2 = out1+w;
1205   for (j=0; j < h; ++j) {
1206      above = source;
1207      source = input + j*s;
1208      below = source + s; if (j == h-1) below = source;
1209      for (i=0; i < w; ++i) {
1210         int n = source[i]*3;
1211         out1[i] = div4(above[i] + n);
1212         out2[i] = div4(below[i] + n);
1213      }
1214      out1 += w*2;
1215      out2 += w*2;
1216   }
1217}
1218
// Double the horizontal resolution: every input sample yields two outputs.
//   out   - output, h rows of 2*w bytes, tightly packed
//   input - h rows of w samples, rows s bytes apart
// Input sample i produces out[2i] and out[2i+1]: the left one blends 3/4 of
// sample i with 1/4 of its left neighbour, the right one with 1/4 of its
// right neighbour. The two outermost outputs of a row copy the edge samples.
static void resample_h_2(uint8 *out, uint8 *input, int w, int h, int s)
{
   int i,j;
   if (w == 1) {
      // a single column has no neighbours to blend with: just duplicate it
      for (j=0; j < h; ++j)
         out[j*2+0] = out[j*2+1] = input[j*s];
      return;
   }
   for (j=0; j < h; ++j) {
      // first sample: nothing to the left
      out[0] = input[0];
      out[1] = div4(input[0]*3 + input[1]);
      // interior samples; the outputs for sample i live at 2i and 2i+1
      // (writing them at 2i-2/2i-1 would overwrite the first pair, shift the
      // row left by one sample and leave out[2w-4..2w-3] unset)
      for (i=1; i < w-1; ++i) {
         int n = input[i]*3;
         out[i*2+0] = div4(input[i-1] + n);
         out[i*2+1] = div4(input[i+1] + n);
      }
      // last sample: nothing to the right; its left output is weighted
      // toward the last sample itself, mirroring out[1] above
      out[w*2-2] = div4(input[w-1]*3 + input[w-2]);
      out[w*2-1] = input[w-1];
      out += w*2;
      input += s;
   }
}
1242
#define div16(x)  ((uint8) ((x) >> 4))

// .172 seconds on 3*anemones.jpg
// Double the resolution in both directions: every input sample yields a 2x2
// group of outputs.
//   out   - output, 2*h rows of 2*w bytes, tightly packed
//   input - h rows of w samples, rows s bytes apart
// The outermost ring of output samples copies the nearest input sample;
// every other output is a bilinear blend of the four surrounding inputs with
// weights 9:3:3:1 (nearest sample weighted 9).
static void resample_hv_2(uint8 *out, uint8 *input, int w, int h, int s)
{
   int i,j;
   int os = w*2;   // output stride
   uint8 *last_in  = input + (h-1)*s;       // last input row (stride s, not w)
   uint8 *last_out = out + (2*h-1)*os;      // last output row

   // generate edge samples... @TODO lerp them!
   // top and bottom output rows
   for (i=0; i < w; ++i) {
      out[i*2+0] = out[i*2+1] = input[i];
      last_out[i*2+0] = last_out[i*2+1] = last_in[i];
   }
   // left and right output columns
   for (j=0; j < h; ++j) {
      uint8 *in_row  = input + j*s;
      uint8 *out_row = out + j*os*2;
      out_row[0]    = out_row[os]       = in_row[0];
      out_row[os-1] = out_row[os+os-1]  = in_row[w-1];
   }

   // now generate interior samples; in1/in2 point at the top-left of each
   // 2x2 input neighbourhood, out1/out2 at the 2x2 outputs that lie inside it
   for (j=0; j < h-1; ++j) {
      uint8 *in1 = input+j*s;
      uint8 *in2 = in1 + s;
      uint8 *out1 = out + (j*2+1)*os + 1;
      uint8 *out2 = out1 + os;
      for (i=0; i < w-1; ++i) {
         int p00 = in1[0], p01=in1[1], p10=in2[0], p11=in2[1];
         int p00_3 = p00*3, p01_3 = p01*3, p10_3 = p10*3, p11_3 = p11*3;

         out1[0] = div16(p00*9 + p01_3 + p10_3 + p11);
         // nearest is p01; its neighbours are p00 and p11, far corner is p10
         out1[1] = div16(p01*9 + p00_3 + p11_3 + p10);
         out2[0] = div16(p10*9 + p11_3 + p00_3 + p01);
         out2[1] = div16(p11*9 + p10_3 + p01_3 + p00);
         out1 += 2;
         out2 += 2;
         ++in1;
         ++in2;
      }
   }
}
1281
1282#define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
1283
1284// 0.38 seconds on 3*anemones.jpg   (0.25 with processor = Pro)
1285// VC6 without processor=Pro is generating multiple LEAs per multiply!
1286static void YCbCr_to_RGB_row(uint8 *out, uint8 *y, uint8 *pcb, uint8 *pcr, int count, int step)
1287{
1288   int i;
1289   for (i=0; i < count; ++i) {
1290      int y_fixed = (y[i] << 16) + 32768; // rounding
1291      int r,g,b;
1292      int cr = pcr[i] - 128;
1293      int cb = pcb[i] - 128;
1294      r = y_fixed + cr*float2fixed(1.40200f);
1295      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
1296      b = y_fixed                            + cb*float2fixed(1.77200f);
1297      r >>= 16;
1298      g >>= 16;
1299      b >>= 16;
1300      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
1301      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
1302      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
1303      out[0] = (uint8)r;
1304      out[1] = (uint8)g;
1305      out[2] = (uint8)b;
1306      if (step == 4) out[3] = 255;
1307      out += step;
1308   }
1309}
1310
1311// clean up the temporary component buffers
1312static void cleanup_jpeg(void)
1313{
1314   int i;
1315   for (i=0; i < img_n; ++i) {
1316      if (img_comp[i].data) {
1317         free(img_comp[i].data);
1318         img_comp[i].data = NULL;
1319      }
1320   }
1321}
1322
1323static uint8 *load_jpeg_image(int *out_x, int *out_y, int *comp, int req_comp)
1324{
1325   int i, n;
1326   // validate req_comp
1327   if (req_comp < 0 || req_comp > 4) return ep("bad req_comp", "Internal error");
1328
1329   // load a jpeg image from whichever source
1330   if (!decode_jpeg_image()) { cleanup_jpeg(); return NULL; }
1331
1332   // determine actual number of components to generate
1333   n = req_comp ? req_comp : img_n;
1334
1335   // resample components to full size... memory wasteful, but this
1336   // lets us bilerp across blocks while upsampling
1337   for (i=0; i < img_n; ++i) {
1338      // if we're outputting fewer than 3 components, we're grey not RGB;
1339      // in that case, don't bother upsampling Cb or Cr
1340      if (n < 3 && i) continue;
1341
1342      // check if the component scale is less than max; if so it needs upsampling
1343      if (img_comp[i].h != img_h_max || img_comp[i].v != img_v_max) {
1344         int stride = img_x;
1345         // allocate final size; make sure it's big enough for upsampling off
1346         // the edges with upsample up to 4x4 (although we only support 2x2
1347         // currently)
1348         uint8 *new_data = (uint8 *) malloc((img_x+3)*(img_y+3));
1349         if (new_data == NULL) {
1350            cleanup_jpeg();
1351            return ep("outofmem", "Out of memory (image too large?)");
1352         }
1353         if (img_comp[i].h*2 == img_h_max && img_comp[i].v*2 == img_v_max) {
1354            int tx = (img_x+1)>>1;
1355            resample_hv_2(new_data, img_comp[i].data, tx,(img_y+1)>>1, img_comp[i].w2);
1356            stride = tx*2;
1357         } else if (img_comp[i].h == img_h_max && img_comp[i].v*2 == img_v_max) {
1358            resample_v_2(new_data, img_comp[i].data, img_x,(img_y+1)>>1, img_comp[i].w2);
1359         } else if (img_comp[i].h*2 == img_h_max && img_comp[i].v == img_v_max) {
1360            int tx = (img_x+1)>>1;
1361            resample_h_2(new_data, img_comp[i].data, tx,img_y, img_comp[i].w2);
1362            stride = tx*2;
1363         } else {
1364            // @TODO resample uncommon sampling pattern with nearest neighbor
1365            free(new_data);
1366            cleanup_jpeg();
1367            return ep("uncommon H or V", "JPEG not supported: atypical downsampling mode");
1368         }
1369         img_comp[i].w2 = stride;
1370         free(img_comp[i].data);
1371         img_comp[i].data = new_data;
1372      }
1373   }
1374
1375   // now convert components to output image
1376   {
1377      uint32 i,j;
1378      uint8 *output = (uint8 *) malloc(n * img_x * img_y + 1);
1379      if (n >= 3) { // output STBI_rgb_*
1380         for (j=0; j < img_y; ++j) {
1381            uint8 *y  = img_comp[0].data + j*img_comp[0].w2;
1382            uint8 *out = output + n * img_x * j;
1383            if (img_n == 3) {
1384               uint8 *cb = img_comp[1].data + j*img_comp[1].w2;
1385               uint8 *cr = img_comp[2].data + j*img_comp[2].w2;
1386               YCbCr_to_RGB_row(out, y, cb, cr, img_x, n);
1387            } else {
1388               for (i=0; i < img_x; ++i) {
1389                  out[0] = out[1] = out[2] = y[i];
1390                  out[3] = 255; // not used if n == 3
1391                  out += n;
1392               }
1393            }
1394         }
1395      } else {      // output STBI_grey_*
1396         for (j=0; j < img_y; ++j) {
1397            uint8 *y  = img_comp[0].data + j*img_comp[0].w2;
1398            uint8 *out = output + n * img_x * j;
1399            if (n == 1)
1400               for (i=0; i < img_x; ++i) *out++ = *y++;
1401            else
1402               for (i=0; i < img_x; ++i) *out++ = *y++, *out++ = 255;
1403         }
1404      }
1405      cleanup_jpeg();
1406      *out_x = img_x;
1407      *out_y = img_y;
1408      if (comp) *comp  = n; // Changed JLD: report output components
1409      //if (comp) *comp  = img_n; // report original components, not output
1410      return output;
1411   }
1412}
1413
#ifndef STBI_NO_STDIO
// Decode a JPEG from an already-open stdio stream. The stream is handed to
// start_file() and then decoded by load_jpeg_image(); x, y, comp and req_comp
// are passed through unchanged. The stream is not closed here.
unsigned char *stbi_jpeg_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
{
   unsigned char *pixels;
   start_file(f);
   pixels = load_jpeg_image(x,y,comp,req_comp);
   return pixels;
}

// Decode a JPEG from the named file. Returns NULL if the file cannot be
// opened, otherwise whatever stbi_jpeg_load_from_file() returns; the file is
// closed before returning.
unsigned char *stbi_jpeg_load(char *filename, int *x, int *y, int *comp, int req_comp)
{
   unsigned char *pixels = NULL;
   FILE *fp = fopen(filename, "rb");
   if (fp != NULL) {
      pixels = stbi_jpeg_load_from_file(fp,x,y,comp,req_comp);
      fclose(fp);
   }
   return pixels;
}
#endif
1431
1432unsigned char *stbi_jpeg_load_from_memory(stbi_uc *buffer, int len, int *x, int *y, int *comp, int req_comp)
1433{
1434   start_mem(buffer,len);
1435   return load_jpeg_image(x,y,comp,req_comp);
1436}
1437
1438#ifndef STBI_NO_STDIO
1439int stbi_jpeg_test_file(FILE *f)
1440{
1441   int n,r;
1442   n = ftell(f);
1443   start_file(f);
1444   r = decode_jpeg_header(SCAN_type);
1445   fseek(f,n,SEEK_SET);
1446   return r;
1447}
1448#endif
1449
1450int stbi_jpeg_test_memory(unsigned char *buffer, int len)
1451{
1452   start_mem(buffer,len);
1453   return decode_jpeg_header(SCAN_type);
1454}
1455
1456// @TODO:
1457#ifndef STBI_NO_STDIO
1458extern int      stbi_jpeg_info            (char *filename,           int *x, int *y, int *comp);
1459extern int      stbi_jpeg_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
1460#endif
1461extern int      stbi_jpeg_info_from_memory(stbi_uc *buffer, int len, int *x, int *y, int *comp);
1462
1463// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
1464//    simple implementation
1465//      - all input must be provided in an upfront buffer
1466//      - all output is written to a single output buffer (can malloc/realloc)
1467//    performance
1468//      - fast huffman
1469
1470// fast-way is faster to check than jpeg huffman, but slow way is slower
1471#define ZFAST_BITS  9 // accelerate all cases in default tables
1472#define ZFAST_MASK  ((1 << ZFAST_BITS) - 1)
1473
1474// zlib-style huffman encoding
1475// (jpegs packs from left, zlib from right, so can't share code)
1476typedef struct
1477{
1478   uint16 fast[1 << ZFAST_BITS];
1479   uint16 firstcode[16];
1480   int maxcode[17];
1481   uint16 firstsymbol[16];
1482   uint8  size[288];
1483   uint16 value[288];
1484} zhuffman;
1485
1486__forceinline static int bitreverse16(int n)
1487{
1488  n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
1489  n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
1490  n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
1491  n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
1492  return n;
1493}
1494
1495__forceinline static int bit_reverse(int v, int bits)
1496{
1497   assert(bits <= 16);
1498   // to bit reverse n bits, reverse 16 and shift
1499   // e.g. 11 bits, bit reverse and shift away 5
1500   return bitreverse16(v) >> (16-bits);
1501}
1502
1503static int zbuild_huffman(zhuffman *z, uint8 *sizelist, int num)
1504{
1505   int i,k=0;
1506   int code, next_code[16], sizes[17];
1507
1508   // DEFLATE spec for generating codes
1509   memset(sizes, 0, sizeof(sizes));
1510   memset(z->fast, 255, sizeof(z->fast));
1511   for (i=0; i < num; ++i)
1512      ++sizes[sizelist[i]];
1513   sizes[0] = 0;
1514   for (i=1; i < 16; ++i)
1515      assert(sizes[i] <= (1 << i));
1516   code = 0;
1517   for (i=1; i < 16; ++i) {
1518      next_code[i] = code;
1519      z->firstcode[i] = (uint16) code;
1520      z->firstsymbol[i] = (uint16) k;
1521      code = (code + sizes[i]);
1522      if (sizes[i])
1523         if (code-1 >= (1 << i)) return e("bad codelengths","Corrupt JPEG");
1524      z->maxcode[i] = code << (16-i); // preshift for inner loop
1525      code <<= 1;
1526      k += sizes[i];
1527   }
1528   z->maxcode[16] = 0x10000; // sentinel
1529   for (i=0; i < num; ++i) {
1530      int s = sizelist[i];
1531      if (s) {
1532         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
1533         z->size[c] = (uint8)s;
1534         z->value[c] = (uint16)i;
1535         if (s <= ZFAST_BITS) {
1536            int k = bit_reverse(next_code[s],s);
1537            while (k < (1 << ZFAST_BITS)) {
1538               z->fast[k] = (uint16) c;
1539               k += (1 << s);
1540            }
1541         }
1542         ++next_code[s];
1543      }
1544   }
1545   return 1;
1546}
1547
1548// zlib-from-memory implementation for PNG reading
1549//    because PNG allows splitting the zlib stream arbitrarily,
1550//    and it's annoying structurally to have PNG call ZLIB call PNG,
1551//    we require PNG read all the IDATs and combine them into a single
1552//    memory buffer
1553
1554static uint8 *zbuffer, *zbuffer_end;

Large files are truncated, but you can click here to view the full file