PageRenderTime 135ms CodeModel.GetById 26ms app.highlight 100ms RepoModel.GetById 1ms app.codeStats 1ms

/contrib/bzip2/blocksort.c

https://bitbucket.org/freebsd/freebsd-head/
C | 1094 lines | 729 code | 142 blank | 223 comment | 226 complexity | 03d6f03b71eceefb5a0338e91ad75d5a MD5 | raw file
   1
   2/*-------------------------------------------------------------*/
   3/*--- Block sorting machinery                               ---*/
   4/*---                                           blocksort.c ---*/
   5/*-------------------------------------------------------------*/
   6
   7/* ------------------------------------------------------------------
   8   This file is part of bzip2/libbzip2, a program and library for
   9   lossless, block-sorting data compression.
  10
  11   bzip2/libbzip2 version 1.0.6 of 6 September 2010
  12   Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
  13
  14   Please read the WARNING, DISCLAIMER and PATENTS sections in the 
  15   README file.
  16
  17   This program is released under the terms of the license contained
  18   in the file LICENSE.
  19   ------------------------------------------------------------------ */
  20
  21
  22#include "bzlib_private.h"
  23
  24/*---------------------------------------------*/
  25/*--- Fallback O(N log(N)^2) sorting        ---*/
  26/*--- algorithm, for repetitive blocks      ---*/
  27/*---------------------------------------------*/
  28
  29/*---------------------------------------------*/
  30static 
  31__inline__
  32void fallbackSimpleSort ( UInt32* fmap, 
  33                          UInt32* eclass, 
  34                          Int32   lo, 
  35                          Int32   hi )
  36{
  37   Int32 i, j, tmp;
  38   UInt32 ec_tmp;
  39
  40   if (lo == hi) return;
  41
  42   if (hi - lo > 3) {
  43      for ( i = hi-4; i >= lo; i-- ) {
  44         tmp = fmap[i];
  45         ec_tmp = eclass[tmp];
  46         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  47            fmap[j-4] = fmap[j];
  48         fmap[j-4] = tmp;
  49      }
  50   }
  51
  52   for ( i = hi-1; i >= lo; i-- ) {
  53      tmp = fmap[i];
  54      ec_tmp = eclass[tmp];
  55      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
  56         fmap[j-1] = fmap[j];
  57      fmap[j-1] = tmp;
  58   }
  59}
  60
  61
  62/*---------------------------------------------*/
  63#define fswap(zz1, zz2) \
  64   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
  65
  66#define fvswap(zzp1, zzp2, zzn)       \
  67{                                     \
  68   Int32 yyp1 = (zzp1);               \
  69   Int32 yyp2 = (zzp2);               \
  70   Int32 yyn  = (zzn);                \
  71   while (yyn > 0) {                  \
  72      fswap(fmap[yyp1], fmap[yyp2]);  \
  73      yyp1++; yyp2++; yyn--;          \
  74   }                                  \
  75}
  76
  77
  78#define fmin(a,b) ((a) < (b)) ? (a) : (b)
  79
  80#define fpush(lz,hz) { stackLo[sp] = lz; \
  81                       stackHi[sp] = hz; \
  82                       sp++; }
  83
  84#define fpop(lz,hz) { sp--;              \
  85                      lz = stackLo[sp];  \
  86                      hz = stackHi[sp]; }
  87
  88#define FALLBACK_QSORT_SMALL_THRESH 10
  89#define FALLBACK_QSORT_STACK_SIZE   100
  90
  91
  92static
  93void fallbackQSort3 ( UInt32* fmap, 
  94                      UInt32* eclass,
  95                      Int32   loSt, 
  96                      Int32   hiSt )
  97{
  98   Int32 unLo, unHi, ltLo, gtHi, n, m;
  99   Int32 sp, lo, hi;
 100   UInt32 med, r, r3;
 101   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 102   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 103
 104   r = 0;
 105
 106   sp = 0;
 107   fpush ( loSt, hiSt );
 108
 109   while (sp > 0) {
 110
 111      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
 112
 113      fpop ( lo, hi );
 114      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 115         fallbackSimpleSort ( fmap, eclass, lo, hi );
 116         continue;
 117      }
 118
 119      /* Random partitioning.  Median of 3 sometimes fails to
 120         avoid bad cases.  Median of 9 seems to help but 
 121         looks rather expensive.  This too seems to work but
 122         is cheaper.  Guidance for the magic constants 
 123         7621 and 32768 is taken from Sedgewick's algorithms
 124         book, chapter 35.
 125      */
 126      r = ((r * 7621) + 1) % 32768;
 127      r3 = r % 3;
 128      if (r3 == 0) med = eclass[fmap[lo]]; else
 129      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 130                   med = eclass[fmap[hi]];
 131
 132      unLo = ltLo = lo;
 133      unHi = gtHi = hi;
 134
 135      while (1) {
 136         while (1) {
 137            if (unLo > unHi) break;
 138            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 139            if (n == 0) { 
 140               fswap(fmap[unLo], fmap[ltLo]); 
 141               ltLo++; unLo++; 
 142               continue; 
 143            };
 144            if (n > 0) break;
 145            unLo++;
 146         }
 147         while (1) {
 148            if (unLo > unHi) break;
 149            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 150            if (n == 0) { 
 151               fswap(fmap[unHi], fmap[gtHi]); 
 152               gtHi--; unHi--; 
 153               continue; 
 154            };
 155            if (n < 0) break;
 156            unHi--;
 157         }
 158         if (unLo > unHi) break;
 159         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 160      }
 161
 162      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 163
 164      if (gtHi < ltLo) continue;
 165
 166      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 167      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 168
 169      n = lo + unLo - ltLo - 1;
 170      m = hi - (gtHi - unHi) + 1;
 171
 172      if (n - lo > hi - m) {
 173         fpush ( lo, n );
 174         fpush ( m, hi );
 175      } else {
 176         fpush ( m, hi );
 177         fpush ( lo, n );
 178      }
 179   }
 180}
 181
 182#undef fmin
 183#undef fpush
 184#undef fpop
 185#undef fswap
 186#undef fvswap
 187#undef FALLBACK_QSORT_SMALL_THRESH
 188#undef FALLBACK_QSORT_STACK_SIZE
 189
 190
 191/*---------------------------------------------*/
 192/* Pre:
 193      nblock > 0
 194      eclass exists for [0 .. nblock-1]
 195      ((UChar*)eclass) [0 .. nblock-1] holds block
 196      ptr exists for [0 .. nblock-1]
 197
 198   Post:
 199      ((UChar*)eclass) [0 .. nblock-1] holds block
 200      All other areas of eclass destroyed
 201      fmap [0 .. nblock-1] holds sorted order
 202      bhtab [ 0 .. 2+(nblock/32) ] destroyed
 203*/
 204
 205#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 206#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 207#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 208#define      WORD_BH(zz)  bhtab[(zz) >> 5]
 209#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 210
 211static
 212void fallbackSort ( UInt32* fmap, 
 213                    UInt32* eclass, 
 214                    UInt32* bhtab,
 215                    Int32   nblock,
 216                    Int32   verb )
 217{
 218   Int32 ftab[257];
 219   Int32 ftabCopy[256];
 220   Int32 H, i, j, k, l, r, cc, cc1;
 221   Int32 nNotDone;
 222   Int32 nBhtab;
 223   UChar* eclass8 = (UChar*)eclass;
 224
 225   /*--
 226      Initial 1-char radix sort to generate
 227      initial fmap and initial BH bits.
 228   --*/
 229   if (verb >= 4)
 230      VPrintf0 ( "        bucket sorting ...\n" );
 231   for (i = 0; i < 257;    i++) ftab[i] = 0;
 232   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 233   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 234   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 235
 236   for (i = 0; i < nblock; i++) {
 237      j = eclass8[i];
 238      k = ftab[j] - 1;
 239      ftab[j] = k;
 240      fmap[k] = i;
 241   }
 242
 243   nBhtab = 2 + (nblock / 32);
 244   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 245   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 246
 247   /*--
 248      Inductively refine the buckets.  Kind-of an
 249      "exponential radix sort" (!), inspired by the
 250      Manber-Myers suffix array construction algorithm.
 251   --*/
 252
 253   /*-- set sentinel bits for block-end detection --*/
 254   for (i = 0; i < 32; i++) { 
 255      SET_BH(nblock + 2*i);
 256      CLEAR_BH(nblock + 2*i + 1);
 257   }
 258
 259   /*-- the log(N) loop --*/
 260   H = 1;
 261   while (1) {
 262
 263      if (verb >= 4) 
 264         VPrintf1 ( "        depth %6d has ", H );
 265
 266      j = 0;
 267      for (i = 0; i < nblock; i++) {
 268         if (ISSET_BH(i)) j = i;
 269         k = fmap[i] - H; if (k < 0) k += nblock;
 270         eclass[k] = j;
 271      }
 272
 273      nNotDone = 0;
 274      r = -1;
 275      while (1) {
 276
 277	 /*-- find the next non-singleton bucket --*/
 278         k = r + 1;
 279         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 280         if (ISSET_BH(k)) {
 281            while (WORD_BH(k) == 0xffffffff) k += 32;
 282            while (ISSET_BH(k)) k++;
 283         }
 284         l = k - 1;
 285         if (l >= nblock) break;
 286         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 287         if (!ISSET_BH(k)) {
 288            while (WORD_BH(k) == 0x00000000) k += 32;
 289            while (!ISSET_BH(k)) k++;
 290         }
 291         r = k - 1;
 292         if (r >= nblock) break;
 293
 294         /*-- now [l, r] bracket current bucket --*/
 295         if (r > l) {
 296            nNotDone += (r - l + 1);
 297            fallbackQSort3 ( fmap, eclass, l, r );
 298
 299            /*-- scan bucket and generate header bits-- */
 300            cc = -1;
 301            for (i = l; i <= r; i++) {
 302               cc1 = eclass[fmap[i]];
 303               if (cc != cc1) { SET_BH(i); cc = cc1; };
 304            }
 305         }
 306      }
 307
 308      if (verb >= 4) 
 309         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 310
 311      H *= 2;
 312      if (H > nblock || nNotDone == 0) break;
 313   }
 314
 315   /*-- 
 316      Reconstruct the original block in
 317      eclass8 [0 .. nblock-1], since the
 318      previous phase destroyed it.
 319   --*/
 320   if (verb >= 4)
 321      VPrintf0 ( "        reconstructing block ...\n" );
 322   j = 0;
 323   for (i = 0; i < nblock; i++) {
 324      while (ftabCopy[j] == 0) j++;
 325      ftabCopy[j]--;
 326      eclass8[fmap[i]] = (UChar)j;
 327   }
 328   AssertH ( j < 256, 1005 );
 329}
 330
 331#undef       SET_BH
 332#undef     CLEAR_BH
 333#undef     ISSET_BH
 334#undef      WORD_BH
 335#undef UNALIGNED_BH
 336
 337
 338/*---------------------------------------------*/
 339/*--- The main, O(N^2 log(N)) sorting       ---*/
 340/*--- algorithm.  Faster for "normal"       ---*/
 341/*--- non-repetitive blocks.                ---*/
 342/*---------------------------------------------*/
 343
 344/*---------------------------------------------*/
 345static
 346__inline__
 347Bool mainGtU ( UInt32  i1, 
 348               UInt32  i2,
 349               UChar*  block, 
 350               UInt16* quadrant,
 351               UInt32  nblock,
 352               Int32*  budget )
 353{
 354   Int32  k;
 355   UChar  c1, c2;
 356   UInt16 s1, s2;
 357
 358   AssertD ( i1 != i2, "mainGtU" );
 359   /* 1 */
 360   c1 = block[i1]; c2 = block[i2];
 361   if (c1 != c2) return (c1 > c2);
 362   i1++; i2++;
 363   /* 2 */
 364   c1 = block[i1]; c2 = block[i2];
 365   if (c1 != c2) return (c1 > c2);
 366   i1++; i2++;
 367   /* 3 */
 368   c1 = block[i1]; c2 = block[i2];
 369   if (c1 != c2) return (c1 > c2);
 370   i1++; i2++;
 371   /* 4 */
 372   c1 = block[i1]; c2 = block[i2];
 373   if (c1 != c2) return (c1 > c2);
 374   i1++; i2++;
 375   /* 5 */
 376   c1 = block[i1]; c2 = block[i2];
 377   if (c1 != c2) return (c1 > c2);
 378   i1++; i2++;
 379   /* 6 */
 380   c1 = block[i1]; c2 = block[i2];
 381   if (c1 != c2) return (c1 > c2);
 382   i1++; i2++;
 383   /* 7 */
 384   c1 = block[i1]; c2 = block[i2];
 385   if (c1 != c2) return (c1 > c2);
 386   i1++; i2++;
 387   /* 8 */
 388   c1 = block[i1]; c2 = block[i2];
 389   if (c1 != c2) return (c1 > c2);
 390   i1++; i2++;
 391   /* 9 */
 392   c1 = block[i1]; c2 = block[i2];
 393   if (c1 != c2) return (c1 > c2);
 394   i1++; i2++;
 395   /* 10 */
 396   c1 = block[i1]; c2 = block[i2];
 397   if (c1 != c2) return (c1 > c2);
 398   i1++; i2++;
 399   /* 11 */
 400   c1 = block[i1]; c2 = block[i2];
 401   if (c1 != c2) return (c1 > c2);
 402   i1++; i2++;
 403   /* 12 */
 404   c1 = block[i1]; c2 = block[i2];
 405   if (c1 != c2) return (c1 > c2);
 406   i1++; i2++;
 407
 408   k = nblock + 8;
 409
 410   do {
 411      /* 1 */
 412      c1 = block[i1]; c2 = block[i2];
 413      if (c1 != c2) return (c1 > c2);
 414      s1 = quadrant[i1]; s2 = quadrant[i2];
 415      if (s1 != s2) return (s1 > s2);
 416      i1++; i2++;
 417      /* 2 */
 418      c1 = block[i1]; c2 = block[i2];
 419      if (c1 != c2) return (c1 > c2);
 420      s1 = quadrant[i1]; s2 = quadrant[i2];
 421      if (s1 != s2) return (s1 > s2);
 422      i1++; i2++;
 423      /* 3 */
 424      c1 = block[i1]; c2 = block[i2];
 425      if (c1 != c2) return (c1 > c2);
 426      s1 = quadrant[i1]; s2 = quadrant[i2];
 427      if (s1 != s2) return (s1 > s2);
 428      i1++; i2++;
 429      /* 4 */
 430      c1 = block[i1]; c2 = block[i2];
 431      if (c1 != c2) return (c1 > c2);
 432      s1 = quadrant[i1]; s2 = quadrant[i2];
 433      if (s1 != s2) return (s1 > s2);
 434      i1++; i2++;
 435      /* 5 */
 436      c1 = block[i1]; c2 = block[i2];
 437      if (c1 != c2) return (c1 > c2);
 438      s1 = quadrant[i1]; s2 = quadrant[i2];
 439      if (s1 != s2) return (s1 > s2);
 440      i1++; i2++;
 441      /* 6 */
 442      c1 = block[i1]; c2 = block[i2];
 443      if (c1 != c2) return (c1 > c2);
 444      s1 = quadrant[i1]; s2 = quadrant[i2];
 445      if (s1 != s2) return (s1 > s2);
 446      i1++; i2++;
 447      /* 7 */
 448      c1 = block[i1]; c2 = block[i2];
 449      if (c1 != c2) return (c1 > c2);
 450      s1 = quadrant[i1]; s2 = quadrant[i2];
 451      if (s1 != s2) return (s1 > s2);
 452      i1++; i2++;
 453      /* 8 */
 454      c1 = block[i1]; c2 = block[i2];
 455      if (c1 != c2) return (c1 > c2);
 456      s1 = quadrant[i1]; s2 = quadrant[i2];
 457      if (s1 != s2) return (s1 > s2);
 458      i1++; i2++;
 459
 460      if (i1 >= nblock) i1 -= nblock;
 461      if (i2 >= nblock) i2 -= nblock;
 462
 463      k -= 8;
 464      (*budget)--;
 465   }
 466      while (k >= 0);
 467
 468   return False;
 469}
 470
 471
 472/*---------------------------------------------*/
 473/*--
 474   Knuth's increments seem to work better
 475   than Incerpi-Sedgewick here.  Possibly
 476   because the number of elems to sort is
 477   usually small, typically <= 20.
 478--*/
 479static
 480Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 481                   9841, 29524, 88573, 265720,
 482                   797161, 2391484 };
 483
 484static
 485void mainSimpleSort ( UInt32* ptr,
 486                      UChar*  block,
 487                      UInt16* quadrant,
 488                      Int32   nblock,
 489                      Int32   lo, 
 490                      Int32   hi, 
 491                      Int32   d,
 492                      Int32*  budget )
 493{
 494   Int32 i, j, h, bigN, hp;
 495   UInt32 v;
 496
 497   bigN = hi - lo + 1;
 498   if (bigN < 2) return;
 499
 500   hp = 0;
 501   while (incs[hp] < bigN) hp++;
 502   hp--;
 503
 504   for (; hp >= 0; hp--) {
 505      h = incs[hp];
 506
 507      i = lo + h;
 508      while (True) {
 509
 510         /*-- copy 1 --*/
 511         if (i > hi) break;
 512         v = ptr[i];
 513         j = i;
 514         while ( mainGtU ( 
 515                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
 516                 ) ) {
 517            ptr[j] = ptr[j-h];
 518            j = j - h;
 519            if (j <= (lo + h - 1)) break;
 520         }
 521         ptr[j] = v;
 522         i++;
 523
 524         /*-- copy 2 --*/
 525         if (i > hi) break;
 526         v = ptr[i];
 527         j = i;
 528         while ( mainGtU ( 
 529                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
 530                 ) ) {
 531            ptr[j] = ptr[j-h];
 532            j = j - h;
 533            if (j <= (lo + h - 1)) break;
 534         }
 535         ptr[j] = v;
 536         i++;
 537
 538         /*-- copy 3 --*/
 539         if (i > hi) break;
 540         v = ptr[i];
 541         j = i;
 542         while ( mainGtU ( 
 543                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
 544                 ) ) {
 545            ptr[j] = ptr[j-h];
 546            j = j - h;
 547            if (j <= (lo + h - 1)) break;
 548         }
 549         ptr[j] = v;
 550         i++;
 551
 552         if (*budget < 0) return;
 553      }
 554   }
 555}
 556
 557
 558/*---------------------------------------------*/
 559/*--
 560   The following is an implementation of
 561   an elegant 3-way quicksort for strings,
 562   described in a paper "Fast Algorithms for
 563   Sorting and Searching Strings", by Robert
 564   Sedgewick and Jon L. Bentley.
 565--*/
 566
 567#define mswap(zz1, zz2) \
 568   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 569
 570#define mvswap(zzp1, zzp2, zzn)       \
 571{                                     \
 572   Int32 yyp1 = (zzp1);               \
 573   Int32 yyp2 = (zzp2);               \
 574   Int32 yyn  = (zzn);                \
 575   while (yyn > 0) {                  \
 576      mswap(ptr[yyp1], ptr[yyp2]);    \
 577      yyp1++; yyp2++; yyn--;          \
 578   }                                  \
 579}
 580
 581static 
 582__inline__
 583UChar mmed3 ( UChar a, UChar b, UChar c )
 584{
 585   UChar t;
 586   if (a > b) { t = a; a = b; b = t; };
 587   if (b > c) { 
 588      b = c;
 589      if (a > b) b = a;
 590   }
 591   return b;
 592}
 593
 594#define mmin(a,b) ((a) < (b)) ? (a) : (b)
 595
 596#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 597                          stackHi[sp] = hz; \
 598                          stackD [sp] = dz; \
 599                          sp++; }
 600
 601#define mpop(lz,hz,dz) { sp--;             \
 602                         lz = stackLo[sp]; \
 603                         hz = stackHi[sp]; \
 604                         dz = stackD [sp]; }
 605
 606
 607#define mnextsize(az) (nextHi[az]-nextLo[az])
 608
 609#define mnextswap(az,bz)                                        \
 610   { Int32 tz;                                                  \
 611     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 612     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 613     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 614
 615
 616#define MAIN_QSORT_SMALL_THRESH 20
 617#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 618#define MAIN_QSORT_STACK_SIZE 100
 619
 620static
 621void mainQSort3 ( UInt32* ptr,
 622                  UChar*  block,
 623                  UInt16* quadrant,
 624                  Int32   nblock,
 625                  Int32   loSt, 
 626                  Int32   hiSt, 
 627                  Int32   dSt,
 628                  Int32*  budget )
 629{
 630   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 631   Int32 sp, lo, hi, d;
 632
 633   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 634   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 635   Int32 stackD [MAIN_QSORT_STACK_SIZE];
 636
 637   Int32 nextLo[3];
 638   Int32 nextHi[3];
 639   Int32 nextD [3];
 640
 641   sp = 0;
 642   mpush ( loSt, hiSt, dSt );
 643
 644   while (sp > 0) {
 645
 646      AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
 647
 648      mpop ( lo, hi, d );
 649      if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
 650          d > MAIN_QSORT_DEPTH_THRESH) {
 651         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 652         if (*budget < 0) return;
 653         continue;
 654      }
 655
 656      med = (Int32) 
 657            mmed3 ( block[ptr[ lo         ]+d],
 658                    block[ptr[ hi         ]+d],
 659                    block[ptr[ (lo+hi)>>1 ]+d] );
 660
 661      unLo = ltLo = lo;
 662      unHi = gtHi = hi;
 663
 664      while (True) {
 665         while (True) {
 666            if (unLo > unHi) break;
 667            n = ((Int32)block[ptr[unLo]+d]) - med;
 668            if (n == 0) { 
 669               mswap(ptr[unLo], ptr[ltLo]); 
 670               ltLo++; unLo++; continue; 
 671            };
 672            if (n >  0) break;
 673            unLo++;
 674         }
 675         while (True) {
 676            if (unLo > unHi) break;
 677            n = ((Int32)block[ptr[unHi]+d]) - med;
 678            if (n == 0) { 
 679               mswap(ptr[unHi], ptr[gtHi]); 
 680               gtHi--; unHi--; continue; 
 681            };
 682            if (n <  0) break;
 683            unHi--;
 684         }
 685         if (unLo > unHi) break;
 686         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 687      }
 688
 689      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 690
 691      if (gtHi < ltLo) {
 692         mpush(lo, hi, d+1 );
 693         continue;
 694      }
 695
 696      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 697      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 698
 699      n = lo + unLo - ltLo - 1;
 700      m = hi - (gtHi - unHi) + 1;
 701
 702      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 703      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 704      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 705
 706      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 707      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 708      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 709
 710      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 711      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 712
 713      mpush (nextLo[0], nextHi[0], nextD[0]);
 714      mpush (nextLo[1], nextHi[1], nextD[1]);
 715      mpush (nextLo[2], nextHi[2], nextD[2]);
 716   }
 717}
 718
 719#undef mswap
 720#undef mvswap
 721#undef mpush
 722#undef mpop
 723#undef mmin
 724#undef mnextsize
 725#undef mnextswap
 726#undef MAIN_QSORT_SMALL_THRESH
 727#undef MAIN_QSORT_DEPTH_THRESH
 728#undef MAIN_QSORT_STACK_SIZE
 729
 730
 731/*---------------------------------------------*/
 732/* Pre:
 733      nblock > N_OVERSHOOT
 734      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 735      ((UChar*)block32) [0 .. nblock-1] holds block
 736      ptr exists for [0 .. nblock-1]
 737
 738   Post:
 739      ((UChar*)block32) [0 .. nblock-1] holds block
 740      All other areas of block32 destroyed
 741      ftab [0 .. 65536 ] destroyed
 742      ptr [0 .. nblock-1] holds sorted order
 743      if (*budget < 0), sorting was abandoned
 744*/
 745
 746#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 747#define SETMASK (1 << 21)
 748#define CLEARMASK (~(SETMASK))
 749
 750static
 751void mainSort ( UInt32* ptr, 
 752                UChar*  block,
 753                UInt16* quadrant, 
 754                UInt32* ftab,
 755                Int32   nblock,
 756                Int32   verb,
 757                Int32*  budget )
 758{
 759   Int32  i, j, k, ss, sb;
 760   Int32  runningOrder[256];
 761   Bool   bigDone[256];
 762   Int32  copyStart[256];
 763   Int32  copyEnd  [256];
 764   UChar  c1;
 765   Int32  numQSorted;
 766   UInt16 s;
 767   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 768
 769   /*-- set up the 2-byte frequency table --*/
 770   for (i = 65536; i >= 0; i--) ftab[i] = 0;
 771
 772   j = block[0] << 8;
 773   i = nblock-1;
 774   for (; i >= 3; i -= 4) {
 775      quadrant[i] = 0;
 776      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 777      ftab[j]++;
 778      quadrant[i-1] = 0;
 779      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 780      ftab[j]++;
 781      quadrant[i-2] = 0;
 782      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 783      ftab[j]++;
 784      quadrant[i-3] = 0;
 785      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 786      ftab[j]++;
 787   }
 788   for (; i >= 0; i--) {
 789      quadrant[i] = 0;
 790      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 791      ftab[j]++;
 792   }
 793
 794   /*-- (emphasises close relationship of block & quadrant) --*/
 795   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 796      block   [nblock+i] = block[i];
 797      quadrant[nblock+i] = 0;
 798   }
 799
 800   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 801
 802   /*-- Complete the initial radix sort --*/
 803   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 804
 805   s = block[0] << 8;
 806   i = nblock-1;
 807   for (; i >= 3; i -= 4) {
 808      s = (s >> 8) | (block[i] << 8);
 809      j = ftab[s] -1;
 810      ftab[s] = j;
 811      ptr[j] = i;
 812      s = (s >> 8) | (block[i-1] << 8);
 813      j = ftab[s] -1;
 814      ftab[s] = j;
 815      ptr[j] = i-1;
 816      s = (s >> 8) | (block[i-2] << 8);
 817      j = ftab[s] -1;
 818      ftab[s] = j;
 819      ptr[j] = i-2;
 820      s = (s >> 8) | (block[i-3] << 8);
 821      j = ftab[s] -1;
 822      ftab[s] = j;
 823      ptr[j] = i-3;
 824   }
 825   for (; i >= 0; i--) {
 826      s = (s >> 8) | (block[i] << 8);
 827      j = ftab[s] -1;
 828      ftab[s] = j;
 829      ptr[j] = i;
 830   }
 831
 832   /*--
 833      Now ftab contains the first loc of every small bucket.
 834      Calculate the running order, from smallest to largest
 835      big bucket.
 836   --*/
 837   for (i = 0; i <= 255; i++) {
 838      bigDone     [i] = False;
 839      runningOrder[i] = i;
 840   }
 841
 842   {
 843      Int32 vv;
 844      Int32 h = 1;
 845      do h = 3 * h + 1; while (h <= 256);
 846      do {
 847         h = h / 3;
 848         for (i = h; i <= 255; i++) {
 849            vv = runningOrder[i];
 850            j = i;
 851            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 852               runningOrder[j] = runningOrder[j-h];
 853               j = j - h;
 854               if (j <= (h - 1)) goto zero;
 855            }
 856            zero:
 857            runningOrder[j] = vv;
 858         }
 859      } while (h != 1);
 860   }
 861
 862   /*--
 863      The main sorting loop.
 864   --*/
 865
 866   numQSorted = 0;
 867
 868   for (i = 0; i <= 255; i++) {
 869
 870      /*--
 871         Process big buckets, starting with the least full.
 872         Basically this is a 3-step process in which we call
 873         mainQSort3 to sort the small buckets [ss, j], but
 874         also make a big effort to avoid the calls if we can.
 875      --*/
 876      ss = runningOrder[i];
 877
 878      /*--
 879         Step 1:
 880         Complete the big bucket [ss] by quicksorting
 881         any unsorted small buckets [ss, j], for j != ss.  
 882         Hopefully previous pointer-scanning phases have already
 883         completed many of the small buckets [ss, j], so
 884         we don't have to sort them at all.
 885      --*/
 886      for (j = 0; j <= 255; j++) {
 887         if (j != ss) {
 888            sb = (ss << 8) + j;
 889            if ( ! (ftab[sb] & SETMASK) ) {
 890               Int32 lo = ftab[sb]   & CLEARMASK;
 891               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 892               if (hi > lo) {
 893                  if (verb >= 4)
 894                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 895                                "done %d   this %d\n",
 896                                ss, j, numQSorted, hi - lo + 1 );
 897                  mainQSort3 ( 
 898                     ptr, block, quadrant, nblock, 
 899                     lo, hi, BZ_N_RADIX, budget 
 900                  );   
 901                  numQSorted += (hi - lo + 1);
 902                  if (*budget < 0) return;
 903               }
 904            }
 905            ftab[sb] |= SETMASK;
 906         }
 907      }
 908
 909      AssertH ( !bigDone[ss], 1006 );
 910
 911      /*--
 912         Step 2:
 913         Now scan this big bucket [ss] so as to synthesise the
 914         sorted order for small buckets [t, ss] for all t,
 915         including, magically, the bucket [ss,ss] too.
 916         This will avoid doing Real Work in subsequent Step 1's.
 917      --*/
 918      {
 919         for (j = 0; j <= 255; j++) {
 920            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 921            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 922         }
 923         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 924            k = ptr[j]-1; if (k < 0) k += nblock;
 925            c1 = block[k];
 926            if (!bigDone[c1])
 927               ptr[ copyStart[c1]++ ] = k;
 928         }
 929         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 930            k = ptr[j]-1; if (k < 0) k += nblock;
 931            c1 = block[k];
 932            if (!bigDone[c1]) 
 933               ptr[ copyEnd[c1]-- ] = k;
 934         }
 935      }
 936
 937      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
 938                || 
 939                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 940                   Necessity for this case is demonstrated by compressing 
 941                   a sequence of approximately 48.5 million of character 
 942                   251; 1.0.0/1.0.1 will then die here. */
 943                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
 944                1007 )
 945
 946      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 947
 948      /*--
 949         Step 3:
 950         The [ss] big bucket is now done.  Record this fact,
 951         and update the quadrant descriptors.  Remember to
 952         update quadrants in the overshoot area too, if
 953         necessary.  The "if (i < 255)" test merely skips
 954         this updating for the last bucket processed, since
 955         updating for the last bucket is pointless.
 956
 957         The quadrant array provides a way to incrementally
 958         cache sort orderings, as they appear, so as to 
 959         make subsequent comparisons in fullGtU() complete
 960         faster.  For repetitive blocks this makes a big
 961         difference (but not big enough to be able to avoid
 962         the fallback sorting mechanism, exponential radix sort).
 963
 964         The precise meaning is: at all times:
 965
 966            for 0 <= i < nblock and 0 <= j <= nblock
 967
 968            if block[i] != block[j], 
 969
 970               then the relative values of quadrant[i] and 
 971                    quadrant[j] are meaningless.
 972
 973               else {
 974                  if quadrant[i] < quadrant[j]
 975                     then the string starting at i lexicographically
 976                     precedes the string starting at j
 977
 978                  else if quadrant[i] > quadrant[j]
 979                     then the string starting at j lexicographically
 980                     precedes the string starting at i
 981
 982                  else
 983                     the relative ordering of the strings starting
 984                     at i and j has not yet been determined.
 985               }
 986      --*/
 987      bigDone[ss] = True;
 988
 989      if (i < 255) {
 990         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
 991         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
 992         Int32 shifts   = 0;
 993
 994         while ((bbSize >> shifts) > 65534) shifts++;
 995
 996         for (j = bbSize-1; j >= 0; j--) {
 997            Int32 a2update     = ptr[bbStart + j];
 998            UInt16 qVal        = (UInt16)(j >> shifts);
 999            quadrant[a2update] = qVal;
1000            if (a2update < BZ_N_OVERSHOOT)
1001               quadrant[a2update + nblock] = qVal;
1002         }
1003         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004      }
1005
1006   }
1007
1008   if (verb >= 4)
1009      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1010                 nblock, numQSorted, nblock - numQSorted );
1011}
1012
1013#undef BIGFREQ
1014#undef SETMASK
1015#undef CLEARMASK
1016
1017
1018/*---------------------------------------------*/
1019/* Pre:
1020      nblock > 0
1021      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022      ((UChar*)arr2)  [0 .. nblock-1] holds block
1023      arr1 exists for [0 .. nblock-1]
1024
1025   Post:
1026      ((UChar*)arr2) [0 .. nblock-1] holds block
1027      All other areas of block destroyed
1028      ftab [ 0 .. 65536 ] destroyed
1029      arr1 [0 .. nblock-1] holds sorted order
1030*/
1031void BZ2_blockSort ( EState* s )
1032{
1033   UInt32* ptr    = s->ptr; 
1034   UChar*  block  = s->block;
1035   UInt32* ftab   = s->ftab;
1036   Int32   nblock = s->nblock;
1037   Int32   verb   = s->verbosity;
1038   Int32   wfact  = s->workFactor;
1039   UInt16* quadrant;
1040   Int32   budget;
1041   Int32   budgetInit;
1042   Int32   i;
1043
1044   if (nblock < 10000) {
1045      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046   } else {
1047      /* Calculate the location for quadrant, remembering to get
1048         the alignment right.  Assumes that &(block[0]) is at least
1049         2-byte aligned -- this should be ok since block is really
1050         the first section of arr2.
1051      */
1052      i = nblock+BZ_N_OVERSHOOT;
1053      if (i & 1) i++;
1054      quadrant = (UInt16*)(&(block[i]));
1055
1056      /* (wfact-1) / 3 puts the default-factor-30
1057         transition point at very roughly the same place as 
1058         with v0.1 and v0.9.0.  
1059         Not that it particularly matters any more, since the
1060         resulting compressed stream is now the same regardless
1061         of whether or not we use the main sort or fallback sort.
1062      */
1063      if (wfact < 1  ) wfact = 1;
1064      if (wfact > 100) wfact = 100;
1065      budgetInit = nblock * ((wfact-1) / 3);
1066      budget = budgetInit;
1067
1068      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069      if (verb >= 3) 
1070         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1071                    budgetInit - budget,
1072                    nblock, 
1073                    (float)(budgetInit - budget) /
1074                    (float)(nblock==0 ? 1 : nblock) ); 
1075      if (budget < 0) {
1076         if (verb >= 2) 
1077            VPrintf0 ( "    too repetitive; using fallback"
1078                       " sorting algorithm\n" );
1079         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080      }
1081   }
1082
1083   s->origPtr = -1;
1084   for (i = 0; i < s->nblock; i++)
1085      if (ptr[i] == 0)
1086         { s->origPtr = i; break; };
1087
1088   AssertH( s->origPtr != -1, 1003 );
1089}
1090
1091
1092/*-------------------------------------------------------------*/
1093/*--- end                                       blocksort.c ---*/
1094/*-------------------------------------------------------------*/