xref: /plan9/sys/src/cmd/bzip2/lib/blocksort.c (revision 59cc4ca53493a3c6d2349fe2b7f7c40f7dce7294)
1 
2 /*-------------------------------------------------------------*/
3 /*--- Block sorting machinery                               ---*/
4 /*---                                           blocksort.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /*--
8   This file is a part of bzip2 and/or libbzip2, a program and
9   library for lossless, block-sorting data compression.
10 
11   Copyright (C) 1996-2000 Julian R Seward.  All rights reserved.
12 
13   Redistribution and use in source and binary forms, with or without
14   modification, are permitted provided that the following conditions
15   are met:
16 
17   1. Redistributions of source code must retain the above copyright
18      notice, this list of conditions and the following disclaimer.
19 
20   2. The origin of this software must not be misrepresented; you must
21      not claim that you wrote the original software.  If you use this
22      software in a product, an acknowledgment in the product
23      documentation would be appreciated but is not required.
24 
25   3. Altered source versions must be plainly marked as such, and must
26      not be misrepresented as being the original software.
27 
28   4. The name of the author may not be used to endorse or promote
29      products derived from this software without specific prior written
30      permission.
31 
32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 
44   Julian Seward, Cambridge, UK.
45   jseward@acm.org
46   bzip2/libbzip2 version 1.0 of 21 March 2000
47 
48   This program is based on (at least) the work of:
49      Mike Burrows
50      David Wheeler
51      Peter Fenwick
52      Alistair Moffat
53      Radford Neal
54      Ian H. Witten
55      Robert Sedgewick
56      Jon L. Bentley
57 
58   For more information on these sources, see the manual.
59 
60   To get some idea how the block sorting algorithms in this file
61   work, read my paper
62      On the Performance of BWT Sorting Algorithms
63   in Proceedings of the IEEE Data Compression Conference 2000,
64   Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
65   file implements the algorithm called  cache  in the paper.
66 --*/
67 
68 #include "os.h"
69 #include "bzlib.h"
70 #include "bzlib_private.h"
71 
72 /*---------------------------------------------*/
73 /*--- Fallback O(N log(N)^2) sorting        ---*/
74 /*--- algorithm, for repetitive blocks      ---*/
75 /*---------------------------------------------*/
76 
77 /*---------------------------------------------*/
78 static
79 __inline__
fallbackSimpleSort(UInt32 * fmap,UInt32 * eclass,Int32 lo,Int32 hi)80 void fallbackSimpleSort ( UInt32* fmap,
81                           UInt32* eclass,
82                           Int32   lo,
83                           Int32   hi )
84 {
85    Int32 i, j, tmp;
86    UInt32 ec_tmp;
87 
88    if (lo == hi) return;
89 
90    if (hi - lo > 3) {
91       for ( i = hi-4; i >= lo; i-- ) {
92          tmp = fmap[i];
93          ec_tmp = eclass[tmp];
94          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
95             fmap[j-4] = fmap[j];
96          fmap[j-4] = tmp;
97       }
98    }
99 
100    for ( i = hi-1; i >= lo; i-- ) {
101       tmp = fmap[i];
102       ec_tmp = eclass[tmp];
103       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
104          fmap[j-1] = fmap[j];
105       fmap[j-1] = tmp;
106    }
107 }
108 
109 
110 /*---------------------------------------------*/
111 #define fswap(zz1, zz2) \
112    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
113 
114 #define fvswap(zzp1, zzp2, zzn)       \
115 {                                     \
116    Int32 yyp1 = (zzp1);               \
117    Int32 yyp2 = (zzp2);               \
118    Int32 yyn  = (zzn);                \
119    while (yyn > 0) {                  \
120       fswap(fmap[yyp1], fmap[yyp2]);  \
121       yyp1++; yyp2++; yyn--;          \
122    }                                  \
123 }
124 
125 
126 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
127 
128 #define fpush(lz,hz) { stackLo[sp] = lz; \
129                        stackHi[sp] = hz; \
130                        sp++; }
131 
132 #define fpop(lz,hz) { sp--;              \
133                       lz = stackLo[sp];  \
134                       hz = stackHi[sp]; }
135 
136 #define FALLBACK_QSORT_SMALL_THRESH 10
137 #define FALLBACK_QSORT_STACK_SIZE   100
138 
139 
140 static
fallbackQSort3(UInt32 * fmap,UInt32 * eclass,Int32 loSt,Int32 hiSt)141 void fallbackQSort3 ( UInt32* fmap,
142                       UInt32* eclass,
143                       Int32   loSt,
144                       Int32   hiSt )
145 {
146    Int32 unLo, unHi, ltLo, gtHi, n, m;
147    Int32 sp, lo, hi;
148    UInt32 med, r, r3;
149    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
150    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
151 
152    r = 0;
153 
154    sp = 0;
155    fpush ( loSt, hiSt );
156 
157    while (sp > 0) {
158 
159       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
160 
161       fpop ( lo, hi );
162       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
163          fallbackSimpleSort ( fmap, eclass, lo, hi );
164          continue;
165       }
166 
167       /* Random partitioning.  Median of 3 sometimes fails to
168          avoid bad cases.  Median of 9 seems to help but
169          looks rather expensive.  This too seems to work but
170          is cheaper.  Guidance for the magic constants
171          7621 and 32768 is taken from Sedgewick's algorithms
172          book, chapter 35.
173       */
174       r = ((r * 7621) + 1) % 32768;
175       r3 = r % 3;
176       if (r3 == 0) med = eclass[fmap[lo]]; else
177       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
178                    med = eclass[fmap[hi]];
179 
180       unLo = ltLo = lo;
181       unHi = gtHi = hi;
182 
183       while (1) {
184          while (1) {
185             if (unLo > unHi) break;
186             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
187             if (n == 0) {
188                fswap(fmap[unLo], fmap[ltLo]);
189                ltLo++; unLo++;
190                continue;
191             };
192             if (n > 0) break;
193             unLo++;
194          }
195          while (1) {
196             if (unLo > unHi) break;
197             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
198             if (n == 0) {
199                fswap(fmap[unHi], fmap[gtHi]);
200                gtHi--; unHi--;
201                continue;
202             };
203             if (n < 0) break;
204             unHi--;
205          }
206          if (unLo > unHi) break;
207          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
208       }
209 
210       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
211 
212       if (gtHi < ltLo) continue;
213 
214       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
215       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
216 
217       n = lo + unLo - ltLo - 1;
218       m = hi - (gtHi - unHi) + 1;
219 
220       if (n - lo > hi - m) {
221          fpush ( lo, n );
222          fpush ( m, hi );
223       } else {
224          fpush ( m, hi );
225          fpush ( lo, n );
226       }
227    }
228 }
229 
230 #undef fmin
231 #undef fpush
232 #undef fpop
233 #undef fswap
234 #undef fvswap
235 #undef FALLBACK_QSORT_SMALL_THRESH
236 #undef FALLBACK_QSORT_STACK_SIZE
237 
238 
239 /*---------------------------------------------*/
240 /* Pre:
241       nblock > 0
242       eclass exists for [0 .. nblock-1]
243       ((UChar*)eclass) [0 .. nblock-1] holds block
244       ptr exists for [0 .. nblock-1]
245 
246    Post:
247       ((UChar*)eclass) [0 .. nblock-1] holds block
248       All other areas of eclass destroyed
249       fmap [0 .. nblock-1] holds sorted order
250       bhtab [ 0 .. 2+(nblock/32) ] destroyed
251 */
252 
253 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
254 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
255 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
256 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
257 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
258 
259 static
fallbackSort(UInt32 * fmap,UInt32 * eclass,UInt32 * bhtab,Int32 nblock,Int32 verb)260 void fallbackSort ( UInt32* fmap,
261                     UInt32* eclass,
262                     UInt32* bhtab,
263                     Int32   nblock,
264                     Int32   verb )
265 {
266    Int32 ftab[257];
267    Int32 ftabCopy[256];
268    Int32 H, i, j, k, l, r, cc, cc1;
269    Int32 nNotDone;
270    Int32 nBhtab;
271    UChar* eclass8 = (UChar*)eclass;
272 
273    /*--
274       Initial 1-char radix sort to generate
275       initial fmap and initial BH bits.
276    --*/
277    if (verb >= 4)
278       VPrintf0 ( "        bucket sorting ...\n" );
279    for (i = 0; i < 257;    i++) ftab[i] = 0;
280    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
281    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
282    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
283 
284    for (i = 0; i < nblock; i++) {
285       j = eclass8[i];
286       k = ftab[j] - 1;
287       ftab[j] = k;
288       fmap[k] = i;
289    }
290 
291    nBhtab = 2 + (nblock / 32);
292    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
293    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
294 
295    /*--
296       Inductively refine the buckets.  Kind-of an
297       "exponential radix sort" (!), inspired by the
298       Manber-Myers suffix array construction algorithm.
299    --*/
300 
301    /*-- set sentinel bits for block-end detection --*/
302    for (i = 0; i < 32; i++) {
303       SET_BH(nblock + 2*i);
304       CLEAR_BH(nblock + 2*i + 1);
305    }
306 
307    /*-- the log(N) loop --*/
308    H = 1;
309    while (1) {
310 
311       if (verb >= 4)
312          VPrintf1 ( "        depth %6d has ", H );
313 
314       j = 0;
315       for (i = 0; i < nblock; i++) {
316          if (ISSET_BH(i)) j = i;
317          k = fmap[i] - H; if (k < 0) k += nblock;
318          eclass[k] = j;
319       }
320 
321       nNotDone = 0;
322       r = -1;
323       while (1) {
324 
325 	 /*-- find the next non-singleton bucket --*/
326          k = r + 1;
327          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
328          if (ISSET_BH(k)) {
329             while (WORD_BH(k) == 0xffffffff) k += 32;
330             while (ISSET_BH(k)) k++;
331          }
332          l = k - 1;
333          if (l >= nblock) break;
334          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
335          if (!ISSET_BH(k)) {
336             while (WORD_BH(k) == 0x00000000) k += 32;
337             while (!ISSET_BH(k)) k++;
338          }
339          r = k - 1;
340          if (r >= nblock) break;
341 
342          /*-- now [l, r] bracket current bucket --*/
343          if (r > l) {
344             nNotDone += (r - l + 1);
345             fallbackQSort3 ( fmap, eclass, l, r );
346 
347             /*-- scan bucket and generate header bits-- */
348             cc = -1;
349             for (i = l; i <= r; i++) {
350                cc1 = eclass[fmap[i]];
351                if (cc != cc1) { SET_BH(i); cc = cc1; };
352             }
353          }
354       }
355 
356       if (verb >= 4)
357          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
358 
359       H *= 2;
360       if (H > nblock || nNotDone == 0) break;
361    }
362 
363    /*--
364       Reconstruct the original block in
365       eclass8 [0 .. nblock-1], since the
366       previous phase destroyed it.
367    --*/
368    if (verb >= 4)
369       VPrintf0 ( "        reconstructing block ...\n" );
370    j = 0;
371    for (i = 0; i < nblock; i++) {
372       while (ftabCopy[j] == 0) j++;
373       ftabCopy[j]--;
374       eclass8[fmap[i]] = (UChar)j;
375    }
376    AssertH ( j < 256, 1005 );
377 }
378 
379 #undef       SET_BH
380 #undef     CLEAR_BH
381 #undef     ISSET_BH
382 #undef      WORD_BH
383 #undef UNALIGNED_BH
384 
385 
386 /*---------------------------------------------*/
387 /*--- The main, O(N^2 log(N)) sorting       ---*/
388 /*--- algorithm.  Faster for "normal"       ---*/
389 /*--- non-repetitive blocks.                ---*/
390 /*---------------------------------------------*/
391 
392 /*---------------------------------------------*/
393 static
394 __inline__
mainGtU(UInt32 i1,UInt32 i2,UChar * block,UInt16 * quadrant,UInt32 nblock,Int32 * budget)395 Bool mainGtU ( UInt32  i1,
396                UInt32  i2,
397                UChar*  block,
398                UInt16* quadrant,
399                UInt32  nblock,
400                Int32*  budget )
401 {
402    Int32  k;
403    UChar  c1, c2;
404    UInt16 s1, s2;
405 
406    AssertD ( i1 != i2, "mainGtU" );
407    /* 1 */
408    c1 = block[i1]; c2 = block[i2];
409    if (c1 != c2) return (c1 > c2);
410    i1++; i2++;
411    /* 2 */
412    c1 = block[i1]; c2 = block[i2];
413    if (c1 != c2) return (c1 > c2);
414    i1++; i2++;
415    /* 3 */
416    c1 = block[i1]; c2 = block[i2];
417    if (c1 != c2) return (c1 > c2);
418    i1++; i2++;
419    /* 4 */
420    c1 = block[i1]; c2 = block[i2];
421    if (c1 != c2) return (c1 > c2);
422    i1++; i2++;
423    /* 5 */
424    c1 = block[i1]; c2 = block[i2];
425    if (c1 != c2) return (c1 > c2);
426    i1++; i2++;
427    /* 6 */
428    c1 = block[i1]; c2 = block[i2];
429    if (c1 != c2) return (c1 > c2);
430    i1++; i2++;
431    /* 7 */
432    c1 = block[i1]; c2 = block[i2];
433    if (c1 != c2) return (c1 > c2);
434    i1++; i2++;
435    /* 8 */
436    c1 = block[i1]; c2 = block[i2];
437    if (c1 != c2) return (c1 > c2);
438    i1++; i2++;
439    /* 9 */
440    c1 = block[i1]; c2 = block[i2];
441    if (c1 != c2) return (c1 > c2);
442    i1++; i2++;
443    /* 10 */
444    c1 = block[i1]; c2 = block[i2];
445    if (c1 != c2) return (c1 > c2);
446    i1++; i2++;
447    /* 11 */
448    c1 = block[i1]; c2 = block[i2];
449    if (c1 != c2) return (c1 > c2);
450    i1++; i2++;
451    /* 12 */
452    c1 = block[i1]; c2 = block[i2];
453    if (c1 != c2) return (c1 > c2);
454    i1++; i2++;
455 
456    k = nblock + 8;
457 
458    do {
459       /* 1 */
460       c1 = block[i1]; c2 = block[i2];
461       if (c1 != c2) return (c1 > c2);
462       s1 = quadrant[i1]; s2 = quadrant[i2];
463       if (s1 != s2) return (s1 > s2);
464       i1++; i2++;
465       /* 2 */
466       c1 = block[i1]; c2 = block[i2];
467       if (c1 != c2) return (c1 > c2);
468       s1 = quadrant[i1]; s2 = quadrant[i2];
469       if (s1 != s2) return (s1 > s2);
470       i1++; i2++;
471       /* 3 */
472       c1 = block[i1]; c2 = block[i2];
473       if (c1 != c2) return (c1 > c2);
474       s1 = quadrant[i1]; s2 = quadrant[i2];
475       if (s1 != s2) return (s1 > s2);
476       i1++; i2++;
477       /* 4 */
478       c1 = block[i1]; c2 = block[i2];
479       if (c1 != c2) return (c1 > c2);
480       s1 = quadrant[i1]; s2 = quadrant[i2];
481       if (s1 != s2) return (s1 > s2);
482       i1++; i2++;
483       /* 5 */
484       c1 = block[i1]; c2 = block[i2];
485       if (c1 != c2) return (c1 > c2);
486       s1 = quadrant[i1]; s2 = quadrant[i2];
487       if (s1 != s2) return (s1 > s2);
488       i1++; i2++;
489       /* 6 */
490       c1 = block[i1]; c2 = block[i2];
491       if (c1 != c2) return (c1 > c2);
492       s1 = quadrant[i1]; s2 = quadrant[i2];
493       if (s1 != s2) return (s1 > s2);
494       i1++; i2++;
495       /* 7 */
496       c1 = block[i1]; c2 = block[i2];
497       if (c1 != c2) return (c1 > c2);
498       s1 = quadrant[i1]; s2 = quadrant[i2];
499       if (s1 != s2) return (s1 > s2);
500       i1++; i2++;
501       /* 8 */
502       c1 = block[i1]; c2 = block[i2];
503       if (c1 != c2) return (c1 > c2);
504       s1 = quadrant[i1]; s2 = quadrant[i2];
505       if (s1 != s2) return (s1 > s2);
506       i1++; i2++;
507 
508       if (i1 >= nblock) i1 -= nblock;
509       if (i2 >= nblock) i2 -= nblock;
510 
511       k -= 8;
512       (*budget)--;
513    }
514       while (k >= 0);
515 
516    return False;
517 }
518 
519 
520 /*---------------------------------------------*/
521 /*--
522    Knuth's increments seem to work better
523    than Incerpi-Sedgewick here.  Possibly
524    because the number of elems to sort is
525    usually small, typically <= 20.
526 --*/
527 static
528 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
529                    9841, 29524, 88573, 265720,
530                    797161, 2391484 };
531 
532 static
mainSimpleSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 lo,Int32 hi,Int32 d,Int32 * budget)533 void mainSimpleSort ( UInt32* ptr,
534                       UChar*  block,
535                       UInt16* quadrant,
536                       Int32   nblock,
537                       Int32   lo,
538                       Int32   hi,
539                       Int32   d,
540                       Int32*  budget )
541 {
542    Int32 i, j, h, bigN, hp;
543    UInt32 v;
544 
545    bigN = hi - lo + 1;
546    if (bigN < 2) return;
547 
548    hp = 0;
549    while (incs[hp] < bigN) hp++;
550    hp--;
551 
552    for (; hp >= 0; hp--) {
553       h = incs[hp];
554 
555       i = lo + h;
556       while (True) {
557 
558          /*-- copy 1 --*/
559          if (i > hi) break;
560          v = ptr[i];
561          j = i;
562          while ( mainGtU (
563                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
564                  ) ) {
565             ptr[j] = ptr[j-h];
566             j = j - h;
567             if (j <= (lo + h - 1)) break;
568          }
569          ptr[j] = v;
570          i++;
571 
572          /*-- copy 2 --*/
573          if (i > hi) break;
574          v = ptr[i];
575          j = i;
576          while ( mainGtU (
577                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
578                  ) ) {
579             ptr[j] = ptr[j-h];
580             j = j - h;
581             if (j <= (lo + h - 1)) break;
582          }
583          ptr[j] = v;
584          i++;
585 
586          /*-- copy 3 --*/
587          if (i > hi) break;
588          v = ptr[i];
589          j = i;
590          while ( mainGtU (
591                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
592                  ) ) {
593             ptr[j] = ptr[j-h];
594             j = j - h;
595             if (j <= (lo + h - 1)) break;
596          }
597          ptr[j] = v;
598          i++;
599 
600          if (*budget < 0) return;
601       }
602    }
603 }
604 
605 
606 /*---------------------------------------------*/
607 /*--
608    The following is an implementation of
609    an elegant 3-way quicksort for strings,
610    described in a paper "Fast Algorithms for
611    Sorting and Searching Strings", by Robert
612    Sedgewick and Jon L. Bentley.
613 --*/
614 
615 #define mswap(zz1, zz2) \
616    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
617 
618 #define mvswap(zzp1, zzp2, zzn)       \
619 {                                     \
620    Int32 yyp1 = (zzp1);               \
621    Int32 yyp2 = (zzp2);               \
622    Int32 yyn  = (zzn);                \
623    while (yyn > 0) {                  \
624       mswap(ptr[yyp1], ptr[yyp2]);    \
625       yyp1++; yyp2++; yyn--;          \
626    }                                  \
627 }
628 
629 static
630 __inline__
mmed3(UChar a,UChar b,UChar c)631 UChar mmed3 ( UChar a, UChar b, UChar c )
632 {
633    UChar t;
634    if (a > b) { t = a; a = b; b = t; };
635    if (b > c) {
636       b = c;
637       if (a > b) b = a;
638    }
639    return b;
640 }
641 
642 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
643 
644 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
645                           stackHi[sp] = hz; \
646                           stackD [sp] = dz; \
647                           sp++; }
648 
649 #define mpop(lz,hz,dz) { sp--;             \
650                          lz = stackLo[sp]; \
651                          hz = stackHi[sp]; \
652                          dz = stackD [sp]; }
653 
654 
655 #define mnextsize(az) (nextHi[az]-nextLo[az])
656 
657 #define mnextswap(az,bz)                                        \
658    { Int32 tz;                                                  \
659      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
660      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
661      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
662 
663 
664 #define MAIN_QSORT_SMALL_THRESH 20
665 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
666 #define MAIN_QSORT_STACK_SIZE 100
667 
668 static
mainQSort3(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 loSt,Int32 hiSt,Int32 dSt,Int32 * budget)669 void mainQSort3 ( UInt32* ptr,
670                   UChar*  block,
671                   UInt16* quadrant,
672                   Int32   nblock,
673                   Int32   loSt,
674                   Int32   hiSt,
675                   Int32   dSt,
676                   Int32*  budget )
677 {
678    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
679    Int32 sp, lo, hi, d;
680 
681    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
682    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
683    Int32 stackD [MAIN_QSORT_STACK_SIZE];
684 
685    Int32 nextLo[3];
686    Int32 nextHi[3];
687    Int32 nextD [3];
688 
689    sp = 0;
690    mpush ( loSt, hiSt, dSt );
691 
692    while (sp > 0) {
693 
694       AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
695 
696       mpop ( lo, hi, d );
697       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
698           d > MAIN_QSORT_DEPTH_THRESH) {
699          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
700          if (*budget < 0) return;
701          continue;
702       }
703 
704       med = (Int32)
705             mmed3 ( block[ptr[ lo         ]+d],
706                     block[ptr[ hi         ]+d],
707                     block[ptr[ (lo+hi)>>1 ]+d] );
708 
709       unLo = ltLo = lo;
710       unHi = gtHi = hi;
711 
712       while (True) {
713          while (True) {
714             if (unLo > unHi) break;
715             n = ((Int32)block[ptr[unLo]+d]) - med;
716             if (n == 0) {
717                mswap(ptr[unLo], ptr[ltLo]);
718                ltLo++; unLo++; continue;
719             };
720             if (n >  0) break;
721             unLo++;
722          }
723          while (True) {
724             if (unLo > unHi) break;
725             n = ((Int32)block[ptr[unHi]+d]) - med;
726             if (n == 0) {
727                mswap(ptr[unHi], ptr[gtHi]);
728                gtHi--; unHi--; continue;
729             };
730             if (n <  0) break;
731             unHi--;
732          }
733          if (unLo > unHi) break;
734          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
735       }
736 
737       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
738 
739       if (gtHi < ltLo) {
740          mpush(lo, hi, d+1 );
741          continue;
742       }
743 
744       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
745       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
746 
747       n = lo + unLo - ltLo - 1;
748       m = hi - (gtHi - unHi) + 1;
749 
750       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
751       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
752       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
753 
754       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
755       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
756       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
757 
758       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
759       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
760 
761       mpush (nextLo[0], nextHi[0], nextD[0]);
762       mpush (nextLo[1], nextHi[1], nextD[1]);
763       mpush (nextLo[2], nextHi[2], nextD[2]);
764    }
765 }
766 
767 #undef mswap
768 #undef mvswap
769 #undef mpush
770 #undef mpop
771 #undef mmin
772 #undef mnextsize
773 #undef mnextswap
774 #undef MAIN_QSORT_SMALL_THRESH
775 #undef MAIN_QSORT_DEPTH_THRESH
776 #undef MAIN_QSORT_STACK_SIZE
777 
778 
779 /*---------------------------------------------*/
780 /* Pre:
781       nblock > N_OVERSHOOT
782       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
783       ((UChar*)block32) [0 .. nblock-1] holds block
784       ptr exists for [0 .. nblock-1]
785 
786    Post:
787       ((UChar*)block32) [0 .. nblock-1] holds block
788       All other areas of block32 destroyed
789       ftab [0 .. 65536 ] destroyed
790       ptr [0 .. nblock-1] holds sorted order
791       if (*budget < 0), sorting was abandoned
792 */
793 
794 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
795 #define SETMASK (1 << 21)
796 #define CLEARMASK (~(SETMASK))
797 
798 static
mainSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,UInt32 * ftab,Int32 nblock,Int32 verb,Int32 * budget)799 void mainSort ( UInt32* ptr,
800                 UChar*  block,
801                 UInt16* quadrant,
802                 UInt32* ftab,
803                 Int32   nblock,
804                 Int32   verb,
805                 Int32*  budget )
806 {
807    Int32  i, j, k, ss, sb;
808    Int32  runningOrder[256];
809    Bool   bigDone[256];
810    Int32  copyStart[256];
811    Int32  copyEnd  [256];
812    UChar  c1;
813    Int32  numQSorted;
814    UInt16 s;
815    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
816 
817    /*-- set up the 2-byte frequency table --*/
818    for (i = 65536; i >= 0; i--) ftab[i] = 0;
819 
820    j = block[0] << 8;
821    i = nblock-1;
822    for (; i >= 3; i -= 4) {
823       quadrant[i] = 0;
824       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
825       ftab[j]++;
826       quadrant[i-1] = 0;
827       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
828       ftab[j]++;
829       quadrant[i-2] = 0;
830       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
831       ftab[j]++;
832       quadrant[i-3] = 0;
833       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
834       ftab[j]++;
835    }
836    for (; i >= 0; i--) {
837       quadrant[i] = 0;
838       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
839       ftab[j]++;
840    }
841 
842    /*-- (emphasises close relationship of block & quadrant) --*/
843    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
844       block   [nblock+i] = block[i];
845       quadrant[nblock+i] = 0;
846    }
847 
848    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
849 
850    /*-- Complete the initial radix sort --*/
851    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
852 
853    s = block[0] << 8;
854    i = nblock-1;
855    for (; i >= 3; i -= 4) {
856       s = (s >> 8) | (block[i] << 8);
857       j = ftab[s] -1;
858       ftab[s] = j;
859       ptr[j] = i;
860       s = (s >> 8) | (block[i-1] << 8);
861       j = ftab[s] -1;
862       ftab[s] = j;
863       ptr[j] = i-1;
864       s = (s >> 8) | (block[i-2] << 8);
865       j = ftab[s] -1;
866       ftab[s] = j;
867       ptr[j] = i-2;
868       s = (s >> 8) | (block[i-3] << 8);
869       j = ftab[s] -1;
870       ftab[s] = j;
871       ptr[j] = i-3;
872    }
873    for (; i >= 0; i--) {
874       s = (s >> 8) | (block[i] << 8);
875       j = ftab[s] -1;
876       ftab[s] = j;
877       ptr[j] = i;
878    }
879 
880    /*--
881       Now ftab contains the first loc of every small bucket.
882       Calculate the running order, from smallest to largest
883       big bucket.
884    --*/
885    for (i = 0; i <= 255; i++) {
886       bigDone     [i] = False;
887       runningOrder[i] = i;
888    }
889 
890    {
891       Int32 vv;
892       Int32 h = 1;
893       do h = 3 * h + 1; while (h <= 256);
894       do {
895          h = h / 3;
896          for (i = h; i <= 255; i++) {
897             vv = runningOrder[i];
898             j = i;
899             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
900                runningOrder[j] = runningOrder[j-h];
901                j = j - h;
902                if (j <= (h - 1)) goto zero;
903             }
904             zero:
905             runningOrder[j] = vv;
906          }
907       } while (h != 1);
908    }
909 
910    /*--
911       The main sorting loop.
912    --*/
913 
914    numQSorted = 0;
915 
916    for (i = 0; i <= 255; i++) {
917 
918       /*--
919          Process big buckets, starting with the least full.
920          Basically this is a 3-step process in which we call
921          mainQSort3 to sort the small buckets [ss, j], but
922          also make a big effort to avoid the calls if we can.
923       --*/
924       ss = runningOrder[i];
925 
926       /*--
927          Step 1:
928          Complete the big bucket [ss] by quicksorting
929          any unsorted small buckets [ss, j], for j != ss.
930          Hopefully previous pointer-scanning phases have already
931          completed many of the small buckets [ss, j], so
932          we don't have to sort them at all.
933       --*/
934       for (j = 0; j <= 255; j++) {
935          if (j != ss) {
936             sb = (ss << 8) + j;
937             if ( ! (ftab[sb] & SETMASK) ) {
938                Int32 lo = ftab[sb]   & CLEARMASK;
939                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
940                if (hi > lo) {
941                   if (verb >= 4)
942                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
943                                 "done %d   this %d\n",
944                                 ss, j, numQSorted, hi - lo + 1 );
945                   mainQSort3 (
946                      ptr, block, quadrant, nblock,
947                      lo, hi, BZ_N_RADIX, budget
948                   );
949                   numQSorted += (hi - lo + 1);
950                   if (*budget < 0) return;
951                }
952             }
953             ftab[sb] |= SETMASK;
954          }
955       }
956 
957       AssertH ( !bigDone[ss], 1006 );
958 
959       /*--
960          Step 2:
961          Now scan this big bucket [ss] so as to synthesise the
962          sorted order for small buckets [t, ss] for all t,
963          including, magically, the bucket [ss,ss] too.
964          This will avoid doing Real Work in subsequent Step 1's.
965       --*/
966       {
967          for (j = 0; j <= 255; j++) {
968             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
969             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
970          }
971          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
972             k = ptr[j]-1; if (k < 0) k += nblock;
973             c1 = block[k];
974             if (!bigDone[c1])
975                ptr[ copyStart[c1]++ ] = k;
976          }
977          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
978             k = ptr[j]-1; if (k < 0) k += nblock;
979             c1 = block[k];
980             if (!bigDone[c1])
981                ptr[ copyEnd[c1]-- ] = k;
982          }
983       }
984 
985       AssertH ( copyStart[ss]-1 == copyEnd[ss], 1007 );
986 
987       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
988 
989       /*--
990          Step 3:
991          The [ss] big bucket is now done.  Record this fact,
992          and update the quadrant descriptors.  Remember to
993          update quadrants in the overshoot area too, if
994          necessary.  The "if (i < 255)" test merely skips
995          this updating for the last bucket processed, since
996          updating for the last bucket is pointless.
997 
998          The quadrant array provides a way to incrementally
999          cache sort orderings, as they appear, so as to
1000          make subsequent comparisons in fullGtU() complete
1001          faster.  For repetitive blocks this makes a big
1002          difference (but not big enough to be able to avoid
1003          the fallback sorting mechanism, exponential radix sort).
1004 
1005          The precise meaning is: at all times:
1006 
1007             for 0 <= i < nblock and 0 <= j <= nblock
1008 
1009             if block[i] != block[j],
1010 
1011                then the relative values of quadrant[i] and
1012                     quadrant[j] are meaningless.
1013 
1014                else {
1015                   if quadrant[i] < quadrant[j]
1016                      then the string starting at i lexicographically
1017                      precedes the string starting at j
1018 
1019                   else if quadrant[i] > quadrant[j]
1020                      then the string starting at j lexicographically
1021                      precedes the string starting at i
1022 
1023                   else
1024                      the relative ordering of the strings starting
1025                      at i and j has not yet been determined.
1026                }
1027       --*/
1028       bigDone[ss] = True;
1029 
1030       if (i < 255) {
1031          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1032          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1033          Int32 shifts   = 0;
1034 
1035          while ((bbSize >> shifts) > 65534) shifts++;
1036 
1037          for (j = bbSize-1; j >= 0; j--) {
1038             Int32 a2update     = ptr[bbStart + j];
1039             UInt16 qVal        = (UInt16)(j >> shifts);
1040             quadrant[a2update] = qVal;
1041             if (a2update < BZ_N_OVERSHOOT)
1042                quadrant[a2update + nblock] = qVal;
1043          }
1044          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1045       }
1046 
1047    }
1048 
1049    if (verb >= 4)
1050       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1051                  nblock, numQSorted, nblock - numQSorted );
1052 }
1053 
1054 #undef BIGFREQ
1055 #undef SETMASK
1056 #undef CLEARMASK
1057 
1058 
1059 /*---------------------------------------------*/
1060 /* Pre:
1061       nblock > 0
1062       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1063       ((UChar*)arr2)  [0 .. nblock-1] holds block
1064       arr1 exists for [0 .. nblock-1]
1065 
1066    Post:
1067       ((UChar*)arr2) [0 .. nblock-1] holds block
1068       All other areas of block destroyed
1069       ftab [ 0 .. 65536 ] destroyed
1070       arr1 [0 .. nblock-1] holds sorted order
1071 */
BZ2_blockSort(EState * s)1072 void BZ2_blockSort ( EState* s )
1073 {
1074    UInt32* ptr    = s->ptr;
1075    UChar*  block  = s->block;
1076    UInt32* ftab   = s->ftab;
1077    Int32   nblock = s->nblock;
1078    Int32   verb   = s->verbosity;
1079    Int32   wfact  = s->workFactor;
1080    UInt16* quadrant;
1081    Int32   budget;
1082    Int32   budgetInit;
1083    Int32   i;
1084 
1085    if (nblock < 10000) {
1086       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1087    } else {
1088       /* Calculate the location for quadrant, remembering to get
1089          the alignment right.  Assumes that &(block[0]) is at least
1090          2-byte aligned -- this should be ok since block is really
1091          the first section of arr2.
1092       */
1093       i = nblock+BZ_N_OVERSHOOT;
1094       if (i & 1) i++;
1095       quadrant = (UInt16*)(&(block[i]));
1096 
1097       /* (wfact-1) / 3 puts the default-factor-30
1098          transition point at very roughly the same place as
1099          with v0.1 and v0.9.0.
1100          Not that it particularly matters any more, since the
1101          resulting compressed stream is now the same regardless
1102          of whether or not we use the main sort or fallback sort.
1103       */
1104       if (wfact < 1  ) wfact = 1;
1105       if (wfact > 100) wfact = 100;
1106       budgetInit = nblock * ((wfact-1) / 3);
1107       budget = budgetInit;
1108 
1109       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1110       if (verb >= 3)
1111          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1112                     budgetInit - budget,
1113                     nblock,
1114                     (float)(budgetInit - budget) /
1115                     (float)(nblock==0 ? 1 : nblock) );
1116       if (budget < 0) {
1117          if (verb >= 2)
1118             VPrintf0 ( "    too repetitive; using fallback"
1119                        " sorting algorithm\n" );
1120          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1121       }
1122    }
1123 
1124    s->origPtr = -1;
1125    for (i = 0; i < s->nblock; i++)
1126       if (ptr[i] == 0)
1127          { s->origPtr = i; break; };
1128 
1129    AssertH( s->origPtr != -1, 1003 );
1130 }
1131 
1132 
1133 /*-------------------------------------------------------------*/
1134 /*--- end                                       blocksort.c ---*/
1135 /*-------------------------------------------------------------*/
1136