TBlock.c - This C code implements a block-based untransform…

/tags/jet3d_dev_msvc2003/source/Engine/JetEngine/Bitmap/Compression/TBlock.c

# · C · 809 lines · 523 code · 130 blank · 156 comment · 47 complexity · 138067608100b6880f8f759cd668a340 MD5 · raw file

/****************************************************************************************/
/*  TBLOCK.C                                                                            */
/*                                                                                      */
/*  Author:                                                                             */
/*  Description:                                                                        */
/*                                                                                      */
/*  The contents of this file are subject to the Jet3D Public License                   */
/*  Version 1.02 (the "License"); you may not use this file except in                   */
/*  compliance with the License. You may obtain a copy of the License at                */
/*  http://www.jet3d.com                                                                */
/*                                                                                      */
/*  Software distributed under the License is distributed on an "AS IS"                 */
/*  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See                */
/*  the License for the specific language governing rights and limitations              */
/*  under the License.                                                                  */
/*                                                                                      */
/*  The Original Code is Jet3D, released December 12, 1999.                             */
/*  Copyright (C) 1996-1999 Eclipse Entertainment, L.L.C. All Rights Reserved           */
/*                                                                                      */
/****************************************************************************************/
/*{**** BOF ****/

#include "TBlock.h"
#include "codeimage.h"
#include "Tsc.h"
#include "Log.h"

#include "Timer.h"

TIMER_VARS(TBlock_All);
TIMER_VARS(TBlock_Ram);
TIMER_VARS(TBlock_Transpose);
TIMER_VARS(TBlock_H);
TIMER_VARS(TBlock_HB);
TIMER_VARS(TBlock_V);
TIMER_VARS(TBlock_VB);
TIMER_VARS(TBlock_H_SpinUpDown);
TIMER_VARS(TBlock_H_Waver);
TIMER_VARS(TBlock_H_Block);
TIMER_VARS(TBlock_V_UnBlock);
TIMER_VARS(TBlock_V_Waver);

//#define cachetouch_w(x,y)
//#define cachetouch_r(x,y)

/**
with cachetouch disabled:

TBlock_All           : 0.039409 : 48.0 %
TBlock_Ram           : 0.000285 : 0.3 %
TBlock_Transpose     : 0.000000 : 0.0 %
TBlock_H             : 0.012152 : 14.8 %	// these four numbers vary pretty wildly:
TBlock_HB            : 0.009154 : 11.2 %
TBlock_V             : 0.013977 : 17.0 %
TBlock_VB            : 0.003804 : 4.6 %

with cachetouch:

TBlock_All           : 0.043376 : 48.9 %
TBlock_Ram           : 0.000291 : 0.3 %
TBlock_Transpose     : 0.000000 : 0.0 %
TBlock_H             : 0.012161 : 13.7 %
TBlock_HB            : 0.010536 : 11.9 %
TBlock_V             : 0.016425 : 18.5 %
TBlock_VB            : 0.003923 : 4.4 %

**/

typedef struct {
	jeWaveletFunc waver;
	int * blocks;
	int * trows[9];
	int stride8;
	int ** rows;
} tblockInfo;

#define DO8(x)	do { x; x; x; x; x; x; x; x; } while(0)

/*}{**** row <-> block copiers ****/

void __inline rowtoblock(int * bptr,const int *row,int w8)
{
int x8;

	for(x8=w8;x8--;)
	{
		DO8(*bptr++ = *row++);
		bptr += 56;
	}
}

void __inline blocktorow(int * row,const int *bptr,int x)
{
int x8;

	x8 = x>>3;
	while(x8--)
	{
		DO8(*row++ = *bptr++);
		bptr += 56;
	}
	x = x&7;
	while(x--)
	{
		*row++ = *bptr++;
	}
}

void __inline blockvtorow(int * row,const int *bptr,int y,int stride8)
{
int y8;

	y8 = y>>3;
	while(y8--)
	{
		DO8(*row++ = *bptr; bptr += 8);
		bptr += stride8 - 64;
	}
	y = y&7;
	while(y--)
	{
		*row++ = *bptr; bptr += 8;
	}
}

void __inline rowtoblockv(int *bptr,const int *row,int y,int stride8)
{
int y8;

	y8 = y>>3;
	while(y8--)
	{
		DO8(*bptr = *row++; bptr += 8);
		bptr += stride8 - 64;
	}
	y = y&7;
	while(y--)
	{
		*bptr = *row++; bptr += 8;
	}
}

/***
void rowtoblock8(int * bptr,const int **inrows,int w8)
{
int x8;
int * rows[8];

	memcpy(rows,inrows,32);

	for(x8=w8;x8--;)
	{
		DO8(*bptr++ = *rows[0]++);
		DO8(*bptr++ = *rows[1]++);
		DO8(*bptr++ = *rows[2]++);
		DO8(*bptr++ = *rows[3]++);
		DO8(*bptr++ = *rows[4]++);
		DO8(*bptr++ = *rows[5]++);
		DO8(*bptr++ = *rows[6]++);
		DO8(*bptr++ = *rows[7]++);
	}
}
***/

void rowtoblock8(int * inbptr,const int **inrows,int w8)
{
uint32 rows[8],bptr;

	//__asm { int 3 };

	bptr = (uint32)inbptr;
	memcpy(rows,inrows,32);

	while(w8--)
	{
		copy32_8((char *)bptr,(char **)rows);
		rows[0] += 32; rows[1] += 32; rows[2] += 32; rows[3] += 32;
		rows[4] += 32; rows[5] += 32; rows[6] += 32; rows[7] += 32;
		bptr += 256;
	}
}

void blockvtorow8(int ** rows,const int *bptr,int h,int stride8)
{
int y8;
int y;
int *row0,*row1,*row2,*row3,*row4,*row5,*row6,*row7;

	row0 = rows[0]; cachetouch_w(row0,h>>3);
	row1 = rows[1]; cachetouch_w(row1,h>>3);
	row2 = rows[2]; cachetouch_w(row2,h>>3);
	row3 = rows[3]; cachetouch_w(row3,h>>3);
	row4 = rows[4]; cachetouch_w(row4,h>>3);
	row5 = rows[5]; cachetouch_w(row5,h>>3);
	row6 = rows[6]; cachetouch_w(row6,h>>3);
	row7 = rows[7]; cachetouch_w(row7,h>>3);
	y8 = h>>3;
	while(y8--)
	{
		DO8(*row0++ = bptr[0]; *row1++ = bptr[1]; *row2++ = bptr[2]; *row3++ = bptr[3];	\
			*row4++ = bptr[4]; *row5++ = bptr[5]; *row6++ = bptr[6]; *row7++ = bptr[7];	\
			bptr += 8; );
		bptr += stride8 - 64;
	}
	y = h&7;
	while(y--)
	{
		*row0++ = *bptr++;
		*row1++ = *bptr++;
		*row2++ = *bptr++;
		*row3++ = *bptr++;
		*row4++ = *bptr++;
		*row5++ = *bptr++;
		*row6++ = *bptr++;
		*row7++ = *bptr++;
	}
}

void rowtoblockv8(int *bptr,const int ** rows,int h,int stride8)
{
int y8;
int y;
const int *row0,*row1,*row2,*row3,*row4,*row5,*row6,*row7;

	row0 = rows[0];	row1 = rows[1];
	row2 = rows[2];	row3 = rows[3];
	row4 = rows[4];	row5 = rows[5];
	row6 = rows[6];	row7 = rows[7];
	y8 = h>>3;
	while(y8--)
	{
		cachetouch_w(bptr,1);
		DO8(bptr[0] = *row0++; bptr[1] = *row1++; bptr[2] = *row2++; bptr[3] = *row3++; \
			bptr[4] = *row4++; bptr[5] = *row5++; bptr[6] = *row6++; bptr[7] = *row7++; \
			bptr += 8; );
		bptr += stride8 - 64;
	}
	y = h&7;
	while(y--)
	{
		bptr[0] = *row0++;
		bptr[1] = *row1++;
		bptr[2] = *row2++;
		bptr[3] = *row3++;
		bptr[4] = *row4++;
		bptr[5] = *row5++;
		bptr[6] = *row6++;
		bptr[7] = *row7++;
		bptr += 8;
	}
}

/*}{**** transformers ; row <-> block **************/

void untH(int starty,int endy,int w,tblockInfo * tbi)
{
int *workrow;
int y8,yi,y,w8;
int * bptr;
int stride8,**rows,*blocks;
jeWaveletFunc waver;

	TIMER_P(TBlock_H);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	rows	= tbi->rows;

	// (row+row) -> (brow)

	// <> we should make a roll-8 version, but then we need
	//	a spin-up and a spin-down loop

	w8 = (w+7)>>3;
	y8 = (starty>>3);
	yi = (starty&7);
	bptr = blocks + stride8*y8 + 8*yi;
	for(y=starty;y<endy;y++)
	{
		workrow = rows[y-1];
		waver(workrow,rows[y],w);		// workrow <- rows[y]
		rowtoblock(bptr,workrow,w8);	// block   <- workrow
	 
		bptr += 8;	// point to next line in blocks !
		yi++;
		if ( yi == 8 )
		{
			yi = 0; y8 ++;
			bptr = blocks + stride8*y8;
		}
		assert(y8 == ((y+1)>>3));
	}
	
	TIMER_Q(TBlock_H);
}

void untH2(int starty,int endy,int w,tblockInfo * tbi)
{
int y8,nexty,y,w8,i;
int * bptr;
int stride8,**rows,*blocks;
jeWaveletFunc waver;

	TIMER_P(TBlock_H);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	rows	= tbi->rows;

	// (row+row) -> (brow)

	// a roll-8 version
	//	with a spin-up and a spin-down loop

	y  = starty;
	w8 = (w+7)>>3;
	bptr = blocks + stride8*(starty>>3) + 8*(starty&7);

	nexty = ((starty+7)&(~7));
	if ( nexty > endy ) nexty = endy;

	TIMER_P(TBlock_H_SpinUpDown);

	for(;y<nexty;y++)
	{
		waver(rows[y-1],rows[y],w);		// workrow <- rows[y]
		rowtoblock(bptr,rows[y-1],w8);	// block   <- workrow
		bptr += 8;	// point to next line in blocks !
	}
	
	TIMER_Q(TBlock_H_SpinUpDown);

	bptr = blocks + stride8*(y>>3);
	y8 = (endy - y)>>3;

	nexty = y + (y8<<3);

/**

	----
	separated : 
	TBlock_H_Waver       : 0.005721 : 6.7 %
	TBlock_H_Block       : 0.008363 : 9.9 %
	----
		
	TIMER_P(TBlock_H_Waver);
	for(i=y;i<nexty;i++)
	{
		// this is cache optimal ; read a row, then write it
		waver(rows[i-1],rows[i],w);		// workrow <- rows[y]
	}
	TIMER_Q(TBlock_H_Waver);

	TIMER_P(TBlock_H_Block);
	while(y8--)
	{
		rowtoblock8(bptr,rows + y-1,w8);	// blocks <- rows
		y += 8;
		bptr += stride8;
	}
	TIMER_Q(TBlock_H_Block);

	----
	merged:
	TBlock_H_Waver       : 0.005636 : 6.8 %
	TBlock_H_Block       : 0.006693 : 8.1 %
	----

	copy32_8 assembly:

	TBlock_H_Waver       : 0.005728 : 6.4 %
	TBlock_H_Block       : 0.006458 : 7.2 %

**/

	while(y8--)
	{
		TIMER_P(TBlock_H_Waver);
		cachetouch_w(rows[y-1],w8); // this row may not be in cache yet
		//cachetouch_r(rows[y],w8);
		for(i=0;i<8;i++)
		{
			// this is cache optimal ; read a row, then write it
			waver(rows[y+i-1],rows[y+i],w);		// workrow <- rows[y]
		}
		TIMER_Q(TBlock_H_Waver);
		TIMER_P(TBlock_H_Block);
		// all rows should be in cache now
		rowtoblock8(bptr,(const int **)(rows + y-1),w8);	// blocks <- rows
		TIMER_Q(TBlock_H_Block);
		y += 8;
		bptr += stride8;
	}

	TIMER_P(TBlock_H_SpinUpDown);

	for(;y<endy;y++)
	{
		waver(rows[y-1],rows[y],w);		// workrow <- rows[y]
		rowtoblock(bptr,rows[y-1],w8);	// block   <- workrow
		bptr += 8;	// point to next line in blocks !
	}

	TIMER_Q(TBlock_H_SpinUpDown);

	TIMER_Q(TBlock_H);
}

void untHb(int starty,int endy,int w,tblockInfo * tbi)
{
int *workrow,*row;
int y8,yi,y,w8;
int * bptr;
int stride8,**rows,*blocks;
jeWaveletFunc waver;

	TIMER_P(TBlock_HB);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	rows	= tbi->rows;

	// (brow+row) -> (block row)

	// <> we should make a roll-8 version, 

	w8 = (w+7)>>3;
	y8 = (starty>>3);
	yi = (starty&7);
	bptr = blocks + stride8*y8 + 8*yi;
	for(y=starty;y<endy;y++)
	{
		row = rows[y];
		workrow = rows[y-1];
		cachetouch_w(row,w8);
		blocktorow(row,bptr,w>>1);	// get the LL out of blocks
									// the LH part is already in row[]

		waver(workrow,row,w);		// workrow <- row		; write to the row we just read from
		rowtoblock(bptr,workrow,w8);// block   <- workrow	; back in the blocks

		yi++;
		bptr += 8;	// point to next line in blocks !
		if ( yi == 8 )
		{
			yi = 0; y8 ++;
			bptr = blocks + stride8*y8;
		}
		assert(y8 == ((y+1)>>3));
	}
	
	TIMER_Q(TBlock_HB);
}

void untV2(int w,int h,tblockInfo * tbi)
{
int x8,xi,y;
int * bptr;
int stride8,**rows,*blocks;
jeWaveletFunc waver;

	//  this is just bad:
	// TBlock_V_UnBlock     : 0.009621 : 11.4 %
	// TBlock_V_Waver       : 0.007168 : 8.5 %

	TIMER_P(TBlock_V);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	rows	= tbi->rows;

	// this is only done once, at the very end
	//	 at this point all our data is in the blocks,
	//	 so we can trash anything in the rows

	// (bcolumn) -> (row)

	x8 = w>>3;
	xi = w&7;
	bptr = blocks;
	y = -1;
	
	TIMER_P(TBlock_V_UnBlock);
	while(x8--)
	{
		blockvtorow8(rows+y,bptr,h,stride8);
		bptr += 64;		// step past 8 columns in blocks !
		y += 8;
	}
	TIMER_Q(TBlock_V_UnBlock);
	
	while(xi--)
	{
		cachetouch_w(rows[y],h>>3);
		blockvtorow(rows[y],bptr,h,stride8);
		bptr ++;		// point to next column in blocks !
		y++;
	}

	TIMER_P(TBlock_V_Waver);
	cachetouch_w(rows[w-1],h>>3);
	for(y = w - 1;(y>=0);y--)
	{
		// this is cache-optimal : we read from row (y) then write to row (y)
		waver(rows[y],rows[y-1],h);
	}
	TIMER_Q(TBlock_V_Waver);

	TIMER_Q(TBlock_V);
}

void untV3(int w,int h,tblockInfo * tbi)
{
int x8,xi,y,i;
int * bptr;
int stride8,**rows,**trows,*blocks;
jeWaveletFunc waver;

	// The Waver is slow cuz we're writing to memory not in cache at all
	//  on a K7 or P3, we the cachetouch_w fixes everything
	// TBlock_V_UnBlock     : 0.006368 : 7.4 %
	// TBlock_V_Waver       : 0.007712 : 9.0 %

	TIMER_P(TBlock_V);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	rows	= tbi->rows;
	trows	= tbi->trows;

	// this is only done once, at the very end
	//	 at this point all our data is in the blocks,
	//	 so we can trash anything in the rows

	// (bcolumn) -> (row)

	x8 = w>>3;
	xi = w&7;
	bptr = blocks;
	y = 0;
	
	while(x8--)
	{
	TIMER_P(TBlock_V_UnBlock);
		blockvtorow8(trows,bptr,h,stride8);
	TIMER_Q(TBlock_V_UnBlock);
	TIMER_P(TBlock_V_Waver);
		for(i=0;i<8;i++)
		{
			cachetouch_w(rows[y+i],h>>3);
			waver(rows[y+i],trows[i],h);
		}
	TIMER_Q(TBlock_V_Waver);
		bptr += 64;		// step past 8 columns in blocks !
		y += 8;
	}
	
	cachetouch_w(trows[0],h>>3);
	while(xi--)
	{
		blockvtorow(trows[0],bptr,h,stride8);
		waver(rows[y],trows[0],h);
		bptr ++;		// point to next column in blocks !
		y++;
	}

	TIMER_Q(TBlock_V);
}

void untV4(int w,int h,tblockInfo * tbi)
{
int x8,xi,y,i;
int * bptr;
int stride8,**rows,*workrow,*blocks;
int *zrows[8];
jeWaveletFunc waver;

	// well, we sped up the Waver, but the UnBlock still hurts
	// TBlock_V_UnBlock     : 0.009015 : 11.1 %
	// TBlock_V_Waver       : 0.004652 : 5.7 %

	TIMER_P(TBlock_V);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	rows	= tbi->rows;
	workrow = tbi->rows[-1];

	// this is only done once, at the very end
	//	 at this point all our data is in the blocks,
	//	 so we can trash anything in the rows

	// (bcolumn) -> (row)

	x8 = w>>3;
	xi = w&7;
	bptr = blocks;
	y = 0;
	
	zrows[0] = workrow;
	cachetouch_w(workrow,h>>3);
	while(x8--)
	{
		for(i=1;i<8;i++)
			zrows[i] = rows[y+i-1];
	TIMER_P(TBlock_V_UnBlock);
		blockvtorow8(zrows,bptr,h,stride8);
	TIMER_Q(TBlock_V_UnBlock);
	TIMER_P(TBlock_V_Waver);
		for(i=7;i>=0;i--)
		{
			cachetouch_w(rows[y+i],h>>3);
			// write to i, read from (i-1), step backwards; this is cache-optimal
			waver(rows[y+i],zrows[i],h);
		}
	TIMER_Q(TBlock_V_Waver);
		bptr += 64;		// step past 8 columns in blocks !
		y += 8;
	}
	
	cachetouch_w(workrow,h>>3);
	while(xi--)
	{
		blockvtorow(workrow,bptr,h,stride8);
		waver(rows[y],workrow,h);
		bptr ++;		// point to next column in blocks !
		y++;
	}

	TIMER_Q(TBlock_V);
}

void untVb3(int w,int h,tblockInfo * tbi)
{
int x8,xi,y;
int * bptr;
int stride8,**trows,*blocks;
jeWaveletFunc waver;

	TIMER_P(TBlock_VB);

	waver	= tbi->waver;
	stride8 = tbi->stride8;
	blocks	= tbi->blocks;
	trows	= tbi->trows;
	// (bcolumn) -> (bcolumn)

	// read the whole set of bcolumns out to rows,
	// then wavelet all the rows
	// then read 'em back to bcolumns

	x8 = w>>3;
	bptr = blocks;
	while(x8--)
	{
		// read 8 columns out to rows
		blockvtorow8(trows+1,bptr,h,stride8);
		// wave 'em, shifting down one
		cachetouch_w(trows[0],h>>3);
		for(y=1;y<9;y++)
			waver(trows[y-1],trows[y],h);

		// now put 'em back in blocks :
		rowtoblockv8(bptr,(const int **)trows,h,stride8);
		bptr += 64;		// step past 8 columns in blocks !
	}

	// spin down:

	xi = w&7;
	cachetouch_w(trows[1],h>>3);
	for(y=0;y<xi;y++)
	{
		blockvtorow(trows[0],bptr,h,stride8);
		waver(trows[1],trows[0],h);
		rowtoblockv(bptr,trows[1],h,stride8);
		bptr ++;		// point to next column in blocks !
	}
	
	TIMER_Q(TBlock_VB);
}

/*}{*** IT ********/

void untransformBlocked(image *im,int levels,jeWaveletFunc waver,jeBoolean doLHs)
{
int p,l;
tblockInfo tbi;
int * blocks;
int width8,height8,stride8,w,h;
int ** rows;
int ** trows;
int imw,imh,ims;

	Log_Printf("Doing untransformBlocked\n");

	TIMER_P(TBlock_All);

	imw = im->width;
	imh = im->height;
	ims = im->stride;
	width8  = (imw + 7)>>3;
	height8 = (imh + 7)>>3;

	stride8 = (((ims + 7)>>3)<<6) + 3;

	w = (ims + 7)>>3;
	h = w<<3;

	TIMER_P(TBlock_Ram);

	blocks = (int *)jeRam_Allocate(sizeof(int)*(stride8*w + 9*h));
	assert(blocks);

	TIMER_Q(TBlock_Ram);

	trows = tbi.trows;
	trows[0] = blocks + stride8*w;
	for(l=1;l<9;l++)
		trows[l] = trows[l-1] + h;

	tbi.blocks  = blocks;
	tbi.stride8 = stride8;
	tbi.waver   = waver;

pushTSC();

	for(p=0;p<(im->planes);p++) 
	{
		rows = im->data[p];
		tbi.rows = rows;

		for (l = levels-1; l >= 0; l--) 
		{
			w = imw >> l;
			h = imh >> l;

			/* untransform into blocks */

			//<> seems a shame not to use the blocks to transpose
			if ( doLHs )
			{
				TIMER_P(TBlock_Transpose);
				transposeHL(im,p,l);
				TIMER_Q(TBlock_Transpose);
			}

			if ( l == (levels - 1) )
			{
				untH2(0,h,w,&tbi);
			}
			else
			{
				untHb(0,h>>1,w,&tbi);
				untH2(h>>1,h,w,&tbi);
			}	

			/* Columns */

			if ( l == 0 )
			{
				untV4(w,h,&tbi);
			}
			else
			{
				untVb3(w,h,&tbi);
			}

			assert(jeRam_IsValidPtr(blocks));
		}
	}

showPopTSC("untrans blocked");

	TIMER_P(TBlock_Ram);

	jeRam_Free(blocks);

	TIMER_Q(TBlock_Ram);

	// we did a transpose !
	swapints(im->width,im->height);

	TIMER_Q(TBlock_All);
}

void TBlock_DoReport(void)
{
	TIMER_REPORT(TBlock_All);
	TIMER_REPORT(TBlock_Ram);
	TIMER_REPORT(TBlock_Transpose);
	TIMER_REPORT(TBlock_H);
	TIMER_REPORT(TBlock_HB);
	TIMER_REPORT(TBlock_V);
	TIMER_REPORT(TBlock_VB);
	TIMER_REPORT(TBlock_V_UnBlock);
	TIMER_REPORT(TBlock_V_Waver);
	TIMER_REPORT(TBlock_H_SpinUpDown);
	TIMER_REPORT(TBlock_H_Waver);
	TIMER_REPORT(TBlock_H_Block);
}

/*}*** EOF ********/