YUV.cpp - This C++ code is part of a video processing libra…

/trunk/source/Engine/JetEngine/Bitmap/Compression/YUV.cpp

# · C++ · 703 lines · 441 code · 186 blank · 76 comment · 10 complexity · 1f8521119bb673056cf24518967ce036 MD5 · raw file

/****************************************************************************************/

/*  Yuv                                                                                 */

/*                                                                                      */

/*  Author: Charles Bloom                                                               */

/*  Description:  YUV <-> RGB code                                                      */

/*                                                                                      */

/*  The contents of this file are subject to the Jet3D Public License                   */

/*  Version 1.02 (the "License"); you may not use this file except in                   */

/*  compliance with the License. You may obtain a copy of the License at                */

/*  http://www.jet3d.com                                                                */

/*                                                                                      */

/*  Software distributed under the License is distributed on an "AS IS"                 */

/*  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See                */

/*  the License for the specific language governing rights and limitations              */

/*  under the License.                                                                  */

/*                                                                                      */

/*  The Original Code is Jet3D, released December 12, 1999.                             */

/*  Copyright (C) 1996-1999 Eclipse Entertainment, L.L.C. All Rights Reserved           */

/*                                                                                      */

/****************************************************************************************/

#include "stdafx.h"

#include <assert.h>



#include "YUV.h"

#include "Utility.h"

#include "Cpu.h"



#pragma warning(disable : 4244)	// int -> uint8 conversions abound



#pragma warning(disable : 4799)	// I know we've got no emms; it's done in wavelet.c



/*}{******* RGB <-> YUV in C ***********/



// Converts one packed 8-bit RGB pixel (RGB[0..2]) into one packed 8-bit YUV
// pixel (YUV[0..2]).  U and V are stored with a +127 bias.  All three source
// bytes are read before the first destination byte is written.
void RGBb_to_YUVb(const uint8 *RGB,uint8 *YUV)
{
	int red   = RGB[0];
	int green = RGB[1];
	int blue  = RGB[2];

	YUV[0] = Y_RGB(red,green,blue);
	YUV[1] = U_RGB(red,green,blue) + 127;
	YUV[2] = V_RGB(red,green,blue) + 127;
}



// Converts one packed 8-bit YUV pixel (YUV[0..2]) back to packed 8-bit RGB
// (RGB[0..2]).  The +127 bias on U and V is removed before the conversion and
// each resulting channel is clamped to [0,255].
void YUVb_to_RGBb(const uint8 *YUV,uint8 *RGB)
{
	int luma = YUV[0];
	int cu   = YUV[1] - 127;
	int cv   = YUV[2] - 127;

	int red   = R_YUV(luma,cu,cv);
	int green = G_YUV(luma,cu,cv);
	int blue  = B_YUV(luma,cu,cv);

	// The clamp is required: not every 24-bit YUV value is produced by
	// RGBb_to_YUVb, so the inverse can land outside the byte range.
	RGB[0] = minmax(red,0,255);
	RGB[1] = minmax(green,0,255);
	RGB[2] = minmax(blue,0,255);
}





// Converts a run of `len` packed RGB pixels (3 bytes each) to packed YUV
// pixels (3 bytes each).  U and V are stored with a +127 bias.  For every
// pixel the three source bytes are read before the destination is written.
void RGBb_to_YUVb_line(const uint8 *RGB,uint8 *YUV,int len)
{
	for( ; len != 0; --len, RGB += 3, YUV += 3 )
	{
		int red   = RGB[0];
		int green = RGB[1];
		int blue  = RGB[2];

		YUV[0] = Y_RGB(red,green,blue);
		YUV[1] = U_RGB(red,green,blue) + 127;
		YUV[2] = V_RGB(red,green,blue) + 127;
	}
}



// Converts a run of `len` packed YUV pixels (3 bytes each) to packed RGB
// pixels (3 bytes each).  The +127 chroma bias is removed first and every
// output channel is clamped to [0,255].
void YUVb_to_RGBb_line(const uint8 *YUV,uint8 *RGB,int len)
{
	for( ; len != 0; --len, YUV += 3, RGB += 3 )
	{
		int luma = YUV[0];
		int cu   = YUV[1] - 127;
		int cv   = YUV[2] - 127;

		int red   = R_YUV(luma,cu,cv);
		int green = G_YUV(luma,cu,cv);
		int blue  = B_YUV(luma,cu,cv);

		// Clamp: parts of YUV space are never produced by RGBb_to_YUVb, so
		// the inverse may fall outside the byte range.
		RGB[0] = minmax(red,0,255);
		RGB[1] = minmax(green,0,255);
		RGB[2] = minmax(blue,0,255);
	}
}





// Converts one packed 8-bit RGB pixel to integer Y, U and V values written
// through the three output pointers.  U and V carry a +127 bias.  Debug builds
// assert that every result lies in [0,255].
void RGBb_to_YUVi(const uint8 *RGB,int *Y,int *U,int *V)
{
	int red   = RGB[0];
	int green = RGB[1];
	int blue  = RGB[2];

	*Y = Y_RGB(red,green,blue);
	*U = U_RGB(red,green,blue) + 127;
	*V = V_RGB(red,green,blue) + 127;

	assert( isinrange(*Y,0,255) );
	assert( isinrange(*U,0,255) );
	assert( isinrange(*V,0,255) );
}



// Converts integer Y/U/V (U and V carrying the +127 bias) to one packed 8-bit
// RGB pixel.  The inputs are deliberately NOT asserted to be in [0,255]: the
// wavelet stage can push them outside that range.  The conversion is a plain
// matrix multiply; the result is clamped to [0,255] per channel.
void YUVi_to_RGBb(int y,int u,int v,uint8 *RGB)
{
	int cu = u - 127;
	int cv = v - 127;

	int red   = R_YUV(y,cu,cv);
	int green = G_YUV(y,cu,cv);
	int blue  = B_YUV(y,cu,cv);

	// Clamp: out-of-gamut YUV input yields channels outside the byte range.
	RGB[0] = minmax(red,0,255);
	RGB[1] = minmax(green,0,255);
	RGB[2] = minmax(blue,0,255);
}



// Converts integer R, G, B to integer Y, U, V written through the output
// pointers; U and V carry a +127 bias.  Debug builds assert that both the
// inputs and the results lie in [0,255].
void RGBi_to_YUVi(int R,int G,int B,int *Y,int *U,int *V)
{
	// preconditions
	assert( isinrange(R,0,255) );
	assert( isinrange(G,0,255) );
	assert( isinrange(B,0,255) );

	*Y = Y_RGB(R,G,B);
	*U = U_RGB(R,G,B) + 127;
	*V = V_RGB(R,G,B) + 127;

	// postconditions
	assert( isinrange(*Y,0,255) );
	assert( isinrange(*U,0,255) );
	assert( isinrange(*V,0,255) );
}



// Converts integer Y/U/V (U and V carrying the +127 bias) to integer R, G, B
// written through the output pointers, each clamped to [0,255].  The inputs
// are deliberately NOT asserted to be in [0,255]: the wavelet stage can push
// them outside that range.
void YUVi_to_RGBi(int y,int u,int v,int *R,int *G,int *B)
{
	int cu = u - 127;
	int cv = v - 127;

	int red   = R_YUV(y,cu,cv);	// plain matrix multiply
	int green = G_YUV(y,cu,cv);
	int blue  = B_YUV(y,cu,cv);

	// Clamp: out-of-gamut YUV input yields channels outside the byte range.
	*R = minmax(red,0,255);
	*G = minmax(green,0,255);
	*B = minmax(blue,0,255);
}



// In-place conversion of three integer planes: on entry line1/line2/line3 hold
// Y, U and V (U and V carrying the +127 bias); on exit they hold R, G and B,
// each clamped to [0,255].  `len` is the number of samples per plane.
void YUVi_to_RGBi_line(int *line1,int *line2,int *line3,int len)
{
	// <> use MMX

	cachetouch_w(line1,len>>3);
	cachetouch_w(line2,len>>3);
	cachetouch_w(line3,len>>3);

	for( ; len != 0; --len, ++line1, ++line2, ++line3 )
	{
		// read all three planes before overwriting any of them
		int luma = *line1;
		int cu   = *line2 - 127;
		int cv   = *line3 - 127;

		int red   = R_YUV(luma,cu,cv);
		int green = G_YUV(luma,cu,cv);
		int blue  = B_YUV(luma,cu,cv);

		red   = minmax(red,0,255);
		green = minmax(green,0,255);
		blue  = minmax(blue,0,255);

		*line1 = red;
		*line2 = green;
		*line3 = blue;
	}
}



// Portable C blitter: converts `ilen` samples from three integer planes
// (Y, U, V; U and V carrying the +127 bias) into packed 24-bit pixels stored
// in B, G, R byte order at `ibline`.  Each channel is clamped to [0,255].
void YUVi_to_BGRb_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
{
	cachetouch_r(iline1,ilen>>3);
	cachetouch_r(iline2,ilen>>3);
	cachetouch_r(iline3,ilen>>3);
	cachetouch_w(ibline,(ilen*3)>>5);

	for( ; ilen != 0; --ilen, ibline += 3 )
	{
		int luma = *iline1++;
		int cu   = (*iline2++) - 127;
		int cv   = (*iline3++) - 127;

		int red   = R_YUV(luma,cu,cv);
		int green = G_YUV(luma,cu,cv);
		int blue  = B_YUV(luma,cu,cv);

		red   = minmax(red,0,255);
		green = minmax(green,0,255);
		blue  = minmax(blue,0,255);

		// destination byte order is blue, green, red
		ibline[0] = blue;
		ibline[1] = green;
		ibline[2] = red;
	}
}



void YUVi_to_BGRb_lines_c(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)

{

int yz;

	for(yz=0;yz<h;yz++)

	{	

	int y,u,v,r,g,b,len;

	int *line1,*line2,*line3;

	uint8 * bline;



		line1 = Ylines[yz];

		line2 = Ulines[yz];

		line3 = Vlines[yz];

		bline = BGRptr;

		len = w;



		cachetouch_r(line1,len>>3);

		cachetouch_r(line2,len>>3);

		cachetouch_r(line3,len>>3);

		cachetouch_w(bline,(len*3)>>5);

		

		while(len--)

		{

			y = (*line1++);

			u = (*line2++) - 127;

			v = (*line3++) - 127;



			r = R_YUV(y,u,v);

			g = G_YUV(y,u,v);

			b = B_YUV(y,u,v);



			r = minmax(r,0,255);

			g = minmax(g,0,255);

			b = minmax(b,0,255);



			bline[0] = b;

			bline[1] = g;

			bline[2] = r;

			bline+=3;

		}



		BGRptr += BGRstride;

	}

}



// Portable C blitter: converts `ilen` samples from three integer planes
// (Y, U, V; U and V carrying the +127 bias) into 4-byte pixels at `ibline`.
// Bytes 0,1,2 of each pixel receive blue, green, red (clamped to [0,255]);
// byte 3 is left untouched by this routine.
// NOTE(review): YUVi_to_XRGB_line_mmx stores all four bytes of each pixel, so
// the fourth byte differs between the two code paths - confirm no caller
// depends on it.
void YUVi_to_XRGB_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
{
	cachetouch_r(iline1,ilen>>3);
	cachetouch_r(iline2,ilen>>3);
	cachetouch_r(iline3,ilen>>3);
	cachetouch_w(ibline,ilen>>3);

	for( ; ilen != 0; --ilen, ibline += 4 )
	{
		int luma = *iline1++;
		int cu   = (*iline2++) - 127;
		int cv   = (*iline3++) - 127;

		int red   = R_YUV(luma,cu,cv);
		int green = G_YUV(luma,cu,cv);
		int blue  = B_YUV(luma,cu,cv);

		red   = minmax(red,0,255);
		green = minmax(green,0,255);
		blue  = minmax(blue,0,255);

		ibline[0] = blue;
		ibline[1] = green;
		ibline[2] = red;
	}
}



/*}{******* MMX YUV -> BGR blitters ***********/



// Packed signed 16-bit multipliers used by the MMX blitters below: they are
// loaded into mm3 (V) and mm4 (U) and applied with pmulhw to the replicated,
// pre-scaled chroma sample.  The lowest word corresponds to the first byte
// stored per pixel.
//   Const_V_16 lanes, low to high : 0x0000, 0x2DB5, 0xA646, 0x0000
//   Const_U_16 lanes, low to high : 0x7168, 0xE9FA, 0x0000, 0x0000
// The top lane is zero in both, so the fourth byte produced per pixel gets
// only the Y contribution.
static const __int64 Const_V_16 = 0x0000A6462DB50000;

static const __int64 Const_U_16 = 0x00000000E9FA7168;



// MMX blitter for a whole image: for each of the `h` rows, converts `w`
// samples from Ylines[row] / Ulines[row] / Vlines[row] into packed 24-bit
// pixels.  Row 0 is written at BGRptr and each following row starts BGRstride
// bytes after the previous one.
//
// The main loop stores 4 bytes per pixel and advances the destination by 3,
// so it is only used for the first w-1 pixels; the last pixel of every row is
// written as exactly 3 bytes so nothing is stored past the end of the row.
//
// Robustness: w < 1 or h < 1 is a no-op, and w == 1 skips the main loop and
// goes straight to the last-pixel code.  (Previously w <= 1 made the loop
// counter start at <= 0, so `dec ecx / jnz` wrapped around and ran the loop
// roughly 2^32 times in builds where assert is compiled out.)
//
// No emms is issued here; per the note at the top of the file that is done
// elsewhere.
void YUVi_to_BGRb_lines_mmx(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
{
int yz;

	if ( h < 1 )
		return;		// no rows: nothing to do

	assert( w > 0 );

	if ( w < 1 )
		return;		// no pixels per row: nothing to do

	for(yz=0;yz<h;yz++)
	{
	int *line1,*line2,*line3;
	uint8 * bline;

		line1 = Ylines[yz];
		line2 = Ulines[yz];
		line3 = Vlines[yz];
		bline = BGRptr;
		BGRptr += BGRstride;

		cachetouch_r(line1,w>>3);
		cachetouch_r(line2,w>>3);
		cachetouch_r(line3,w>>3);
		cachetouch_w(bline,(w*3)>>5);

		__asm
		{
		mov ecx,w
		mov edi,bline

		movq mm3,Const_V_16
		movq mm4,Const_U_16

		sub ecx,1			// ecx = number of pixels done by the 4-byte-store loop
		jz LastPixel		// w == 1 : only the 3-byte tail is needed

		More:

			/**
			*
			*	ecx is the number of loop pixels remaining
			*	edi is the destination pointer
			*
			*	eax is (V<<2)-509
			*	ebx is (U<<2)-509
			*	edx is Y
			*
			*	the multiply coefficients are in 14 bits, then we rshr 16 via pmulhw
			*
			*	mm0 is four V int16's, multiplied by their coefficients (mm3)
			*	mm1 is four U int16's, multiplied by their coefficients (mm4)
			*	mm2 is four Y int16's
			*
			*	XRGB = mm0 + mm1 + mm2
			*
			*	we're taking about 45 clocks
			*	my manual count indicates we could take about 37 if we were perfect
			*/

			/*
			*
			* MMX optimization notes:
			*	1. there is only one MMX pack/unpack unit
			*	2. there is only one MMX multiply unit
			*	3. MMX instructions that use memory or integers use port 0 only
			*	4. all MMX instructions are 1 clock except multiply, which is 3
			*/

			mov			eax,line3	// V
			mov			eax,[eax]	// eax = v; hard stall on eax, inevitable
			add			line3,4		// no stall on line3

			shl			eax,2		// V<<=2

			mov			ebx,line2	// U

			// bias: (V<<2)-509 rather than ((V-127)<<2) == (V<<2)-508
			// NOTE(review): the other MMX routines in this file subtract 510;
			// confirm which bias is intended.
			sub			eax,509

			mov			ebx,[ebx]	// ebx = u
			add			line2,4

			movd		mm0,eax		// mm0 = [0][v]

			shl			ebx,2

			punpckldq	mm0,mm0		// mm0 = [v][v]

			sub			ebx,509

			packssdw	mm0,mm0		// mm0 = [v][v][v][v]

			movd		mm1,ebx		// mm1 = [0][u]

			mov			edx,line1	// Y

			pmulhw		mm0,mm3		// keep only high words; same as multiplying in 32 bits and doing >>16

			// put some non-dependent stuff after the multiply:

			mov			edx,[edx]	// edx = y

			punpckldq	mm1,mm1		// mm1 = [u][u]

			movd		mm2,edx		// mm2 = [0][y]

			packssdw	mm1,mm1		// mm1 = [u][u][u][u]

			// these two packs cannot pair!

			punpckldq	mm2,mm2		// mm2 = [y][y]

			pmulhw		mm1,mm4

			// put some stuff after the multiply:

			add			line1,4
			packssdw	mm2,mm2		// mm2 = [y][y][y][y]

			// now XRGB = mm0 + mm1 + mm2

			paddsw		mm0,mm1

			paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

			// convert the four int16s to eight bytes; also do a clamp(0,255) for free!

			packuswb	mm0,mm0		// hard stall on mm0, inevitable

			movd		[edi],mm0	// hard stall on mm0, then unaligned write! bad!
			add			edi,3		// no stall on edi; the 4th byte is overwritten by the next pixel

		dec ecx
		jnz More

		LastPixel:

			//{		one last one that doesn't write 4->3
			mov eax,line3			// V
			mov eax,[eax]
			add line3,4

			shl			eax,2
			sub			eax,509
			movd		mm0,eax		// mm0 = [0][v]
			punpckldq	mm0,mm0		// mm0 = [v][v]
			packssdw	mm0,mm0		// mm0 = [v][v][v][v]
			pmulhw		mm0,mm3

			mov ebx,line2			// U
			mov ebx,[ebx]
			add line2,4

			shl			ebx,2
			sub			ebx,509
			movd		mm1,ebx		// mm1 = [0][u]
			punpckldq	mm1,mm1		// mm1 = [u][u]
			packssdw	mm1,mm1		// mm1 = [u][u][u][u]
			pmulhw		mm1,mm4

			mov edx,line1			// Y
			mov edx,[edx]
			add line1,4

			movd		mm2,edx		// mm2 = [0][y]
			punpckldq	mm2,mm2		// mm2 = [y][y]
			packssdw	mm2,mm2		// mm2 = [y][y][y][y]

			paddsw		mm0,mm1
			paddsw		mm0,mm2

			packuswb	mm0,mm0

			movd		eax,mm0		// eax is XRGB
			mov			[edi],ax	// first two bytes
			shr			eax,16
			mov			[edi+2],al	// third byte; the 4th is never stored
			//}
		}
	}

	//__asm { emms }
}



// MMX blitter for one row: converts `len` samples from three integer planes
// (line1 = Y, line2 = U, line3 = V) into packed 24-bit pixels at `bline`.
//
// The MMX loop stores 4 bytes per pixel and advances the destination by 3, so
// it handles only the first len-1 pixels; the final pixel is converted in C
// and written as exactly 3 bytes (blue, green, red) so nothing is stored past
// the end of the row.
//
// Robustness: len < 1 is a no-op and len == 1 skips the MMX loop entirely.
// (Previously len <= 1 made the loop counter start at <= 0, so
// `dec ecx / jnz` wrapped around and ran the loop roughly 2^32 times in builds
// where assert is compiled out.)
//
// No emms is issued here; per the note at the top of the file that is done
// elsewhere.
void YUVi_to_BGRb_line_mmx2(int *line1,int *line2,int *line3,uint8 * bline,int len)
{
	assert(len > 0 );

	if ( len < 1 )
		return;		// nothing to convert

	len --;			// the last pixel is done in C below

	cachetouch_r(line1,len>>3);
	cachetouch_r(line2,len>>3);
	cachetouch_r(line3,len>>3);
	cachetouch_w(bline,(len*3)>>5);

	if ( len > 0 )	// never enter the asm loop with a zero count
	{
	__asm
	{

	mov ecx,len
	mov edi,bline

	movq mm3,Const_V_16
	movq mm4,Const_U_16

	YUVi_to_BGRb_line_mmx2_More:

		mov			eax,line3			// V
		mov			eax,[eax]			// hard stall on eax, inevitable
		add			line3,4				// no stall on line3

		shl			eax,2

		mov			ebx,line2			// U

		sub			eax,510				// eax = (V<<2)-510

		mov			ebx,[ebx]
		add			line2,4

		movd		mm0,eax		// mm0 = [0][v]

		shl			ebx,2

		punpckldq	mm0,mm0		// mm0 = [v][v]

		sub			ebx,510		// ebx = (U<<2)-510

		packssdw	mm0,mm0		// mm0 = [v][v][v][v]

		movd		mm1,ebx		// mm1 = [0][u]

		pmulhw		mm0,mm3		// V times its per-channel coefficients, high words kept
		mov			edx,line1			// Y

		punpckldq	mm1,mm1		// mm1 = [u][u]

		mov			edx,[edx]

		packssdw	mm1,mm1		// mm1 = [u][u][u][u]

		movd		mm2,edx		// mm2 = [0][y]

		add			line1,4

		punpckldq	mm2,mm2		// mm2 = [y][y]

		pmulhw		mm1,mm4		// U times its per-channel coefficients, high words kept

		packssdw	mm2,mm2		// mm2 = [y][y][y][y]

		paddsw		mm0,mm1

		paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

		packuswb	mm0,mm0		// hard stall on mm0, inevitable; saturates to 0..255

		movd		[edi],mm0	// unaligned write! bad!
		add			edi,3		// no stall on edi; the 4th byte is overwritten by the next pixel

	dec ecx
	jnz YUVi_to_BGRb_line_mmx2_More

	mov bline,edi				// hand the advanced destination back to the C tail

	//emms
	}
	}

	// Final pixel in plain C: line1/line2/line3 and bline now point at it
	// (they are unchanged from the arguments when len was 1).
	{
	int y,u,v,r,g,b;
	y = (*line1);
	u = (*line2) - 127;
	v = (*line3) - 127;

	r = R_YUV(y,u,v);
	g = G_YUV(y,u,v);
	b = B_YUV(y,u,v);

	r = minmax(r,0,255);
	g = minmax(g,0,255);
	b = minmax(b,0,255);

	bline[0] = b;
	bline[1] = g;
	bline[2] = r;
	}
}



// MMX blitter for one row: converts `len` samples from three integer planes
// (line1 = Y, line2 = U, line3 = V) into 4-byte pixels at `bline`.  Every
// pixel is stored as a full 32-bit write, so all four bytes of each output
// pixel are overwritten.
//
// Robustness: len < 1 is a no-op, matching the C fallback that can be reached
// through the same function pointer.  (Previously len <= 0 made
// `dec ecx / jnz` wrap around and run the loop roughly 2^32 times in builds
// where assert is compiled out.)
//
// No emms is issued here; per the note at the top of the file that is done
// elsewhere.
void YUVi_to_XRGB_line_mmx(int *line1,int *line2,int *line3,uint8 * bline,int len)
{
	assert(len > 0 );

	if ( len < 1 )
		return;		// nothing to convert; never enter the asm loop with a zero count

	cachetouch_r(line1,len>>3);
	cachetouch_r(line2,len>>3);
	cachetouch_r(line3,len>>3);
	cachetouch_w(bline,len>>3);

	__asm
	{

	mov ecx,len
	mov edi,bline

	movq mm3,Const_V_16
	movq mm4,Const_U_16

	More:

		mov			eax,line3			// V
		mov			eax,[eax]			// hard stall on eax, inevitable
		add			line3,4				// no stall on line3

		shl			eax,2

		mov			ebx,line2			// U

		sub			eax,510				// eax = (V<<2)-510

		mov			ebx,[ebx]
		add			line2,4

		movd		mm0,eax		// mm0 = [0][v]

		shl			ebx,2

		punpckldq	mm0,mm0		// mm0 = [v][v]

		sub			ebx,510		// ebx = (U<<2)-510

		packssdw	mm0,mm0		// mm0 = [v][v][v][v]

		movd		mm1,ebx		// mm1 = [0][u]

		pmulhw		mm0,mm3		// V times its per-channel coefficients, high words kept
		mov			edx,line1			// Y

		punpckldq	mm1,mm1		// mm1 = [u][u]

		mov			edx,[edx]

		packssdw	mm1,mm1		// mm1 = [u][u][u][u]

		movd		mm2,edx		// mm2 = [0][y]

		add			line1,4

		punpckldq	mm2,mm2		// mm2 = [y][y]

		pmulhw		mm1,mm4		// U times its per-channel coefficients, high words kept

		packssdw	mm2,mm2		// mm2 = [y][y][y][y]

		paddsw		mm0,mm1

		paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

		packuswb	mm0,mm0		// hard stall on mm0, inevitable; saturates to 0..255

		movd		[edi],mm0	// store all four bytes of the pixel
		add			edi,4		// no stall on edi

	dec ecx
	jnz More

	//emms
	}
}



/*}{******* CPU setup ***********/



// Blitter entry points selected at run time.  Both start out NULL and are
// assigned by SetupYUV(), which picks the MMX or the plain C implementation.
void (*YUVi_to_XRGB_line)(int *line1,int *line2,int *line3,uint8 * bline,int len) = NULL;

void (*YUVi_to_BGRb_lines)(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride) = NULL;



// Queries the CPU and points YUVi_to_XRGB_line / YUVi_to_BGRb_lines at the MMX
// implementations when JE_CPU_HAS_MMX is set in jeCPU_Features, otherwise at
// the plain C implementations.
//
// Author's timings on hare512.bmp:
//	YUVi_to_BGRb_lines_mmx : blit 0.0245 seconds = 45.9 clocks / pixel
//	YUVi_to_BGRb_line_mmx1 : blit 0.025  seconds = 47.2 clocks / pixel
//	YUVi_to_BGRb_line_mmx2 : blit 0.025  seconds = 47.2 clocks / pixel
//	YUVi_to_BGRb_line_c    : blit 0.034  seconds = 66.6 clocks / pixel
void SetupYUV(void)
{
	int haveMMX;

	jeCPU_GetInfo();

	haveMMX = ( jeCPU_Features & JE_CPU_HAS_MMX ) ? 1 : 0;

	YUVi_to_XRGB_line  = haveMMX ? YUVi_to_XRGB_line_mmx  : YUVi_to_XRGB_line_c;
	YUVi_to_BGRb_lines = haveMMX ? YUVi_to_BGRb_lines_mmx : YUVi_to_BGRb_lines_c;
}



/*}******* EOF ***********/