YUV.c - This C code implements a function `YUVi_to_XRGB_line`

/branches/jet3d_dev_msvc2005/source/Engine/JetEngine/Bitmap/Compression/YUV.c

# · C · 1021 lines · 749 code · 196 blank · 76 comment · 10 complexity · feec26ced659f2367447079cf4b77f5a MD5 · raw file

/****************************************************************************************/
/*  Yuv                                                                                 */
/*                                                                                      */
/*  Author: Charles Bloom                                                               */
/*  Description:  YUV <-> RGB code                                                      */
/*                                                                                      */
/*  The contents of this file are subject to the Jet3D Public License                   */
/*  Version 1.02 (the "License"); you may not use this file except in                   */
/*  compliance with the License. You may obtain a copy of the License at                */
/*  http://www.jet3d.com                                                                */
/*                                                                                      */
/*  Software distributed under the License is distributed on an "AS IS"                 */
/*  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See                */
/*  the License for the specific language governing rights and limitations              */
/*  under the License.                                                                  */
/*                                                                                      */
/*  The Original Code is Jet3D, released December 12, 1999.                             */
/*  Copyright (C) 1996-1999 Eclipse Entertainment, L.L.C. All Rights Reserved           */
/*                                                                                      */
/****************************************************************************************/
#include <assert.h>

#include "YUV.h"
#include "Utility.h"
#include "Cpu.h"

#ifdef BUILD_BE
#include <inttypes.h>
#define __int64 uint64_t
#endif

#pragma warning(disable : 4244)	// int -> uint8 conversions abound

#pragma warning(disable : 4799)	// I know we've got no emms; it's done in wavelet.c

/*}{******* RGB <-> YUV in C ***********/

/* Convert one packed 8-bit RGB triple into a packed 8-bit YUV triple.
 * The U and V results get 127 added before they are stored as bytes. */
void RGBb_to_YUVb(const uint8 *RGB,uint8 *YUV)
{
	const int red   = RGB[0];
	const int green = RGB[1];
	const int blue  = RGB[2];

	/* all three inputs are read before the first store */
	YUV[0] = Y_RGB(red,green,blue);
	YUV[1] = U_RGB(red,green,blue) + 127;
	YUV[2] = V_RGB(red,green,blue) + 127;
}

/* Convert one packed 8-bit YUV triple back into a packed 8-bit RGB triple.
 * 127 is subtracted from U and V first; each channel is clamped to 0..255. */
void YUVb_to_RGBb(const uint8 *YUV,uint8 *RGB)
{
	const int luma = YUV[0];
	const int cu   = YUV[1] - 127;
	const int cv   = YUV[2] - 127;
	int red,green,blue;

	red   = R_YUV(luma,cu,cv);
	green = G_YUV(luma,cu,cv);
	blue  = B_YUV(luma,cu,cv);

	// The clamp is required: not every YUV triple is reachable from
	// RGBb_to_YUVb, so the inverse can land outside 0..255.
	RGB[0] = minmax(red,0,255);
	RGB[1] = minmax(green,0,255);
	RGB[2] = minmax(blue,0,255);
}


/* Convert len packed RGB byte triples into len packed YUV byte triples.
 * Each pixel is fully read before its three output bytes are written. */
void RGBb_to_YUVb_line(const uint8 *RGB,uint8 *YUV,int len)
{
	for( ; len != 0; --len, RGB += 3, YUV += 3 )
	{
		const int red   = RGB[0];
		const int green = RGB[1];
		const int blue  = RGB[2];

		YUV[0] = Y_RGB(red,green,blue);
		YUV[1] = U_RGB(red,green,blue) + 127;
		YUV[2] = V_RGB(red,green,blue) + 127;
	}
}

/* Convert len packed YUV byte triples into len packed RGB byte triples.
 * 127 is subtracted from U and V; every output channel is clamped to 0..255. */
void YUVb_to_RGBb_line(const uint8 *YUV,uint8 *RGB,int len)
{
	for( ; len != 0; --len, YUV += 3, RGB += 3 )
	{
		const int luma = YUV[0];
		const int cu   = YUV[1] - 127;
		const int cv   = YUV[2] - 127;
		int red,green,blue;

		red   = R_YUV(luma,cu,cv);
		green = G_YUV(luma,cu,cv);
		blue  = B_YUV(luma,cu,cv);

		// clamp: parts of YUV space are never produced by RGBb_to_YUVb,
		// so the inverse may leave the 0..255 range
		RGB[0] = minmax(red,0,255);
		RGB[1] = minmax(green,0,255);
		RGB[2] = minmax(blue,0,255);
	}
}


/* Convert one packed RGB byte triple into three separate int outputs.
 * U and V get 127 added; debug builds assert each result lies in 0..255. */
void RGBb_to_YUVi(const uint8 *RGB,int *Y,int *U,int *V)
{
	const int red   = RGB[0];
	const int green = RGB[1];
	const int blue  = RGB[2];

	*Y = Y_RGB(red,green,blue);
	*U = U_RGB(red,green,blue) + 127;
	*V = V_RGB(red,green,blue) + 127;

	assert( isinrange(*Y,0,255) );
	assert( isinrange(*U,0,255) );
	assert( isinrange(*V,0,255) );
}

/* Convert int y,u,v into one packed RGB byte triple.
 * The inputs are deliberately not range-checked: the wavelet can push them
 * outside 0..255, which is why every output channel is clamped. */
void YUVi_to_RGBb(int y,int u,int v,uint8 *RGB)
{
	const int cu = u - 127;
	const int cv = v - 127;
	int red,green,blue;

	// effectively a matrix multiply of (y,cu,cv)
	red   = R_YUV(y,cu,cv);
	green = G_YUV(y,cu,cv);
	blue  = B_YUV(y,cu,cv);

	RGB[0] = minmax(red,0,255);
	RGB[1] = minmax(green,0,255);
	RGB[2] = minmax(blue,0,255);
}

/* Convert int R,G,B into int Y,U,V (U and V get 127 added).
 * Debug builds assert that both the inputs and the outputs lie in 0..255. */
void RGBi_to_YUVi(int R,int G,int B,int *Y,int *U,int *V)
{
	int luma,cu,cv;

	assert( isinrange(R,0,255) );
	assert( isinrange(G,0,255) );
	assert( isinrange(B,0,255) );

	luma = Y_RGB(R,G,B);
	cu = U_RGB(R,G,B) + 127;
	cv = V_RGB(R,G,B) + 127;

	*Y = luma;
	*U = cu;
	*V = cv;

	assert( isinrange(*Y,0,255) );
	assert( isinrange(*U,0,255) );
	assert( isinrange(*V,0,255) );
}

/* Convert int y,u,v into int R,G,B, each clamped to 0..255.
 * The inputs are not asserted to be in range because the wavelet can push
 * them outside 0..255. */
void YUVi_to_RGBi(int y,int u,int v,int *R,int *G,int *B)
{
	const int cu = u - 127;
	const int cv = v - 127;
	int red,green,blue;

	// effectively a matrix multiply of (y,cu,cv)
	red   = R_YUV(y,cu,cv);
	green = G_YUV(y,cu,cv);
	blue  = B_YUV(y,cu,cv);

	// all three channels are computed before the first store
	*R = minmax(red,0,255);
	*G = minmax(green,0,255);
	*B = minmax(blue,0,255);
}

/* In-place conversion of three int lines: on entry line1/line2/line3 hold
 * Y/U/V, on exit they hold R/G/B clamped to 0..255. len is the sample count. */
void YUVi_to_RGBi_line(int *line1,int *line2,int *line3,int len)
{
int red,green,blue;

	// <> use MMX

	cachetouch_w(line1,len>>3);
	cachetouch_w(line2,len>>3);
	cachetouch_w(line3,len>>3);

	for( ; len != 0; --len, ++line1, ++line2, ++line3 )
	{
		const int luma = *line1;
		const int cu   = *line2 - 127;
		const int cv   = *line3 - 127;

		red   = R_YUV(luma,cu,cv);
		green = G_YUV(luma,cu,cv);
		blue  = B_YUV(luma,cu,cv);

		red   = minmax(red,0,255);
		green = minmax(green,0,255);
		blue  = minmax(blue,0,255);

		// overwrite the inputs only after all three were read
		*line1 = red;
		*line2 = green;
		*line3 = blue;
	}
}

/* Plain C line converter: reads ilen int samples from each of the Y/U/V
 * lines and writes ilen packed 3-byte pixels, stored in B,G,R order. */
void YUVi_to_BGRb_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
{
int red,green,blue;

	cachetouch_r(iline1,ilen>>3);
	cachetouch_r(iline2,ilen>>3);
	cachetouch_r(iline3,ilen>>3);
	cachetouch_w(ibline,(ilen*3)>>5);

	for( ; ilen != 0; --ilen, ibline += 3 )
	{
		const int luma = *iline1++;
		const int cu   = (*iline2++) - 127;
		const int cv   = (*iline3++) - 127;

		red   = R_YUV(luma,cu,cv);
		green = G_YUV(luma,cu,cv);
		blue  = B_YUV(luma,cu,cv);

		red   = minmax(red,0,255);
		green = minmax(green,0,255);
		blue  = minmax(blue,0,255);

		ibline[0] = blue;
		ibline[1] = green;
		ibline[2] = red;
	}
}

/* Plain C multi-row converter: for each of h rows, reads w int samples from
 * Ylines[row]/Ulines[row]/Vlines[row] and writes w packed 3-byte pixels in
 * B,G,R order. The output pointer moves on by BGRstride bytes after each row. */
void YUVi_to_BGRb_lines_c(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
{
int row = 0;

	while( row < h )
	{
	int *srcY = Ylines[row];
	int *srcU = Ulines[row];
	int *srcV = Vlines[row];
	uint8 *dst = BGRptr;
	int remaining = w;
	int red,green,blue;

		cachetouch_r(srcY,remaining>>3);
		cachetouch_r(srcU,remaining>>3);
		cachetouch_r(srcV,remaining>>3);
		cachetouch_w(dst,(remaining*3)>>5);

		for( ; remaining != 0; --remaining, dst += 3 )
		{
			const int luma = *srcY++;
			const int cu   = (*srcU++) - 127;
			const int cv   = (*srcV++) - 127;

			red   = R_YUV(luma,cu,cv);
			green = G_YUV(luma,cu,cv);
			blue  = B_YUV(luma,cu,cv);

			red   = minmax(red,0,255);
			green = minmax(green,0,255);
			blue  = minmax(blue,0,255);

			dst[0] = blue;
			dst[1] = green;
			dst[2] = red;
		}

		BGRptr += BGRstride;
		row++;
	}
}

/* Plain C line converter to 4-byte pixels: reads ilen int samples from each
 * of the Y/U/V lines and stores B,G,R into bytes 0..2 of every 4-byte output
 * pixel. Byte 3 of each pixel is not written by this routine. */
void YUVi_to_XRGB_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
{
int red,green,blue;

	cachetouch_r(iline1,ilen>>3);
	cachetouch_r(iline2,ilen>>3);
	cachetouch_r(iline3,ilen>>3);
	cachetouch_w(ibline,ilen>>3);

	for( ; ilen != 0; --ilen, ibline += 4 )
	{
		const int luma = *iline1++;
		const int cu   = (*iline2++) - 127;
		const int cv   = (*iline3++) - 127;

		red   = R_YUV(luma,cu,cv);
		green = G_YUV(luma,cu,cv);
		blue  = B_YUV(luma,cu,cv);

		red   = minmax(red,0,255);
		green = minmax(green,0,255);
		blue  = minmax(blue,0,255);

		ibline[0] = blue;
		ibline[1] = green;
		ibline[2] = red;
	}
}

/*}{******* MMX YUV -> BGR blitters ***********/

// 64-bit multiplier constants for the MMX blitters below: they are loaded into
// mm3 (Const_V_16) and mm4 (Const_U_16) and used as the pmulhw operand against
// the replicated V and U values.
// Viewed as four 16-bit words, low word first:
//	Const_V_16 = 0x0000, 0x2DB5, 0xA646, 0x0000
//	Const_U_16 = 0x7168, 0xE9FA, 0x0000, 0x0000
// NOTE(review): Const_V_16 is only defined when BUILD_BE or WIN32 is defined.
#ifdef BUILD_BE // necessary due to integer overflow on the various systems..
// 2789617077 is 0xA6462DB5; the two multiplies by 256 shift it up 16 bits
static const __int64 Const_V_16 = 2789617077 * 256 * 256;//0x0000A6462DB50000;
#endif
#ifdef WIN32
static const __int64 Const_V_16 = 0x0000A6462DB50000;
#endif

static const __int64 Const_U_16 = 0x00000000E9FA7168;

/*
 * MMX multi-row blitter: converts h rows of w pixels from separate int
 * Y/U/V lines (Ylines[row], Ulines[row], Vlines[row]) into packed 3-byte
 * pixels starting at BGRptr; the output advances BGRstride bytes per row.
 *
 * The first w-1 pixels of a row are stored with a 4-byte movd followed by a
 * 3-byte pointer advance, so the extra byte is overwritten by the next pixel.
 * The last pixel is stored as a 16-bit write plus an 8-bit write so that
 * nothing is written past the row's final byte.
 *
 * w must be > 1 (asserted together with h > 1): the loop counter starts at
 * w-1 and is only tested after being decremented.
 *
 * No emms is executed here (it is left commented out at the end).
 * The conversion itself only exists when WIN32 or BUILD_BE is defined.
 *
 * Fixes to the BUILD_BE (AT&T syntax, "source, destination") port, each made
 * to match the WIN32 instruction shown in the trailing comment:
 *	- the in-loop V multiply now targets mm0 instead of overwriting the
 *	  coefficients held in mm3
 *	- the tail now moves mm0 into eax and stores ax to [edi]; both transfers
 *	  were reversed
 *	- "memory" was added to the clobber list because the asm writes the
 *	  pixel buffer and advances the line pointers held in memory operands
 */
void YUVi_to_BGRb_lines_mmx(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
{
int yz;

	for(yz=0;yz<h;yz++)
	{
	int *line1,*line2,*line3;
	uint8 * bline;

		line1 = Ylines[yz];
		line2 = Ulines[yz];
		line3 = Vlines[yz];
		bline = BGRptr;
		BGRptr += BGRstride;

		assert(w > 1 && h > 1 );

		cachetouch_r(line1,w>>3);
		cachetouch_r(line2,w>>3);
		cachetouch_r(line3,w>>3);
		cachetouch_w(bline,(w*3)>>5);
			

#ifdef WIN32
		__asm
		{
		mov ecx,w
		sub ecx,1
		mov edi,bline

		movq mm3,Const_V_16
		movq mm4,Const_U_16

		More:		

			/**
			*
			*	ecx is width
			*	edi is BGRptr
			*
			*	eax is (V<<2)-509
			*	ebx is (U<<2)-509
			*	edx is Y
			*
			*	the multiply coefficients are in 14 bits, then we rshr 16 via mulhw
			*
			*	mm0 is four V int16's, multiplied by their coefficients (mm3)
			*	mm1 is four U int16's, multiplied by their coefficients (mm4)
			*	mm2 is four Y int16's
			*
			*	XRGB = mm0 + mm1 + mm2
			*
			*	we're taking about 45 clocks
			*	my manual count indicates we could take about 37 if we were perfect
			*/

			/*
			*
			* MMX optimization notes:
			*	1. there is only one MMX pack/unpack unit
			*	2. there is only one MMX multiply unit
			*	3. MMX instructions that use memory or integers use port 0 only
			*	4. all MMX instructions are 1 clock except multiply, which is 3
			*/

			mov			eax,line3	// V
			mov			eax,[eax]	// eax = v; hard stall on eax, inevitable
			add			line3,4		// no stall on line3

			shl			eax,2		// V<<=2
			
			mov			ebx,line2	// U

			sub			eax,509		// do ((V<<2)-510) instead of ((V-127)<<2)

			mov			ebx,[ebx]	// ebx = u
			add			line2,4

			movd		mm0,eax		// mm0 = [0][v]
			
			shl			ebx,2

			punpckldq	mm0,mm0		// mm0 = [v][v]
			
			sub			ebx,509

			packssdw	mm0,mm0		// mm0 = [v][v][v][v]
			
			movd		mm1,ebx		// mm1 = [0][u]

			mov			edx,line1	// Y

			pmulhw		mm0,mm3		// keep only high words; same as multiplying in 32 bits and doing >>16
			
			// put some non-dependent stuff after the multiply:

			mov			edx,[edx]	// edx = y			

			punpckldq	mm1,mm1		// mm1 = [u][u]

			movd		mm2,edx		// mm2 = [0][y]

			packssdw	mm1,mm1		// mm1 = [u][u][u][u]			

			// these two packs cannot pair!

			punpckldq	mm2,mm2		// mm2 = [y][y]
			
			pmulhw		mm1,mm4

			// put some stuff after the multiply:

			add			line1,4
			packssdw	mm2,mm2		// mm2 = [y][y][y][y]

			// now XRGB = mm0 + mm1 + mm2

			paddsw		mm0,mm1

			paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

			// convert the four int16s to eight bytes; also do a clamp(0,255) for free!

			packuswb	mm0,mm0		// hard stall on mm0, inevitable

			movd		[edi],mm0	// hard stall on mm0, then unaligned write! bad!
			add			edi,3		// no stall on edi

		dec ecx
		jnz More

			//{		one last one that doesn't write 4->3
			mov eax,line3			// V
			mov eax,[eax]
			add line3,4

			shl			eax,2
			sub			eax,509
			movd		mm0,eax		// mm0 = [0][x]
			punpckldq	mm0,mm0		// mm0 = [x][x]
			packssdw	mm0,mm0		// mm0 = [x][x][x][x]
			pmulhw		mm0,mm3
			
			mov ebx,line2			// U
			mov ebx,[ebx]
			add line2,4

			shl			ebx,2
			sub			ebx,509
			movd		mm1,ebx		// mm0 = [0][x]
			punpckldq	mm1,mm1		// mm0 = [x][x]
			packssdw	mm1,mm1		// mm0 = [x][x][x][x]
			pmulhw		mm1,mm4

			mov edx,line1			// Y
			mov edx,[edx]
			add line1,4

			movd		mm2,edx		// mm0 = [0][x]
			punpckldq	mm2,mm2		// mm0 = [x][x]
			packssdw	mm2,mm2		// mm0 = [x][x][x][x]

			paddsw		mm0,mm1
			paddsw		mm0,mm2

			packuswb	mm0,mm0

			movd		eax,mm0		// eax is XRGB
			mov			[edi],ax
			shr			eax,16
			mov			[edi+2],al
			//}
		}
#endif

#ifdef BUILD_BE
		__asm__ __volatile__ ("
		movl %0, %%ecx 				// %0 = w ; mov ecx,w
		subl $1, %%ecx				// sub ecx,1
		movl %1, %%edi				// %1 = bline ; mov edi,bline

		movq %5, %%mm3 //Const_V_16, %%mm3				// %5 = Const_V_16 ; movq mm3,Const_V_16
		movq %6, %%mm4 // Const_U_16, %%mm4				// %6 = Const_U_16 ; movq mm4,Const_U_16

		More:		

			///**
			//*
			//*	ecx is width
			//*	edi is BGRptr
			//*
			//*	eax is (V<<2)-509
			//*	ebx is (U<<2)-509
			//*	edx is Y
			//*
			//*	the multiply coefficients are in 14 bits, then we rshr 16 via mulhw
			//*
			//*	mm0 is four V int16's, multiplied by their coefficients (mm3)
			//*	mm1 is four U int16's, multiplied by their coefficients (mm4)
			//*	mm2 is four Y int16's
			//*
			//*	XRGB = mm0 + mm1 + mm2
			//*
			//*	we're taking about 45 clocks
			//*	my manual count indicates we could take about 37 if we were perfect
			//*/

			///*
			//*
			//* MMX optimization notes:
			//*	1. there is only one MMX pack/unpack unit
			//*	2. there is only one MMX multiply unit
			//*	3. MMX instructions that use memory or integers use port 0 only
			//*	4. all MMX instructions are 1 clock except multiply, which is 3
			//*/

			movl		%2, %%eax			// %2 = line 3 ;mov			eax,line3	// V
			movl		(%%eax), %%eax 		//mov			eax,[eax]	// eax = v; hard stall on eax, inevitable
			addl		$4, %2				// %2 = line 3 add			line3,4		// no stall on line3

			shll		$2, %%eax			//shl			eax,2		// V<<=2
			
			movl		%3, %%ebx			// %3 = line2 mov			ebx,line2	// U

			subl		$509, %%eax			// sub			eax,509		// do ((V<<2)-510) instead of ((V-127)<<2)

			movl		(%%ebx), %%ebx		// mov			ebx,[ebx]	// ebx = u
			addl		$4, %3			// %3 = line2 ;add			line2,4

			movd		%%eax, %%mm0		//;movd		mm0,eax		// mm0 = [0][v]
			
			shll		$2, %%ebx			// shl			ebx,2

			punpckldq %%mm0, %%mm0			//;punpckldq	mm0,mm0		// mm0 = [v][v]
			
			subl 	$509, %%ebx				//sub			ebx,509

			packssdw	%%mm0, %%mm0 		//;packssdw	mm0,mm0		// mm0 = [v][v][v][v]
			
			movd	%%ebx, %%mm1			//;movd		mm1,ebx		// mm1 = [0][u]

			movl	%4, %%edx			//%4 = line1;mov			edx,line1	// Y

			pmulhw		%%mm3, %%mm0		// pmulhw		mm0,mm3		// keep only high words; same as multiplying in 32 bits and doing >>16
			
			// put some non-dependent stuff after the multiply:

			movl	(%%edx), %%edx			//;mov			edx,[edx]	// edx = y			

			punpckldq	%%mm1,%%mm1			//;punpckldq	mm1,mm1		// mm1 = [u][u]

			movd	%%edx, %%mm2			//;movd		mm2,edx		// mm2 = [0][y]

			packssdw		%%mm1,%%mm1		//;packssdw	mm1,mm1		// mm1 = [u][u][u][u]			

			// these two packs cannot pair!

			punpckldq	%%mm2,%%mm2			//;punpckldq	mm2,mm2		// mm2 = [y][y]
			
			pmulhw	%%mm4, %%mm1			// pmulhw		mm1,mm4

			// put some stuff after the multiply:

			addl $4, %4						// ;add			line1,4
			packssdw	%%mm2, %%mm2		// ;packssdw	mm2,mm2		// mm2 = [y][y][y][y]

			// now XRGB = mm0 + mm1 + mm2

			paddsw	%%mm1, %%mm0		// paddsw		mm0,mm1

			paddsw	%%mm2, %%mm0		// ;paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

			// convert the four int16s to eight bytes; also do a clamp(0,255) for free!

			packuswb %%mm0, %%mm0 		//; packuswb	mm0,mm0		// hard stall on mm0, inevitable

			movd %%mm0, (%%edi)			//; movd		[edi],mm0	// hard stall on mm0, then unaligned write! bad!
			addl $3, %%edi				//;add			edi,3		// no stall on edi

		dec %%ecx	//;dec ecx
		jnz More

			//{		one last one that doesn't write 4->3
			movl %2, %%eax				//;mov eax,line3			// V
			movl (%%eax), %%eax			//;mov eax,[eax]
			addl $4, %2					//;add line3,4

			shl		$2, %%eax			//;shl			eax,2
			subl		$509, %%eax		//;sub			eax,509
			movd %%eax, %%mm0		//;movd		mm0,eax		// mm0 = [0][x]
			punpckldq %%mm0, %%mm0  //;punpckldq	mm0,mm0		// mm0 = [x][x]
			packssdw	%%mm0, %%mm0  //;packssdw	mm0,mm0		// mm0 = [x][x][x][x]
			pmulhw	%%mm3, %%mm0	//;pmulhw		mm0,mm3
			
			movl %3, %%ebx			//;mov ebx,line2			// U
			movl (%%ebx), %%ebx		//;mov ebx,[ebx]
			addl $4, %3				//;add line2,4

			shll $2, %%ebx			//;shl			ebx,2
			subl $509, %%ebx		//;sub			ebx,509
			movd %%ebx, %%mm1		//;movd		mm1,ebx		// mm0 = [0][x]
			punpckldq %%mm1, %%mm1	//;punpckldq	mm1,mm1		// mm0 = [x][x]
			packssdw %%mm1, %%mm1	//;packssdw	mm1,mm1		// mm0 = [x][x][x][x]
			pmulhw %%mm4, %%mm1		//;pmulhw		mm1,mm4

			movl %4, %%edx			//;mov edx,line1			// Y
			movl (%%edx), %%edx		//;mov edx,[edx]
			addl $4, %4             //;add line1,4

			movd %%edx, %%mm2		//;movd		mm2,edx		// mm0 = [0][x]
			punpckldq	%%mm2,%%mm2	//;punpckldq	mm2,mm2		// mm0 = [x][x]
			packssdw %%mm2, %%mm2	//;packssdw	mm2,mm2		// mm0 = [x][x][x][x]

			paddsw %%mm1, %%mm0		//;paddsw		mm0,mm1
			paddsw %%mm2, %%mm0		//;paddsw		mm0,mm2

			packuswb %%mm0, %%mm0	//;packuswb	mm0,mm0

			movd %%mm0, %%eax		//;movd		eax,mm0		// eax is XRGB
			movw %%ax, (%%edi)		//;mov			[edi],ax
			shr $16, %%eax			//;shr			eax,16
			mov %%al, 2(%%edi)		//;mov			[edi+2],al
			" : // outputs
			  : "m" (w), "m" (bline), "m" (line3), "m" (line2), "m" (line1), "m" (Const_V_16), "m" (Const_U_16)// inputs
			  : "%edi", "%edx", "%eax", "%ebx", "%ecx", "memory" );// clobbered
#endif

	}

	//__asm { emms }	
}

/*
 * MMX single-line blitter: converts len pixels from the int Y/U/V lines
 * (line1/line2/line3) into packed 3-byte pixels at bline.
 *
 * len must be > 1 (asserted). The asm loop converts the first len-1 pixels,
 * storing 4 bytes per pixel while advancing the output by 3, so each extra
 * byte is overwritten by the following pixel. The asm advances line1, line2,
 * line3 and bline as it goes; the plain C tail then converts the final pixel
 * with exact 3-byte stores so nothing past the end of the output is written.
 *
 * This routine biases U and V with (x<<2)-510.
 * No emms is executed here (commented out in the WIN32 block).
 * The asm loop only exists when WIN32 or BUILD_BE is defined.
 *
 * Fixes to the BUILD_BE (AT&T syntax, "source, destination") port, each made
 * to match the WIN32 instruction shown in the trailing comment:
 *	- "add line1,4" had been ported as "addl $6, %3", which moved line2 by 6
 *	  bytes and never advanced line1; it is now "addl $4, %4"
 *	- the per-pixel store was a load from [edi] into mm0; it now stores mm0
 *	- "memory" was added to the clobber list: the asm writes the pixel buffer
 *	  and updates line1/line2/line3/bline, which the C tail reads afterwards
 */
void YUVi_to_BGRb_line_mmx2(int *line1,int *line2,int *line3,uint8 * bline,int len)
{
	assert(len > 1 );

	len --;

	cachetouch_r(line1,len>>3);
	cachetouch_r(line2,len>>3);
	cachetouch_r(line3,len>>3);
	cachetouch_w(bline,(len*3)>>5);
	
#ifdef WIN32
	__asm
	{
	
	mov ecx,len
	mov edi,bline

	movq mm3,Const_V_16
	movq mm4,Const_U_16

	YUVi_to_BGRb_line_mmx2_More:		

		mov			eax,line3			// V
		mov			eax,[eax]			// hard stall on eax, inevitable
		add			line3,4				// no stall on line3

		shl			eax,2
		
		mov			ebx,line2			// U

		sub			eax,510

		mov			ebx,[ebx]
		add			line2,4

		movd		mm0,eax		// mm0 = [0][x]
		
		shl			ebx,2

		punpckldq	mm0,mm0		// mm0 = [x][x]
		
		sub			ebx,510

		packssdw	mm0,mm0		// mm0 = [x][x][x][x]
		
		movd		mm1,ebx		// mm0 = [0][x]

		pmulhw		mm0,mm3
		mov			edx,line1			// Y
		
		punpckldq	mm1,mm1		// mm0 = [x][x]

		mov			edx,[edx]

		packssdw	mm1,mm1		// mm0 = [x][x][x][x]
				
		movd		mm2,edx		// mm0 = [0][x]

		add			line1,4

		punpckldq	mm2,mm2		// mm0 = [x][x]
		
		pmulhw		mm1,mm4

		packssdw	mm2,mm2		// mm0 = [x][x][x][x]

		paddsw		mm0,mm1

		paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

		packuswb	mm0,mm0		// hard stall on mm0, inevitable

		movd		[edi],mm0	// unaligned write! bad!
		add			edi,3		// no stall on edi

	dec ecx
	jnz YUVi_to_BGRb_line_mmx2_More

	mov bline,edi

	//emms
	}
#endif

#ifdef BUILD_BE
	__asm__ __volatile__ ( "
		
	movl %0, %%ecx		//;%0 = len   ;mov ecx,len
	movl %1, %%edi		//;%1 = bline ;mov edi,bline

	movq Const_V_16, %%mm3		//;movq mm3,Const_V_16
	movq Const_U_16, %%mm4		//;movq mm4,Const_U_16

	YUVi_to_BGRb_line_mmx2_More:		

		movl	%2,%%eax 			// ;%2 = line3  //mov			eax,line3			// V
		movl	(%%eax), %%eax		//;mov			eax,[eax]			// hard stall on eax, inevitable
		addl	$4, %2				//;add			line3,4				// no stall on line3

		shll	$2, %%eax			//;shl			eax,2
		
		movl	%3, %%ebx			//;3 = line2	mov			ebx,line2			// U

		subl	$510, %%eax			//;sub			eax,510

		movl (%%ebx), %%ebx			//;mov			ebx,[ebx]
		addl	$4, %3				//;add			line2,4

		movd %%eax, %%mm0			//;movd		mm0,eax		// mm0 = [0][x]
		
		shll $2, %%ebx				//;shl			ebx,2

		punpckldq	%%mm0, %%mm0	//;punpckldq	mm0,mm0		// mm0 = [x][x]
		
		subl $510,%%ebx				//;sub			ebx,510

		packssdw	%%mm0, %%mm0	//;packssdw	mm0,mm0		// mm0 = [x][x][x][x]
		
		movd %%ebx, %%mm1			//;movd		mm1,ebx		// mm0 = [0][x]

		pmulhw %%mm3, %%mm0			//;pmulhw		mm0,mm3
		movl %4, %%edx				// ;%4 = line 1mov			edx,line1			// Y
		
		punpckldq	%%mm1, %%mm1	//;punpckldq	mm1,mm1		// mm0 = [x][x]

		movl (%%edx), %%edx			//;mov			edx,[edx]

		packssdw	%%mm1, %%mm1	//;packssdw	mm1,mm1		// mm0 = [x][x][x][x]
				
		movd	%%edx, %%mm2		//;movd		mm2,edx		// mm0 = [0][x]

		addl $4, %4					//;add			line1,4

		punpckldq	%%mm2,%%mm2		//;punpckldq	mm2,mm2		// mm0 = [x][x]
		
		pmulhw %%mm4, %%mm1			//; pmulhw		mm1,mm4

		packssdw %%mm2, %%mm2		//; packssdw	mm2,mm2		// mm0 = [x][x][x][x]

		paddsw %%mm1, %%mm0			// paddsw		mm0,mm1

		paddsw %%mm2, %%mm0			// paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

		packuswb %%mm0, %%mm0		// packuswb	mm0,mm0		// hard stall on mm0, inevitable

		movd %%mm0, (%%edi)         // ;movd		[edi],mm0	// unaligned write! bad!
		addl $3, %%edi				// ;add			edi,3		// no stall on edi

	decl %%ecx		//;dec ecx
	jnz YUVi_to_BGRb_line_mmx2_More

	movl %%edi, %1			// ;mov bline,edi" 
	:	// outputs
	: "m" (len), "m" (bline), "m" (line3) , "m" (line2), "m" (line1)
	: "%ebx" , "%eax" , "%edi" , "%ecx" , "%edx" , "memory" );
#endif


	/* final pixel in plain C: exactly three bytes are written */
	{
	int y,u,v,r,g,b;	
	y = (*line1);
	u = (*line2) - 127;
	v = (*line3) - 127;

	r = R_YUV(y,u,v);
	g = G_YUV(y,u,v);
	b = B_YUV(y,u,v);

	r = minmax(r,0,255);
	g = minmax(g,0,255);
	b = minmax(b,0,255);

	bline[0] = b;
	bline[1] = g;
	bline[2] = r;
	}
}

/*
 * MMX version of the YUV -> 4-byte-per-pixel line converter.
 *
 *	line1,line2,line3 : int Y, U and V lines; len values are read from each
 *	bline             : output; 4 bytes are stored per pixel (movd, then edi += 4)
 *	len               : pixel count; must be > 0 (asserted), because the loop
 *	                    counter is only tested after the first pass
 *
 * Per pixel: (V<<2)-510 and (U<<2)-510 are replicated into four int16 lanes
 * and multiplied with pmulhw by Const_V_16 / Const_U_16, then added with
 * saturation to the replicated Y and packed to unsigned bytes with saturation.
 *
 * No emms is executed here (it is commented out).
 * The conversion only exists when WIN32 or BUILD_BE is defined.
 *
 * NOTE(review): the BUILD_BE asm increments its input operands (%2..%4) and
 * stores through edi, yet declares no outputs and no "memory" clobber --
 * confirm this is safe with the compiler in use.
 */
void YUVi_to_XRGB_line_mmx(int *line1,int *line2,int *line3,uint8 * bline,int len)
{
	assert(len > 0 );

	cachetouch_r(line1,len>>3);
	cachetouch_r(line2,len>>3);
	cachetouch_r(line3,len>>3);
	cachetouch_w(bline,len>>3);
		
#ifdef WIN32
	__asm
	{
	
	mov ecx,len
	mov edi,bline

	movq mm3,Const_V_16
	movq mm4,Const_U_16

	More:		

		mov			eax,line3			// V
		mov			eax,[eax]			// hard stall on eax, inevitable
		add			line3,4				// no stall on line3

		shl			eax,2
		
		mov			ebx,line2			// U

		sub			eax,510

		mov			ebx,[ebx]
		add			line2,4

		movd		mm0,eax		// mm0 = [0][x]
		
		shl			ebx,2

		punpckldq	mm0,mm0		// mm0 = [x][x]
		
		sub			ebx,510

		packssdw	mm0,mm0		// mm0 = [x][x][x][x]
		
		movd		mm1,ebx		// mm0 = [0][x]

		pmulhw		mm0,mm3
		mov			edx,line1			// Y
		
		punpckldq	mm1,mm1		// mm0 = [x][x]

		mov			edx,[edx]

		packssdw	mm1,mm1		// mm0 = [x][x][x][x]
				
		movd		mm2,edx		// mm0 = [0][x]

		add			line1,4

		punpckldq	mm2,mm2		// mm0 = [x][x]
		
		pmulhw		mm1,mm4

		packssdw	mm2,mm2		// mm0 = [x][x][x][x]

		paddsw		mm0,mm1

		paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

		packuswb	mm0,mm0		// hard stall on mm0, inevitable

		movd		[edi],mm0
		add			edi,4		// no stall on edi

	dec ecx
	jnz More

	//emms
	}
#endif

#ifdef BUILD_BE
	__asm__ __volatile__ ("
	
	movl %0, %%ecx	// %0 = len ;mov ecx,len
	movl %1, %%edi	// %1 = bline ; mov edi,bline

	movq Const_V_16, %%mm3	//;movq mm3,Const_V_16
	movq Const_U_16, %%mm4	//;movq mm4,Const_U_16

	YUVi_to_XRGB_line_mmx_More:		

		movl %2, %%eax		// %2 = line3	//mov			eax,line3			// V
		movl (%%eax), %%eax		//;mov			eax,[eax]			// hard stall on eax, inevitable
		addl $4, %2		//;add			line3,4				// no stall on line3

		shll $2, %%eax		//shl			eax,2
		
		movl %3, %%ebx	// %3 = line2	mov			ebx,line2			// U

		subl $510, %%eax	//;sub			eax,510

		movl (%%ebx), %%ebx		//;mov			ebx,[ebx]
		addl $4, %3				//;add			line2,4

		movd %%eax, %%mm0		//;movd		mm0,eax		// mm0 = [0][x]
		
		shll $2, %%ebx			//; shl			ebx,2

		punpckldq %%mm0, %%mm0	//;punpckldq	mm0,mm0		// mm0 = [x][x]
		
		subl $510, %%ebx		//;sub			ebx,510

		packssdw %%mm0, %%mm0	//;packssdw	mm0,mm0		// mm0 = [x][x][x][x]
		
		movd %%ebx, %%mm1		//;movd		mm1,ebx		// mm0 = [0][x]

		pmulhw %%mm3, %%mm0		//?reverse? pmulhw		mm0,mm3
		movl %4 , %%edx			// %4 = line1	mov			edx,line1			// Y
		
		punpckldq %%mm1, %%mm1	//; punpckldq	mm1,mm1		// mm0 = [x][x]

		movl (%%edx), %%edx		//; mov			edx,[edx]

		packssdw %%mm1, %%mm1	//;packssdw	mm1,mm1		// mm0 = [x][x][x][x]
				
		movd %%edx, %%mm2		//; movd		mm2,edx		// mm0 = [0][x]

		addl $4, %4				// add			line1,4

		punpckldq %%mm2, %%mm2	//; punpckldq	mm2,mm2		// mm0 = [x][x]
		
		pmulhw %%mm4, %%mm1		// ?reverse? 	pmulhw		mm1,mm4

		packssdw %%mm2, %%mm2	// packssdw	mm2,mm2		// mm0 = [x][x][x][x]

		paddsw %%mm1, %%mm0		// ?reverse? paddsw		mm0,mm1

		paddsw %%mm2, %%mm0		// ?reverse? paddsw		mm0,mm2		// hard stall on mm0, inevitable ; no stall on mm2

		packuswb %%mm0, %%mm0	// packuswb	mm0,mm0		// hard stall on mm0, inevitable

		movd %%mm0, (%%edi)		//;movd		[edi],mm0
		addl $4, %%edi			//;add			edi,4		// no stall on edi

	decl %%ecx		//dec ecx
	jnz YUVi_to_XRGB_line_mmx_More
	" 
	: // outputs
	: "g" (len), "g" (bline) , "g" (line3), "g" (line2) , "g" (line1)
	: "%ecx" , "%edi", "%eax", "%ebx", "%edx");
	
#endif // BUILD_BE

}

/*}{******* CPU setup ***********/

// Dispatch pointers for the blitters. Both start out NULL and are assigned by
// SetupYUV(), which picks either the plain C or the MMX implementation.
void (*YUVi_to_XRGB_line)(int *line1,int *line2,int *line3,uint8 * bline,int len) = NULL;
void (*YUVi_to_BGRb_lines)(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride) = NULL;

/* Query the CPU and point the YUVi_to_XRGB_line / YUVi_to_BGRb_lines
 * dispatch pointers at the plain C or the MMX implementations. */
void SetupYUV(void)
{
	jeCPU_GetInfo();

	if ( !(jeCPU_Features & JE_CPU_HAS_MMX) )
	{
		// no MMX: plain C fallbacks
		YUVi_to_XRGB_line  = YUVi_to_XRGB_line_c;
		YUVi_to_BGRb_lines = YUVi_to_BGRb_lines_c;
		return;
	}

	// Blit timings recorded on hare512.bmp :
	//	YUVi_to_BGRb_line_mmx1  : 0.025 seconds  = 47.2 clocks / pixel
	//	YUVi_to_BGRb_line_mmx2  : 0.025 seconds  = 47.2 clocks / pixel
	//	YUVi_to_BGRb_line_c     : 0.034 seconds  = 66.6 clocks / pixel
	//	YUVi_to_BGRb_lines_mmx  : 0.0245 seconds = 45.9 clocks / pixel
	YUVi_to_XRGB_line  = YUVi_to_XRGB_line_mmx;
	YUVi_to_BGRb_lines = YUVi_to_BGRb_lines_mmx;
}

/*}******* EOF ***********/
Summary ✨

This C code implements YUV <-> RGB conversion routines, including `YUVi_to_XRGB_line`, which converts a line of YUV (luminance/chrominance) samples to RGB pixels, with both plain C and CPU-specific MMX implementations. The function takes five parameters: three integer input lines (Y, U and V), an output byte line, and the number of pixels. It returns nothing; the converted pixels are written into the output line. The code also includes a setup function `SetupYUV` that selects the C or MMX implementations based on the CPU's features.