x86_floatmul_mmx.c | searchcode

/OpenAL-Sample/src/arch/i386/x86_floatmul_mmx.c

https://github.com/rpavlik/openal-svn-mirror · C · 98 lines · 60 code · 14 blank · 24 comment · 7 complexity · 466e9b41c247b86a039f27e468d4a509 MD5 · raw file

/***************************************************************************
 *   MMX routine                                                           *
 *   Copyright (C) 2005-2006 by Prakash Punnoor                            *
 *   prakash@punnoor.de                                                    *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Library General Public License as       *
 *   published by the Free Software Foundation; either version 2 of the    *
 *   License, or (at your option) any later version.                       *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU Library General Public     *
 *   License along with this program; if not, write to the                 *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
#include "al_siteconfig.h"

#include <AL/al.h>
#include "al_cpu_caps.h"
#include "x86_simd_support_prk.h"

/* MMX routine needs 16 */
#define SCALING_POWER  16
#define SCALING_FACTOR (1 << SCALING_POWER)

#define MIN_ENTER_SIMD_LEN 48


void _alFloatMul_MMX(ALshort *bpt, ALfloat sa, ALuint len) {
	ALint scaled_sa = sa * SCALING_FACTOR;
	ALint iter;
	
	if (len >= MIN_ENTER_SIMD_LEN) {
		v4hi v_sa;
		ALuint samples_main;
		ALuint samples_pre;
		ALuint samples_post;
		
		
		samples_pre = MMX_ALIGN - (aint)bpt % MMX_ALIGN;
		samples_pre /= sizeof(ALshort);
		samples_main = len - samples_pre;
		samples_post = samples_main % 8;
		samples_main = samples_main / 8;
		len = samples_post;
		
		while(samples_pre--) {
			iter = *bpt;
			iter *= scaled_sa;
			iter >>= SCALING_POWER;
			*bpt = iter;
			++bpt;
		}
		
		if (scaled_sa < (1 << 15)) {
			/* we do signed multiplication, so 1 << 15 is the max */
			v_sa = setw(scaled_sa);
			
			while (samples_main--) {
				*(v4hi*)bpt = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
				bpt += 4;
				*(v4hi*)bpt = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
				bpt += 4;
			}
		} else {
			/* we lose 1 bit here, but well... */
			v4hi temp;
			short sa2 = scaled_sa >> 1;
			v_sa = setw(sa2);
			
			while (samples_main--) {
				/* work-around gcc 3.3.x bug */
				const long num_shift = 1L;
				
				temp = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
				*(v4hi*)bpt = __builtin_ia32_psllw(temp, num_shift);
				bpt += 4;
				temp = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
				*(v4hi*)bpt = __builtin_ia32_psllw(temp, num_shift);
				bpt += 4;
			}
		}
		__builtin_ia32_emms();
	}

	while(len--) {
		iter = *bpt;
		iter *= scaled_sa;
		iter >>= SCALING_POWER;
		*bpt = iter;
		++bpt;
	}
}