
/xbmc/utils/fastmemcpy.c

http://github.com/xbmc/xbmc
/*
 * fastmemcpy.h : fast memcpy routines
 *****************************************************************************
 *      $Id: fastmemcpy.h 13905 2006-01-12 23:10:04Z dionoea $
 *
 *      Authors: various Linux kernel hackers
 *               various MPlayer hackers
 *               Nick Kurshev <nickols_k@mail.ru>
 *
 *      Copyright (C) 2011-2013 Team XBMC
 *      http://xbmc.org
 *
 *  This Program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This Program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with XBMC; see the file COPYING.  If not, see
 *  <http://www.gnu.org/licenses/>.
 *
 */
#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__arm__)
#define HAVE_MMX2
#define HAVE_SSE

/*
  aclib - advanced C library ;)
  This file contains functions which improve and expand the standard C library.
*/
#include <stddef.h>

#define BLOCK_SIZE 4096
#define CONFUSION_FACTOR 0
/* Feel free to fine-tune the two values above; some additional speedup may be possible. */

/*#define STATISTICS*/

#ifndef HAVE_SSE2
/*
   The P3 processor has only one SSE decoder, so it can execute only one SSE
   instruction per clock cycle, but it has three MMX decoders (including the
   load/store unit) and executes three MMX instructions per clock.
   The P4 processor stands some chance, but after reading:
   http://www.emulators.com/pentium4.htm
   I have doubts. In any case, an SSE2 version of this code could be written better.
*/
#undef HAVE_SSE
#endif


/*
 This part of the code was taken from Linux 2.4.3 and slightly modified for the
MMX, MMX2 and SSE instruction sets. I did this because Linux uses page-aligned
blocks, whereas MPlayer uses weakly-ordered data that the original sources
cannot speed up. Only using PREFETCHNTA and MOVNTQ together has an effect!

From IA-32 Intel Architecture Software Developer's Manual Volume 1,

Order Number 245470:
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
Non-temporal data is written to memory with write-combining semantics.

The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store unit when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
Here we should use PREFETCHNTA: it fetches non-temporal data into a location
close to the processor, minimizing cache pollution.

The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.

The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly-ordered data and procedures that consume that
data.

If you have questions, please contact me: Nick Kurshev <nickols_k@mail.ru>.
*/
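
/*
 * Illustrative sketch only (not part of the original source): the same
 * prefetch + non-temporal-store + sfence pattern described above, expressed
 * with SSE/SSE2 intrinsics instead of inline assembly. It assumes an
 * SSE2-capable compiler, a 16-byte-aligned destination and a length that is a
 * multiple of 16; the function name nt_copy_sketch is hypothetical and the
 * block is disabled so it does not affect this file.
 */
#if 0
#include <stddef.h>
#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_NTA, _mm_sfence */
#include <emmintrin.h>   /* _mm_loadu_si128, _mm_stream_si128 */

static void nt_copy_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = (unsigned char *)dst;
	const unsigned char *s = (const unsigned char *)src;
	size_t off;

	for (off = 0; off + 16 <= len; off += 16)
	{
		/* PREFETCHNTA: pull the source in ahead of time with minimal cache pollution. */
		_mm_prefetch((const char *)(s + off) + 320, _MM_HINT_NTA);
		/* MOVNTDQ: 16-byte store with a non-temporal (write-combining) hint. */
		_mm_stream_si128((__m128i *)(d + off),
		                 _mm_loadu_si128((const __m128i *)(s + off)));
	}
	/* SFENCE: make the weakly-ordered streaming stores globally visible. */
	_mm_sfence();
}
#endif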

/* 3dnow memcpy support from kernel 2.4.2 */
/*  by Pontscho/fresh!mindworkz           */

#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )

#undef HAVE_MMX1
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
/*  means: MMX v.1. Note: since we added alignment of the destination, this
    speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus
    the standard (non MMX-optimized) version.
    Note: on K6-2+ it speeds up memory copying by up to 25%, and
          on K7 and P3 by about 500% (5 times). */
#define HAVE_MMX1
#endif


#undef HAVE_K6_2PLUS
#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
#define HAVE_K6_2PLUS
#endif

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
	"rep; movsb"\
	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
/* It's the most portable way to notify the compiler */\
/* that edi, esi and ecx are clobbered in this asm block. */\
/* Thanks to A'rpi for the hint!!! */\
        :"0" (to), "1" (from),"2" (n)\
	: "memory");\
}
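/* Note: because the "=&D"/"=&S" outputs write the advanced registers back into
 * the macro arguments, the caller's 'to' and 'from' lvalues point just past the
 * copied bytes once small_memcpy() returns; fast_memcpy() below relies on this
 * after the destination-alignment copy. The count argument itself is left
 * untouched (the exhausted ECX lands in 'dummy'). */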

#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 64 /*8*/
#endif
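/* Note: MMX registers are only 8 bytes wide (the 8 hinted at in the comment
 * above); presumably 64 is used here so that the destination gets aligned to
 * the full 64-byte block handled by each iteration of the copy loops below. */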

/* Small defines (for readability only) ;) */
#ifdef HAVE_K6_2PLUS
#define PREFETCH "prefetch"
/* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define PREFETCH "prefetchnta"
#define EMMS     "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#ifdef HAVE_MMX1
#define MIN_LEN 0x800  /* 2K blocks */
#else
#define MIN_LEN 0x40  /* 64-byte blocks */
#endif
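/* Note: when fast_memcpy() below is called with len < MIN_LEN, the block loop
 * is skipped entirely and the whole copy is handled by the rep-movsb
 * small_memcpy() at the end of the function. */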

void * fast_memcpy(void * to, const void * from, size_t len)
{
	void *retval;
	size_t i;
	retval = to;
#ifdef STATISTICS
	{
		/* histogram of copy sizes; needs <stdio.h> for printf if enabled */
		static int freq[33];
		static int t=0;
		int i;
		for(i=0; len>(1<<i); i++);
		freq[i]++;
		t++;
		if(1024*1024*1024 % t == 0)
			for(i=0; i<32; i++)
				printf("freq < %8d %4d\n", 1<<i, freq[i]);
	}
#endif
#ifndef HAVE_MMX1
        /* PREFETCH has an effect even for the MOVSB instruction ;) */
	__asm__ __volatile__ (
	        PREFETCH" (%0)\n"
	        PREFETCH" 64(%0)\n"
	        PREFETCH" 128(%0)\n"
        	PREFETCH" 192(%0)\n"
        	PREFETCH" 256(%0)\n"
		: : "r" (from) );
#endif
        if(len >= MIN_LEN)
	{
	  register unsigned long int delta;
          /* Align the destination to an MMREG_SIZE boundary */
          delta = ((unsigned long int)to)&(MMREG_SIZE-1);
          if(delta)
	  {
	    delta=MMREG_SIZE-delta;
	    len -= delta;
	    small_memcpy(to, from, delta);
	  }
	  /* e.g. with MMREG_SIZE == 16 and to == 0x1003, delta is 16 - 3 = 13:
	     13 bytes are copied bytewise and 'to' lands on the 0x1010 boundary. */
	  i = len >> 6; /* number of 64-byte blocks; the remaining len&63 bytes
	                   are handled by the tail copy at the end */
	  len&=63;
        /*
           This algorithm is most effective when the code reads and writes
           cache-line-sized blocks back to back. The cache line size is
           processor-dependent, but it is at least 32 bytes on any processor.
           Ideally the number of read and write instructions would be a
           multiple of the number of the processor's decoders, but that is
           not always possible.
        */
#ifdef HAVE_SSE /* Only P3 (maybe Cyrix3) */
	if(((unsigned long)from) & 15)
	/* if SRC is misaligned */
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		PREFETCH" 320(%0)\n"
		"movups (%0), %%xmm0\n"
		"movups 16(%0), %%xmm1\n"
		"movups 32(%0), %%xmm2\n"
		"movups 48(%0), %%xmm3\n"
		"movntps %%xmm0, (%1)\n"
		"movntps %%xmm1, 16(%1)\n"
		"movntps %%xmm2, 32(%1)\n"
		"movntps %%xmm3, 48(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		/* a cast is not a valid lvalue, so advance the pointers via assignment */
		from = (const void *) (((const unsigned char *)from)+64);
		to = (void *) (((unsigned char *)to)+64);
	}
	else
	/*
	   Only if SRC is aligned on a 16-byte boundary.
	   This allows movaps to be used instead of movups; movaps requires the
	   data to be aligned, otherwise a general-protection exception (#GP)
	   is generated.
	*/
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		PREFETCH" 320(%0)\n"
		"movaps (%0), %%xmm0\n"
		"movaps 16(%0), %%xmm1\n"
		"movaps 32(%0), %%xmm2\n"
		"movaps 48(%0), %%xmm3\n"
		"movntps %%xmm0, (%1)\n"
		"movntps %%xmm1, 16(%1)\n"
		"movntps %%xmm2, 32(%1)\n"
		"movntps %%xmm3, 48(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		from = (const void *) (((const unsigned char *)from)+64);
		to = (void *) (((unsigned char *)to)+64);
	}
#else
	/* Align the destination to a BLOCK_SIZE boundary */
	for(; ((ptrdiff_t)to & (BLOCK_SIZE-1)) && i>0; i--)
	{
		__asm__ __volatile__ (
#ifndef HAVE_MMX1
        	PREFETCH" 320(%0)\n"
#endif
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"movq 32(%0), %%mm4\n"
		"movq 40(%0), %%mm5\n"
		"movq 48(%0), %%mm6\n"
		"movq 56(%0), %%mm7\n"
		MOVNTQ" %%mm0, (%1)\n"
		MOVNTQ" %%mm1, 8(%1)\n"
		MOVNTQ" %%mm2, 16(%1)\n"
		MOVNTQ" %%mm3, 24(%1)\n"
		MOVNTQ" %%mm4, 32(%1)\n"
		MOVNTQ" %%mm5, 40(%1)\n"
		MOVNTQ" %%mm6, 48(%1)\n"
		MOVNTQ" %%mm7, 56(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		from = (const void *) (((const unsigned char *)from)+64);
		to = (void *) (((unsigned char *)to)+64);
	}

/*	printf(" %p %p\n", (ptrdiff_t)from&1023, (ptrdiff_t)to&1023); */
	/* Pure assembly, because gcc is a bit unpredictable ;) */
# if 0
	if(i>=BLOCK_SIZE/64)
		asm volatile(
			"xorl %%eax, %%eax	\n\t"
			".balign 16		\n\t"
			"1:			\n\t"
				"movl (%0, %%eax), %%ebx 	\n\t"
				"movl 32(%0, %%eax), %%ebx 	\n\t"
				"movl 64(%0, %%eax), %%ebx 	\n\t"
				"movl 96(%0, %%eax), %%ebx 	\n\t"
				"addl $128, %%eax		\n\t"
				"cmpl %3, %%eax			\n\t"
				" jb 1b				\n\t"

			"xorl %%eax, %%eax	\n\t"

				".balign 16		\n\t"
				"2:			\n\t"
				"movq (%0, %%eax), %%mm0\n"
				"movq 8(%0, %%eax), %%mm1\n"
				"movq 16(%0, %%eax), %%mm2\n"
				"movq 24(%0, %%eax), %%mm3\n"
				"movq 32(%0, %%eax), %%mm4\n"
				"movq 40(%0, %%eax), %%mm5\n"
				"movq 48(%0, %%eax), %%mm6\n"
				"movq 56(%0, %%eax), %%mm7\n"
				MOVNTQ" %%mm0, (%1, %%eax)\n"
				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
				"addl $64, %%eax		\n\t"
				"cmpl %3, %%eax		\n\t"
				"jb 2b				\n\t"

#if CONFUSION_FACTOR > 0
	/* a few percent speedup on out-of-order executing CPUs */
			"movl %5, %%eax		\n\t"
				"2:			\n\t"
				"movl (%0), %%ebx	\n\t"
				"movl (%0), %%ebx	\n\t"
				"movl (%0), %%ebx	\n\t"
				"movl (%0), %%ebx	\n\t"
				"decl %%eax		\n\t"
				" jnz 2b		\n\t"
#endif

			"xorl %%eax, %%eax	\n\t"
			"addl %3, %0		\n\t"
			"addl %3, %1		\n\t"
			"subl %4, %2		\n\t"
			"cmpl %4, %2		\n\t"
			" jae 1b		\n\t"
				: "+r" (from), "+r" (to), "+r" (i)
				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
				: "%eax", "%ebx"
		);
#endif

	for(; i>0; i--)
	{
		__asm__ __volatile__ (
#ifndef HAVE_MMX1
        	PREFETCH" 320(%0)\n"
#endif
		"movq (%0), %%mm0\n"
		"movq 8(%0), %%mm1\n"
		"movq 16(%0), %%mm2\n"
		"movq 24(%0), %%mm3\n"
		"movq 32(%0), %%mm4\n"
		"movq 40(%0), %%mm5\n"
		"movq 48(%0), %%mm6\n"
		"movq 56(%0), %%mm7\n"
		MOVNTQ" %%mm0, (%1)\n"
		MOVNTQ" %%mm1, 8(%1)\n"
		MOVNTQ" %%mm2, 16(%1)\n"
		MOVNTQ" %%mm3, 24(%1)\n"
		MOVNTQ" %%mm4, 32(%1)\n"
		MOVNTQ" %%mm5, 40(%1)\n"
		MOVNTQ" %%mm6, 48(%1)\n"
		MOVNTQ" %%mm7, 56(%1)\n"
		:: "r" (from), "r" (to) : "memory");
		from = (const void *) (((const unsigned char *)from)+64);
		to = (void *) (((unsigned char *)to)+64);
	}

#endif /* HAVE_SSE */
#ifdef HAVE_MMX2
                /* since movntq stores are weakly ordered, an "sfence"
		 * is needed to make them globally ordered again. */
		__asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
		/* emms re-enables use of the x87 FPU (the MMX registers alias
		 * the FPU register stack) */
		__asm__ __volatile__ (EMMS:::"memory");
#endif
	}
	/*
	 *	Now copy the tail of the block
	 */
	if(len) small_memcpy(to, from, len);
	return retval;
}


#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */

#endif /* !TARGET_WINDOWS && !__ppc__ && !__powerpc__ && !__arm__ */
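
/*
 * Illustrative usage sketch only (not part of the original source):
 * fast_memcpy() follows the memcpy() contract for non-overlapping buffers, so
 * a caller simply passes destination, source and a byte count. The buffer
 * names and size below are hypothetical, and the block is disabled so it does
 * not affect this file; a prototype for fast_memcpy() is assumed to be in scope.
 */
#if 0
#include <stdlib.h>

static void copy_frame_sketch(void)
{
	enum { FRAME_BYTES = 1280 * 720 * 2 };   /* hypothetical frame size */
	unsigned char *src = calloc(1, FRAME_BYTES);
	unsigned char *dst = malloc(FRAME_BYTES);

	if (src && dst)
		fast_memcpy(dst, src, FRAME_BYTES);  /* same argument order as memcpy */

	free(src);
	free(dst);
}
#endif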