/OpenAL-Sample/src/arch/i386/x86_floatmul_mmx.c

https://github.com/rpavlik/openal-svn-mirror · C · 98 lines · 60 code · 14 blank · 24 comment · 7 complexity · 466e9b41c247b86a039f27e468d4a509 MD5 · raw file

  1. /***************************************************************************
  2. * MMX routine *
  3. * Copyright (C) 2005-2006 by Prakash Punnoor *
  4. * prakash@punnoor.de *
  5. * *
  6. * This program is free software; you can redistribute it and/or modify *
  7. * it under the terms of the GNU Library General Public License as *
  8. * published by the Free Software Foundation; either version 2 of the *
  9. * License, or (at your option) any later version. *
  10. * *
  11. * This program is distributed in the hope that it will be useful, *
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  14. * GNU General Public License for more details. *
  15. * *
  16. * You should have received a copy of the GNU Library General Public *
  17. * License along with this program; if not, write to the *
  18. * Free Software Foundation, Inc., *
  19. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
  20. ***************************************************************************/
  21. #include "al_siteconfig.h"
  22. #include <AL/al.h>
  23. #include "al_cpu_caps.h"
  24. #include "x86_simd_support_prk.h"
  25. /* MMX routine needs 16: the gain is converted to fixed point with
   * SCALING_POWER fractional bits, and the vector path below uses pmulhw,
   * which yields the upper 16 bits of each 16x16-bit signed product. */
  26. #define SCALING_POWER 16
  /* Fixed-point representation of a gain of 1.0. */
  27. #define SCALING_FACTOR (1 << SCALING_POWER)
  /* Buffers with fewer samples than this are processed by the scalar loop only. */
  28. #define MIN_ENTER_SIMD_LEN 48
  29. void _alFloatMul_MMX(ALshort *bpt, ALfloat sa, ALuint len) {
  30. ALint scaled_sa = sa * SCALING_FACTOR;
  31. ALint iter;
  32. if (len >= MIN_ENTER_SIMD_LEN) {
  33. v4hi v_sa;
  34. ALuint samples_main;
  35. ALuint samples_pre;
  36. ALuint samples_post;
  37. samples_pre = MMX_ALIGN - (aint)bpt % MMX_ALIGN;
  38. samples_pre /= sizeof(ALshort);
  39. samples_main = len - samples_pre;
  40. samples_post = samples_main % 8;
  41. samples_main = samples_main / 8;
  42. len = samples_post;
  43. while(samples_pre--) {
  44. iter = *bpt;
  45. iter *= scaled_sa;
  46. iter >>= SCALING_POWER;
  47. *bpt = iter;
  48. ++bpt;
  49. }
  50. if (scaled_sa < (1 << 15)) {
  51. /* we do signed multiplication, so 1 << 15 is the max */
  52. v_sa = setw(scaled_sa);
  53. while (samples_main--) {
  54. *(v4hi*)bpt = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
  55. bpt += 4;
  56. *(v4hi*)bpt = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
  57. bpt += 4;
  58. }
  59. } else {
  60. /* we lose 1 bit here, but well... */
  61. v4hi temp;
  62. short sa2 = scaled_sa >> 1;
  63. v_sa = setw(sa2);
  64. while (samples_main--) {
  65. /* work-around gcc 3.3.x bug */
  66. const long num_shift = 1L;
  67. temp = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
  68. *(v4hi*)bpt = __builtin_ia32_psllw(temp, num_shift);
  69. bpt += 4;
  70. temp = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa);
  71. *(v4hi*)bpt = __builtin_ia32_psllw(temp, num_shift);
  72. bpt += 4;
  73. }
  74. }
  75. __builtin_ia32_emms();
  76. }
  77. while(len--) {
  78. iter = *bpt;
  79. iter *= scaled_sa;
  80. iter >>= SCALING_POWER;
  81. *bpt = iter;
  82. ++bpt;
  83. }
  84. }