/OpenAL-Sample/src/arch/i386/x86_floatmul_sse2.c

https://github.com/rpavlik/openal-svn-mirror · C · 96 lines · 59 code · 13 blank · 24 comment · 7 complexity · 22933b0f232dadadbf166caefdf14d8e MD5 · raw file

  1. /***************************************************************************
  2. * SSE2 routine *
  3. * Copyright (C) 2006 by Prakash Punnoor *
  4. * prakash@punnoor.de *
  5. * *
  6. * This program is free software; you can redistribute it and/or modify *
  7. * it under the terms of the GNU Library General Public License as *
  8. * published by the Free Software Foundation; either version 2 of the *
  9. * License, or (at your option) any later version. *
  10. * *
  11. * This program is distributed in the hope that it will be useful, *
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  14. * GNU General Public License for more details. *
  15. * *
  16. * You should have received a copy of the GNU Library General Public *
  17. * License along with this program; if not, write to the *
  18. * Free Software Foundation, Inc., *
  19. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
  20. ***************************************************************************/
  21. #include "al_siteconfig.h"
  22. #include <AL/al.h>
  23. #include "al_cpu_caps.h"
  24. #include "x86_simd_support_prk.h"
  25. /* The SSE2 routine requires SCALING_POWER to be 16. */
  26. #define SCALING_POWER 16 /* number of fractional bits in the fixed-point gain */
  27. #define SCALING_FACTOR (1 << SCALING_POWER) /* multiplier that turns the float gain into fixed point */
  28. #define MIN_ENTER_SIMD_LEN 48 /* buffers shorter than this are scaled by the scalar loop only */
  29. void _alFloatMul_SSE2(ALshort *bpt, ALfloat sa, ALuint len) {
  30. ALint scaled_sa = sa * SCALING_FACTOR;
  31. ALint iter;
  32. if (len >= MIN_ENTER_SIMD_LEN) {
  33. v8hi v_sa;
  34. ALuint samples_main;
  35. ALuint samples_pre;
  36. ALuint samples_post;
  37. samples_pre = SSE2_ALIGN - (aint)bpt % SSE2_ALIGN;
  38. samples_pre /= sizeof(ALshort);
  39. samples_main = len - samples_pre;
  40. samples_post = samples_main % 16;
  41. samples_main = samples_main / 16;
  42. len = samples_post;
  43. while(samples_pre--) {
  44. iter = *bpt;
  45. iter *= scaled_sa;
  46. iter >>= SCALING_POWER;
  47. *bpt = iter;
  48. ++bpt;
  49. }
  50. if (scaled_sa < (1 << 15)) {
  51. /* we do signed multiplication, so 1 << 15 is the max */
  52. v_sa = setw128(scaled_sa);
  53. while (samples_main--) {
  54. *(v8hi*)bpt = __builtin_ia32_pmulhw128(*(v8hi*)bpt, v_sa);
  55. bpt += 8;
  56. *(v8hi*)bpt = __builtin_ia32_pmulhw128(*(v8hi*)bpt, v_sa);
  57. bpt += 8;
  58. }
  59. } else {
  60. /* we lose 1 bit here, but well... */
  61. v8hi temp;
  62. short sa2 = scaled_sa >> 1;
  63. v_sa = setw128(sa2);
  64. while (samples_main--) {
  65. /* work-around gcc 3.3.x bug */
  66. const long num_shift = 1L;
  67. temp = __builtin_ia32_pmulhw128(*(v8hi*)bpt, v_sa);
  68. *(v8hi*)bpt = __builtin_ia32_psllwi128(temp, num_shift);
  69. bpt += 8;
  70. temp = __builtin_ia32_pmulhw128(*(v8hi*)bpt, v_sa);
  71. *(v8hi*)bpt = __builtin_ia32_psllwi128(temp, num_shift);
  72. bpt += 8;
  73. }
  74. }
  75. }
  76. while(len--) {
  77. iter = *bpt;
  78. iter *= scaled_sa;
  79. iter >>= SCALING_POWER;
  80. *bpt = iter;
  81. ++bpt;
  82. }
  83. }