/*
 * src/dsp/convolve_sse.c
 * Origin: https://github.com/ttsou/openphy (C, 705 lines)
 */

  1. /*
  2. * Intel SSE Complex Convolution
  3. *
  4. * Copyright (C) 2015 Ettus Research LLC
  5. * Author Tom Tsou <tom.tsou@ettus.com>
  6. *
  7. * This program is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * This program is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #include <string.h>
  21. #include <assert.h>
  22. #include <xmmintrin.h>
  23. #include <pmmintrin.h>
  24. #include <immintrin.h>
  25. #include <complex.h>
  26. #include <stdio.h>
  27. #include "convolve.h"
  28. #include "mac.h"
  29. #include "sigvec_internal.h"
  30. #ifndef _MM_SHUFFLE
  31. #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
  32. (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
  33. #endif
  34. //#define HAVE_FMA
  35. /* Generic non-optimized complex-real convolution */
  36. static void conv_real_generic(const float *x,
  37. const float *h,
  38. float *y,
  39. int h_len, int len)
  40. {
  41. int i;
  42. memset(y, 0, len * sizeof(float complex));
  43. for (i = 0; i < len; i++)
  44. mac_real_vec_n(&x[2 * i], h, &y[2 * i], h_len);
  45. }
  46. /* 4-tap SSE3 complex-real convolution */
  47. static void conv_real_sse4(const float *x,
  48. const float *h,
  49. float *y, int in_len)
  50. {
  51. int i;
  52. __m128 m0, m1, m2, m3, m4, m5, m6, m7;
  53. /* Load (aligned) filter taps */
  54. m0 = _mm_load_ps(&h[0]);
  55. m1 = _mm_load_ps(&h[4]);
  56. m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
  57. for (i = 0; i < in_len; i++) {
  58. /* Load (unaligned) input data */
  59. m0 = _mm_loadu_ps(&x[2 * i + 0]);
  60. m1 = _mm_loadu_ps(&x[2 * i + 4]);
  61. m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
  62. m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
  63. /* Quad multiply */
  64. m4 = _mm_mul_ps(m2, m7);
  65. m5 = _mm_mul_ps(m3, m7);
  66. /* Sum and store */
  67. m6 = _mm_hadd_ps(m4, m5);
  68. m0 = _mm_hadd_ps(m6, m6);
  69. _mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m0));
  70. }
  71. }
/* 8-tap SSE3 complex-real convolution
 *
 * x - interleaved complex input; reads 16 floats per output sample
 * h - interleaved complex taps (16-byte aligned); only real parts are used
 * y - interleaved complex output
 * in_len - number of complex output samples to compute
 */
static void conv_real_sse8(const float *x,
			   const float *h,
			   float *y,
			   int in_len)
{
	int i;
	__m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
#ifdef HAVE_FMA
	__m128 m10, m11;
#endif
	/* Load (aligned) filter taps; m4/m5 hold the packed real parts */
	m0 = _mm_load_ps(&h[0]);
	m1 = _mm_load_ps(&h[4]);
	m2 = _mm_load_ps(&h[8]);
	m3 = _mm_load_ps(&h[12]);
	m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
	m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));

	for (i = 0; i < in_len; i++) {
		/* Load (unaligned) input data; even lanes -> real (m6, m8),
		 * odd lanes -> imaginary (m7, m9) */
		m0 = _mm_loadu_ps(&x[2 * i + 0]);
		m1 = _mm_loadu_ps(&x[2 * i + 4]);
		m2 = _mm_loadu_ps(&x[2 * i + 8]);
		m3 = _mm_loadu_ps(&x[2 * i + 12]);
		m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
		m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
		m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
		m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
#ifdef HAVE_FMA
		/* Fused multiply-accumulate: m10 gathers the real products,
		 * m11 the imaginary products */
		m10 = _mm_setzero_ps();
		m11 = _mm_setzero_ps();
		m10 = _mm_fmadd_ps(m6, m4, m10);
		m10 = _mm_fmadd_ps(m8, m5, m10);
		m11 = _mm_fmadd_ps(m9, m5, m11);
		m11 = _mm_fmadd_ps(m7, m4, m11);
		m8 = _mm_hadd_ps(m10, m11);
		m9 = _mm_hadd_ps(m8, m8);
#else
		/* Quad multiply */
		m0 = _mm_mul_ps(m6, m4);
		m1 = _mm_mul_ps(m7, m4);
		m2 = _mm_mul_ps(m8, m5);
		m3 = _mm_mul_ps(m9, m5);

		/* Sum and store; after the double hadd the low two lanes
		 * of m9 hold {re, im} */
		m6 = _mm_add_ps(m0, m2);
		m7 = _mm_add_ps(m1, m3);
		m8 = _mm_hadd_ps(m6, m7);
		m9 = _mm_hadd_ps(m8, m8);
#endif
		_mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m9));
	}
}
/* 12-tap SSE3 complex-real convolution
 *
 * x - interleaved complex input; reads 24 floats per output sample
 * h - interleaved complex taps (16-byte aligned); only real parts are used
 * y - interleaved complex output
 * in_len - number of complex output samples to compute
 */
static void conv_real_sse12(const float *x,
			    const float *h,
			    float *y, int in_len)
{
	int i;
	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
	__m128 m8, m9, m10, m11, m12, m13, m14;

	/* Load (aligned) filter taps; m12-m14 hold the packed real parts */
	m0 = _mm_load_ps(&h[0]);
	m1 = _mm_load_ps(&h[4]);
	m2 = _mm_load_ps(&h[8]);
	m3 = _mm_load_ps(&h[12]);
	m4 = _mm_load_ps(&h[16]);
	m5 = _mm_load_ps(&h[20]);
	m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
	m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
	m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(2, 0, 2, 0));

	for (i = 0; i < in_len; i++) {
		/* Load (unaligned) input data; shuffles split even lanes
		 * (real) from odd lanes (imaginary) */
		m0 = _mm_loadu_ps(&x[2 * i + 0]);
		m1 = _mm_loadu_ps(&x[2 * i + 4]);
		m2 = _mm_loadu_ps(&x[2 * i + 8]);
		m3 = _mm_loadu_ps(&x[2 * i + 12]);
		m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
		m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
		m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
		m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
		m0 = _mm_loadu_ps(&x[2 * i + 16]);
		m1 = _mm_loadu_ps(&x[2 * i + 20]);
		m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
		m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));

		/* Quad multiply */
		m0 = _mm_mul_ps(m4, m12);
		m1 = _mm_mul_ps(m5, m12);
		m2 = _mm_mul_ps(m6, m13);
		m3 = _mm_mul_ps(m7, m13);
		m4 = _mm_mul_ps(m8, m14);
		m5 = _mm_mul_ps(m9, m14);

		/* Sum and store */
		m8 = _mm_add_ps(m0, m2);
		m9 = _mm_add_ps(m1, m3);
		m10 = _mm_add_ps(m8, m4);
		m11 = _mm_add_ps(m9, m5);
		m2 = _mm_hadd_ps(m10, m11);
		m3 = _mm_hadd_ps(m2, m2);

		/* Store real then imaginary via two scalar stores (the other
		 * kernels use _mm_store_sd; this is equivalent) */
		_mm_store_ss(&y[2 * i + 0], m3);
		m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
		_mm_store_ss(&y[2 * i + 1], m3);
	}
}
  175. /* 16-tap SSE3 complex-real convolution */
  176. static void conv_real_sse16(const float *x,
  177. const float *h,
  178. float *y, int in_len)
  179. {
  180. int i;
  181. __m128 m0, m1, m2, m3, m4, m5, m6, m7;
  182. __m128 m8, m9, m10, m11, m12, m13, m14, m15;
  183. /* Load (aligned) filter taps */
  184. m0 = _mm_load_ps(&h[0]);
  185. m1 = _mm_load_ps(&h[4]);
  186. m2 = _mm_load_ps(&h[8]);
  187. m3 = _mm_load_ps(&h[12]);
  188. m4 = _mm_load_ps(&h[16]);
  189. m5 = _mm_load_ps(&h[20]);
  190. m6 = _mm_load_ps(&h[24]);
  191. m7 = _mm_load_ps(&h[28]);
  192. m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
  193. m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
  194. m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(2, 0, 2, 0));
  195. m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(2, 0, 2, 0));
  196. for (i = 0; i < in_len; i++) {
  197. m10 = _mm_setzero_ps();
  198. m11 = _mm_setzero_ps();
  199. /* Load (unaligned) input data */
  200. m0 = _mm_loadu_ps(&x[2 * i + 0]);
  201. m1 = _mm_loadu_ps(&x[2 * i + 4]);
  202. m2 = _mm_loadu_ps(&x[2 * i + 8]);
  203. m3 = _mm_loadu_ps(&x[2 * i + 12]);
  204. m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
  205. m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
  206. m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
  207. m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
  208. m0 = _mm_loadu_ps(&x[2 * i + 16]);
  209. m1 = _mm_loadu_ps(&x[2 * i + 20]);
  210. m2 = _mm_loadu_ps(&x[2 * i + 24]);
  211. m3 = _mm_loadu_ps(&x[2 * i + 28]);
  212. m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
  213. m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
  214. m0 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
  215. m1 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
  216. #ifdef HAVE_FMA
  217. m10 = _mm_fmadd_ps(m4, m12, m10);
  218. m10 = _mm_fmadd_ps(m6, m13, m10);
  219. m10 = _mm_fmadd_ps(m8, m14, m10);
  220. m10 = _mm_fmadd_ps(m0, m15, m10);
  221. m11 = _mm_fmadd_ps(m5, m12, m11);
  222. m11 = _mm_fmadd_ps(m7, m13, m11);
  223. m11 = _mm_fmadd_ps(m9, m14, m11);
  224. m11 = _mm_fmadd_ps(m1, m15, m11);
  225. m2 = _mm_hadd_ps(m10, m11);
  226. m3 = _mm_hadd_ps(m2, m2);
  227. _mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m3));
  228. #else
  229. /* Quad multiply */
  230. m0 = _mm_mul_ps(m4, m12);
  231. m1 = _mm_mul_ps(m5, m12);
  232. m2 = _mm_mul_ps(m6, m13);
  233. m3 = _mm_mul_ps(m7, m13);
  234. m4 = _mm_mul_ps(m8, m14);
  235. m5 = _mm_mul_ps(m9, m14);
  236. m6 = _mm_mul_ps(m10, m15);
  237. m7 = _mm_mul_ps(m11, m15);
  238. /* Sum and store */
  239. m8 = _mm_add_ps(m0, m2);
  240. m9 = _mm_add_ps(m1, m3);
  241. m10 = _mm_add_ps(m4, m6);
  242. m11 = _mm_add_ps(m5, m7);
  243. m0 = _mm_add_ps(m8, m10);
  244. m1 = _mm_add_ps(m9, m11);
  245. m2 = _mm_hadd_ps(m0, m1);
  246. m3 = _mm_hadd_ps(m2, m2);
  247. #endif
  248. _mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m3));
  249. }
  250. }
/* 20-tap SSE3 complex-real convolution
 *
 * x - interleaved complex input; reads 40 floats per output sample
 * h - interleaved complex taps (16-byte aligned); only real parts are used
 * y - interleaved complex output
 * in_len - number of complex output samples to compute
 */
static void conv_real_sse20(const float *x,
			    const float *h,
			    float *y, int in_len)
{
	int i;
	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
	__m128 m8, m9, m11, m12, m13, m14, m15;

	/* Load (aligned) filter taps; m11-m15 hold the packed real parts */
	m0 = _mm_load_ps(&h[0]);
	m1 = _mm_load_ps(&h[4]);
	m2 = _mm_load_ps(&h[8]);
	m3 = _mm_load_ps(&h[12]);
	m4 = _mm_load_ps(&h[16]);
	m5 = _mm_load_ps(&h[20]);
	m6 = _mm_load_ps(&h[24]);
	m7 = _mm_load_ps(&h[28]);
	m8 = _mm_load_ps(&h[32]);
	m9 = _mm_load_ps(&h[36]);
	m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
	m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
	m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(2, 0, 2, 0));
	m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(2, 0, 2, 0));
	m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(2, 0, 2, 0));

	for (i = 0; i < in_len; i++) {
		/* Multiply-accumulate first 12 taps; partial sums end up
		 * in m8 (real) and m9 (imaginary) */
		m0 = _mm_loadu_ps(&x[2 * i + 0]);
		m1 = _mm_loadu_ps(&x[2 * i + 4]);
		m2 = _mm_loadu_ps(&x[2 * i + 8]);
		m3 = _mm_loadu_ps(&x[2 * i + 12]);
		m4 = _mm_loadu_ps(&x[2 * i + 16]);
		m5 = _mm_loadu_ps(&x[2 * i + 20]);
		m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
		m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
		m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
		m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
		m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(2, 0, 2, 0));
		m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(3, 1, 3, 1));
		m2 = _mm_mul_ps(m6, m11);
		m3 = _mm_mul_ps(m7, m11);
		m4 = _mm_mul_ps(m8, m12);
		m5 = _mm_mul_ps(m9, m12);
		m6 = _mm_mul_ps(m0, m13);
		m7 = _mm_mul_ps(m1, m13);
		m0 = _mm_add_ps(m2, m4);
		m1 = _mm_add_ps(m3, m5);
		m8 = _mm_add_ps(m0, m6);
		m9 = _mm_add_ps(m1, m7);

		/* Multiply-accumulate last 8 taps */
		m0 = _mm_loadu_ps(&x[2 * i + 24]);
		m1 = _mm_loadu_ps(&x[2 * i + 28]);
		m2 = _mm_loadu_ps(&x[2 * i + 32]);
		m3 = _mm_loadu_ps(&x[2 * i + 36]);
		m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
		m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
		m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
		m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
		m0 = _mm_mul_ps(m4, m14);
		m1 = _mm_mul_ps(m5, m14);
		m2 = _mm_mul_ps(m6, m15);
		m3 = _mm_mul_ps(m7, m15);
		m4 = _mm_add_ps(m0, m2);
		m5 = _mm_add_ps(m1, m3);

		/* Final sum and store; hadds leave {re, im} in low lanes */
		m0 = _mm_add_ps(m8, m4);
		m1 = _mm_add_ps(m9, m5);
		m2 = _mm_hadd_ps(m0, m1);
		m3 = _mm_hadd_ps(m2, m2);
		_mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m3));
	}
}
  322. #ifdef HAVE_FMA
/* 8N-tap FMA complex-real convolution (original comment wrongly said
 * "20-tap"); h_len must be a multiple of 8
 *
 * x - interleaved complex input
 * h - interleaved complex taps (16-byte aligned); only real parts are used
 * y - interleaved complex output
 * h_len - number of complex taps (multiple of 8)
 * len - number of complex output samples to compute
 */
static void conv_real_fma_8n(const float *x,
			     const float *h,
			     float *y,
			     int h_len, int len)
{
	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
	__m128 m8, m9, m10, m11;

	for (int i = 0; i < len; i++) {
		/* Zero the real (m10) and imaginary (m11) accumulators */
		m10 = _mm_setzero_ps();
		m11 = _mm_setzero_ps();

		/* Process 8 complex taps per iteration */
		for (int n = 0; n < h_len / 8; n++) {
			/* Load (aligned) filter taps; m4/m5 pack real parts */
			m0 = _mm_load_ps(&h[16 * n + 0]);
			m1 = _mm_load_ps(&h[16 * n + 4]);
			m2 = _mm_load_ps(&h[16 * n + 8]);
			m3 = _mm_load_ps(&h[16 * n + 12]);
			m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
			m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));

			/* Load (unaligned) input data and deinterleave */
			m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
			m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
			m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
			m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
			m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
			m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
			m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
			m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
			m10 = _mm_fmadd_ps(m6, m4, m10);
			m10 = _mm_fmadd_ps(m8, m5, m10);
			m11 = _mm_fmadd_ps(m7, m4, m11);
			m11 = _mm_fmadd_ps(m9, m5, m11);
		}
		/* Horizontal sums leave {re, im} in the low two lanes */
		m0 = _mm_hadd_ps(m10, m11);
		m0 = _mm_hadd_ps(m0, m0);
		_mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m0));
	}
}
  362. #else
/* 8N-tap SSE3 complex-real convolution; h_len must be a multiple of 8
 *
 * x - interleaved complex input
 * h - interleaved complex taps (16-byte aligned); only real parts are used
 * y - interleaved complex output
 * h_len - number of complex taps (multiple of 8)
 * len - number of complex output samples to compute
 */
static void sse_conv_real8n(const float *x, const float *h,
			    float *y, int h_len, int len)
{
	__m128 m0, m1, m2, m3, m4, m5, m6, m7;
	__m128 m8, m9, m10, m11, m12, m13, m14, m15;

	for (int i = 0; i < len; i++) {
		/* m14 accumulates real products, m15 imaginary products */
		m14 = _mm_setzero_ps();
		m15 = _mm_setzero_ps();

		/* Process 8 complex taps per iteration */
		for (int n = 0; n < h_len / 8; n++) {
			/* Load (aligned) filter taps; m4/m5 pack real parts */
			m0 = _mm_load_ps(&h[16 * n + 0]);
			m1 = _mm_load_ps(&h[16 * n + 4]);
			m2 = _mm_load_ps(&h[16 * n + 8]);
			m3 = _mm_load_ps(&h[16 * n + 12]);
			m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
			m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));

			/* Load (unaligned) input data and deinterleave */
			m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
			m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
			m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
			m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
			m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
			m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
			m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
			m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));

			/* Quad multiply */
			m10 = _mm_mul_ps(m6, m4);
			m11 = _mm_mul_ps(m8, m5);
			m12 = _mm_mul_ps(m7, m4);
			m13 = _mm_mul_ps(m9, m5);

			/* Accumulate */
			m10 = _mm_add_ps(m10, m11);
			m12 = _mm_add_ps(m12, m13);
			m14 = _mm_add_ps(m14, m10);
			m15 = _mm_add_ps(m15, m12);
		}
		/* Horizontal sums leave {re, im} in the low two lanes */
		m0 = _mm_hadd_ps(m14, m15);
		m0 = _mm_hadd_ps(m0, m0);
		_mm_store_sd((double *) &y[2 * i], _mm_castps_pd(m0));
	}
}
  405. #endif
  406. static int check_params(const struct cxvec *in,
  407. const struct cxvec *h,
  408. const struct cxvec *out)
  409. {
  410. if (in->len < out->len) {
  411. fprintf(stderr, "convolve: Invalid vector length\n");
  412. return -1;
  413. }
  414. if (in->flags & CXVEC_FLG_REAL_ONLY) {
  415. fprintf(stderr, "convolve: Input data must be complex\n");
  416. return -1;
  417. }
  418. if (!(h->flags & (CXVEC_FLG_REAL_ONLY | CXVEC_FLG_MEM_ALIGN))) {
  419. fprintf(stderr, "convolve: Taps must be real and aligned\n");
  420. return -1;
  421. }
  422. return 0;
  423. }
  424. /*! \brief Convolve two complex vectors
  425. * \param[in] in_vec Complex input signal
  426. * \param[in] h_vec Complex filter taps (stored in reverse order)
  427. * \param[out] out_vec Complex vector output
  428. *
  429. * Modified convole with filter length dependent SSE optimization.
  430. * See generic convole call for additional information.
  431. */
  432. int cxvec_convolve(const struct cxvec *in_vec,
  433. const struct cxvec *h_vec,
  434. struct cxvec *out_vec)
  435. {
  436. int i;
  437. const float complex *x, *h;
  438. float complex *y;
  439. void (*conv_func)
  440. (const float *, const float *, float *, int) = NULL;
  441. void (*conv_func_n)
  442. (const float *, const float *, float *, int, int) = NULL;
  443. if (check_params(in_vec, h_vec, out_vec) < 0)
  444. return -1;
  445. switch (h_vec->len) {
  446. case 4:
  447. conv_func = conv_real_sse4;
  448. break;
  449. case 8:
  450. conv_func = conv_real_sse8;
  451. break;
  452. case 12:
  453. conv_func = conv_real_sse12;
  454. break;
  455. case 16:
  456. conv_func = conv_real_sse16;
  457. break;
  458. case 20:
  459. conv_func = conv_real_sse20;
  460. break;
  461. default:
  462. #ifdef HAVE_FMA
  463. if (!(h_vec->len % 8))
  464. conv_func_n = conv_real_fma_8n;
  465. #else
  466. if (!(h_vec->len % 8))
  467. conv_func_n = sse_conv_real8n;
  468. #endif
  469. }
  470. x = in_vec->data; /* input */
  471. h = h_vec->data; /* taps */
  472. y = out_vec->data; /* output */
  473. if (conv_func) {
  474. conv_func((const float *) &x[-(h_vec->len - 1)],
  475. (const float *) h,
  476. (float *) y, out_vec->len);
  477. } else if (conv_func_n) {
  478. conv_func_n((const float *) &x[-(h_vec->len - 1)],
  479. (const float *) h,
  480. (float *) y,
  481. h_vec->len, out_vec->len);
  482. } else {
  483. conv_real_generic((const float *) &x[-(h_vec->len - 1)],
  484. (const float *) h,
  485. (float *) y,
  486. h_vec->len, out_vec->len);
  487. }
  488. return i;
  489. }
  490. int cxvec_convolve_nodelay(const struct cxvec *in_vec,
  491. const struct cxvec *h_vec,
  492. struct cxvec *out_vec)
  493. {
  494. int i;
  495. float complex *x, *h, *y;
  496. void (*conv_func)(const float *, const float *, float *, int) = NULL;
  497. void (*conv_func2)(const float *, const float *,
  498. float *, int, int) = NULL;
  499. if (check_params(in_vec, h_vec, out_vec) < 0)
  500. return -1;
  501. switch (h_vec->len) {
  502. case 4:
  503. conv_func = conv_real_sse4;
  504. break;
  505. case 8:
  506. conv_func = conv_real_sse8;
  507. break;
  508. case 12:
  509. conv_func = conv_real_sse12;
  510. break;
  511. case 16:
  512. conv_func = conv_real_sse16;
  513. break;
  514. case 20:
  515. conv_func = conv_real_sse20;
  516. break;
  517. default:
  518. #ifdef HAVE_FMA
  519. if (!(h_vec->len % 8))
  520. conv_func2 = conv_real_fma_8n;
  521. #else
  522. if (!(h_vec->len % 8))
  523. conv_func2 = sse_conv_real8n;
  524. #endif
  525. }
  526. x = in_vec->data; /* input */
  527. h = h_vec->data; /* taps */
  528. y = out_vec->data; /* output */
  529. if (conv_func2) {
  530. conv_func2((const float *) &x[-(h_vec->len / 2)],
  531. (const float *) h,
  532. (float *) y,
  533. h_vec->len, out_vec->len);
  534. } else if (!conv_func) {
  535. conv_real_generic((const float *) &x[-(h_vec->len / 2)],
  536. (const float *) h,
  537. (float *) y,
  538. h_vec->len, out_vec->len);
  539. } else {
  540. conv_func((const float *) &x[-(h_vec->len / 2)],
  541. (const float *) h,
  542. (float *) y, out_vec->len);
  543. }
  544. return i;
  545. }
  546. /*! \brief Single output convolution
  547. * \param[in] in Pointer to complex input samples
  548. * \param[in] h Complex vector filter taps
  549. * \param[out] out Pointer to complex output sample
  550. *
  551. * Modified single output convole with filter length dependent SSE
  552. * optimization. See generic convole call for additional information.
  553. */
  554. int single_convolve(const float *in,
  555. const float *h,
  556. size_t hlen,
  557. float *out)
  558. {
  559. const float complex *_in = (const float complex *) in;
  560. float complex *_out = (float complex *) out;
  561. void (*conv_func)
  562. (const float *, const float *, float *, int) = NULL;
  563. void (*conv_func_n)
  564. (const float *, const float *, float *, int, int) = NULL;
  565. switch (hlen) {
  566. case 4:
  567. conv_func = conv_real_sse4;
  568. break;
  569. case 8:
  570. conv_func = conv_real_sse8;
  571. break;
  572. case 12:
  573. conv_func = conv_real_sse12;
  574. break;
  575. case 16:
  576. conv_func = conv_real_sse16;
  577. break;
  578. case 20:
  579. conv_func = conv_real_sse20;
  580. break;
  581. default:
  582. #ifdef HAVE_FMA
  583. if (!(hlen % 8))
  584. conv_func_n = conv_real_fma_8n;
  585. #else
  586. if (!(hlen % 8))
  587. conv_func_n = sse_conv_real8n;
  588. #endif
  589. break;
  590. }
  591. if (conv_func) {
  592. conv_func((const float *) &_in[-(hlen - 1)],
  593. (const float *) h,
  594. (float *) _out, 1);
  595. } else if (conv_func_n) {
  596. conv_func_n((const float *) &_in[-(hlen - 1)],
  597. (const float *) h,
  598. (float *) _out, hlen, 1);
  599. } else {
  600. conv_real_generic((const float *) &_in[-(hlen - 1)],
  601. (const float *) h,
  602. (float *) _out, hlen, 1);
  603. }
  604. return 1;
  605. }