/**
 * Contains SSE2 and MMX versions of certain operations for char, byte, and
 * ubyte ('a', 'g' and 'h' suffixes).
 *
 * Copyright: Copyright Digital Mars 2008 - 2010.
 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
 * Authors:   Walter Bright, based on code originally written by Burton Radons
 */

/*          Copyright Digital Mars 2008 - 2010.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module rt.arraybyte;
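
// Usage sketch (added for orientation, not part of the upstream file): the compiler
// lowers byte-wise array operations to these hooks, so with
//     byte[] x, y;
//     x[] = y[] + 5;
// the assignment is expected to end up in _arraySliceExpAddSliceAssign_g(x, 5, y).
// The '_a' (char[]) and '_h' (ubyte[]) entry points simply forward to the '_g' versions.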

import core.cpuid;

// debug=PRINTF

version (unittest)
{
    private import core.stdc.stdio : printf;
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    @property bool mmx()      { return cpuid == 1 && core.cpuid.mmx; }
    @property bool sse()      { return cpuid == 2 && core.cpuid.sse; }
    @property bool sse2()     { return cpuid == 3 && core.cpuid.sse2; }
    @property bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow; }
}
else
{
    alias core.cpuid.mmx mmx;
    alias core.cpuid.sse sse;
    alias core.cpuid.sse2 sse2;
    alias core.cpuid.amd3dnow amd3dnow;
}
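
// Note (added): the unittests below loop the module-level 'cpuid' variable from 0 to
// CPUID_MAX - 1, and the properties above report true only for their own cpuid value,
// so each test pass exercises the plain scalar loop as well as the MMX and SSE2 paths
// that the host CPU actually supports.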

//version = log;

@trusted pure nothrow
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
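
// Note (added): the 'in' contracts below use disjoint() because the vector loops read
// and write whole 8- or 16-byte chunks at a time, so overlapping operands are rejected
// up front rather than processed element by element.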

alias byte T;

extern (C) @trusted nothrow:

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;
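            // Note (added): the multiply replicates the byte value into every byte of a
            // 32-bit word; movd then places that word in the low lane of XMM4 and
            // 'pshufd XMM4, XMM4, 0' broadcasts it across all 16 bytes for paddb.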

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;
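            // Note (added): here only the low 16 bits hold two copies of the value;
            // movd loads them into MM4 and 'pshufw MM4, MM4, 0' broadcasts that word
            // to all four words, i.e. to all 8 bytes of the MMX register.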

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;
                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;
                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There's not enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
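            // Note (added): the fallback still handles 4 bytes per iteration by loading
            // two 16-bit words into DX and BX and adding CL to each 8-bit half
            // (DL/DH and BL/BH) before storing the words back.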

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;
                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
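            // Note (added): 67 is presumably chosen because it is not a multiple of 64,
            // 32, or 4, so the scalar tail loop runs after every vector path, and the
            // j == 1 pass misaligns the slices to hit the unaligned code paths.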
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;
                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;
                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;
                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;
                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;
                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;
                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;
                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;
                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    debug(PRINTF) printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;
                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;
                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;
                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;
                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;
                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;
                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There's not enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;
                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;
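            // Note (added): because psubb subtracts in place, the loops below copy the
            // broadcast value from XMM4 (MM4 in the MMX path) into scratch registers
            // each iteration and subtract the loaded b[] chunks from those copies,
            // giving value - b[] rather than b[] - value.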

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;
                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;
                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;
                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;
                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;
                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;
                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;
                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;
                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;
                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;
                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;
                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;
                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    debug(PRINTF) printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;
                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;
                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;