/src/rt/arraybyte.d
D | 1882 lines | 1552 code | 237 blank | 93 comment | 140 complexity | 8673feb792a3492dd1f2fe30eb55ba09 MD5 | raw file
Large files are truncated, but you can click here to view the full file
1/** 2 * Contains SSE2 and MMX versions of certain operations for char, byte, and 3 * ubyte ('a', 'g' and 'h' suffixes). 4 * 5 * Copyright: Copyright Digital Mars 2008 - 2010. 6 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>. 7 * Authors: Walter Bright, based on code originally written by Burton Radons 8 */ 9 10/* Copyright Digital Mars 2008 - 2010. 11 * Distributed under the Boost Software License, Version 1.0. 12 * (See accompanying file LICENSE or copy at 13 * http://www.boost.org/LICENSE_1_0.txt) 14 */ 15module rt.arraybyte; 16 17import core.cpuid; 18 19// debug=PRINTF 20 21version (unittest) 22{ 23 private import core.stdc.stdio : printf; 24 /* This is so unit tests will test every CPU variant 25 */ 26 int cpuid; 27 const int CPUID_MAX = 4; 28 @property bool mmx() { return cpuid == 1 && core.cpuid.mmx; } 29 @property bool sse() { return cpuid == 2 && core.cpuid.sse; } 30 @property bool sse2() { return cpuid == 3 && core.cpuid.sse2; } 31 @property bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow; } 32} 33else 34{ 35 alias core.cpuid.mmx mmx; 36 alias core.cpuid.sse sse; 37 alias core.cpuid.sse2 sse2; 38 alias core.cpuid.amd3dnow amd3dnow; 39} 40 41//version = log; 42 43@trusted pure nothrow 44bool disjoint(T)(T[] a, T[] b) 45{ 46 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); 47} 48 49alias byte T; 50 51extern (C) @trusted nothrow: 52 53/* ======================================================================== */ 54 55 56/*********************** 57 * Computes: 58 * a[] = b[] + value 59 */ 60 61T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b) 62{ 63 return _arraySliceExpAddSliceAssign_g(a, value, b); 64} 65 66T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b) 67{ 68 return _arraySliceExpAddSliceAssign_g(a, value, b); 69} 70 71T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b) 72in 73{ 74 assert(a.length == b.length); 75 assert(disjoint(a, b)); 76} 77body 78{ 79 
//printf("_arraySliceExpAddSliceAssign_g()\n"); 80 auto aptr = a.ptr; 81 auto aend = aptr + a.length; 82 auto bptr = b.ptr; 83 84 version (D_InlineAsm_X86) 85 { 86 // SSE2 aligned version is 1088% faster 87 if (sse2 && a.length >= 64) 88 { 89 auto n = aptr + (a.length & ~63); 90 91 uint l = cast(ubyte)value * 0x01010101; 92 93 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) 94 { 95 asm // unaligned case 96 { 97 mov ESI, aptr; 98 mov EDI, n; 99 mov EAX, bptr; 100 movd XMM4, l; 101 pshufd XMM4, XMM4, 0; 102 103 align 8; 104 startaddsse2u: 105 add ESI, 64; 106 movdqu XMM0, [EAX]; 107 movdqu XMM1, [EAX+16]; 108 movdqu XMM2, [EAX+32]; 109 movdqu XMM3, [EAX+48]; 110 add EAX, 64; 111 paddb XMM0, XMM4; 112 paddb XMM1, XMM4; 113 paddb XMM2, XMM4; 114 paddb XMM3, XMM4; 115 movdqu [ESI -64], XMM0; 116 movdqu [ESI+16-64], XMM1; 117 movdqu [ESI+32-64], XMM2; 118 movdqu [ESI+48-64], XMM3; 119 cmp ESI, EDI; 120 jb startaddsse2u; 121 122 mov aptr, ESI; 123 mov bptr, EAX; 124 } 125 } 126 else 127 { 128 asm // aligned case 129 { 130 mov ESI, aptr; 131 mov EDI, n; 132 mov EAX, bptr; 133 movd XMM4, l; 134 pshufd XMM4, XMM4, 0; 135 136 align 8; 137 startaddsse2a: 138 add ESI, 64; 139 movdqa XMM0, [EAX]; 140 movdqa XMM1, [EAX+16]; 141 movdqa XMM2, [EAX+32]; 142 movdqa XMM3, [EAX+48]; 143 add EAX, 64; 144 paddb XMM0, XMM4; 145 paddb XMM1, XMM4; 146 paddb XMM2, XMM4; 147 paddb XMM3, XMM4; 148 movdqa [ESI -64], XMM0; 149 movdqa [ESI+16-64], XMM1; 150 movdqa [ESI+32-64], XMM2; 151 movdqa [ESI+48-64], XMM3; 152 cmp ESI, EDI; 153 jb startaddsse2a; 154 155 mov aptr, ESI; 156 mov bptr, EAX; 157 } 158 } 159 } 160 else 161 // MMX version is 1000% faster 162 if (mmx && a.length >= 32) 163 { 164 auto n = aptr + (a.length & ~31); 165 166 uint l = cast(ubyte)value * 0x0101; 167 168 asm 169 { 170 mov ESI, aptr; 171 mov EDI, n; 172 mov EAX, bptr; 173 movd MM4, l; 174 pshufw MM4, MM4, 0; 175 176 align 4; 177 startaddmmx: 178 add ESI, 32; 179 movq MM0, [EAX]; 180 movq MM1, [EAX+8]; 181 movq MM2, 
[EAX+16]; 182 movq MM3, [EAX+24]; 183 add EAX, 32; 184 paddb MM0, MM4; 185 paddb MM1, MM4; 186 paddb MM2, MM4; 187 paddb MM3, MM4; 188 movq [ESI -32], MM0; 189 movq [ESI+8 -32], MM1; 190 movq [ESI+16-32], MM2; 191 movq [ESI+24-32], MM3; 192 cmp ESI, EDI; 193 jb startaddmmx; 194 195 emms; 196 mov aptr, ESI; 197 mov bptr, EAX; 198 } 199 } 200 /* trying to be fair and treat normal 32-bit cpu the same way as we do 201 * the SIMD units, with unrolled asm. There's not enough registers, 202 * really. 203 */ 204 else 205 if (a.length >= 4) 206 { 207 208 auto n = aptr + (a.length & ~3); 209 asm 210 { 211 mov ESI, aptr; 212 mov EDI, n; 213 mov EAX, bptr; 214 mov CL, value; 215 216 align 4; 217 startadd386: 218 add ESI, 4; 219 mov DX, [EAX]; 220 mov BX, [EAX+2]; 221 add EAX, 4; 222 add BL, CL; 223 add BH, CL; 224 add DL, CL; 225 add DH, CL; 226 mov [ESI -4], DX; 227 mov [ESI+2 -4], BX; 228 cmp ESI, EDI; 229 jb startadd386; 230 231 mov aptr, ESI; 232 mov bptr, EAX; 233 } 234 235 } 236 } 237 238 while (aptr < aend) 239 *aptr++ = cast(T)(*bptr++ + value); 240 241 return a; 242} 243 244unittest 245{ 246 debug(PRINTF) printf("_arraySliceExpAddSliceAssign_g unittest\n"); 247 248 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 249 { 250 version (log) printf(" cpuid %d\n", cpuid); 251 252 for (int j = 0; j < 2; j++) 253 { 254 const int dim = 67; 255 T[] a = new T[dim + j]; // aligned on 16 byte boundary 256 a = a[j .. dim + j]; // misalign for second iteration 257 T[] b = new T[dim + j]; 258 b = b[j .. dim + j]; 259 T[] c = new T[dim + j]; 260 c = c[j .. 
dim + j]; 261 262 for (int i = 0; i < dim; i++) 263 { a[i] = cast(T)i; 264 b[i] = cast(T)(i + 7); 265 c[i] = cast(T)(i * 2); 266 } 267 268 c[] = a[] + 6; 269 270 for (int i = 0; i < dim; i++) 271 { 272 if (c[i] != cast(T)(a[i] + 6)) 273 { 274 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); 275 assert(0); 276 } 277 } 278 } 279 } 280} 281 282 283/* ======================================================================== */ 284 285/*********************** 286 * Computes: 287 * a[] = b[] + c[] 288 */ 289 290T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b) 291{ 292 return _arraySliceSliceAddSliceAssign_g(a, c, b); 293} 294 295T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b) 296{ 297 return _arraySliceSliceAddSliceAssign_g(a, c, b); 298} 299 300T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b) 301in 302{ 303 assert(a.length == b.length && b.length == c.length); 304 assert(disjoint(a, b)); 305 assert(disjoint(a, c)); 306 assert(disjoint(b, c)); 307} 308body 309{ 310 //printf("_arraySliceSliceAddSliceAssign_g()\n"); 311 auto aptr = a.ptr; 312 auto aend = aptr + a.length; 313 auto bptr = b.ptr; 314 auto cptr = c.ptr; 315 316 version (D_InlineAsm_X86) 317 { 318 // SSE2 aligned version is 5739% faster 319 if (sse2 && a.length >= 64) 320 { 321 auto n = aptr + (a.length & ~63); 322 323 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) 324 { 325 version (log) printf("\tsse2 unaligned\n"); 326 asm // unaligned case 327 { 328 mov ESI, aptr; 329 mov EDI, n; 330 mov EAX, bptr; 331 mov ECX, cptr; 332 333 align 8; 334 startaddlsse2u: 335 add ESI, 64; 336 movdqu XMM0, [EAX]; 337 movdqu XMM1, [EAX+16]; 338 movdqu XMM2, [EAX+32]; 339 movdqu XMM3, [EAX+48]; 340 add EAX, 64; 341 movdqu XMM4, [ECX]; 342 movdqu XMM5, [ECX+16]; 343 movdqu XMM6, [ECX+32]; 344 movdqu XMM7, [ECX+48]; 345 add ECX, 64; 346 paddb XMM0, XMM4; 347 paddb XMM1, XMM5; 348 paddb XMM2, XMM6; 349 paddb XMM3, XMM7; 350 movdqu [ESI -64], XMM0; 351 movdqu [ESI+16-64], XMM1; 352 
movdqu [ESI+32-64], XMM2; 353 movdqu [ESI+48-64], XMM3; 354 cmp ESI, EDI; 355 jb startaddlsse2u; 356 357 mov aptr, ESI; 358 mov bptr, EAX; 359 mov cptr, ECX; 360 } 361 } 362 else 363 { 364 version (log) printf("\tsse2 aligned\n"); 365 asm // aligned case 366 { 367 mov ESI, aptr; 368 mov EDI, n; 369 mov EAX, bptr; 370 mov ECX, cptr; 371 372 align 8; 373 startaddlsse2a: 374 add ESI, 64; 375 movdqa XMM0, [EAX]; 376 movdqa XMM1, [EAX+16]; 377 movdqa XMM2, [EAX+32]; 378 movdqa XMM3, [EAX+48]; 379 add EAX, 64; 380 movdqa XMM4, [ECX]; 381 movdqa XMM5, [ECX+16]; 382 movdqa XMM6, [ECX+32]; 383 movdqa XMM7, [ECX+48]; 384 add ECX, 64; 385 paddb XMM0, XMM4; 386 paddb XMM1, XMM5; 387 paddb XMM2, XMM6; 388 paddb XMM3, XMM7; 389 movdqa [ESI -64], XMM0; 390 movdqa [ESI+16-64], XMM1; 391 movdqa [ESI+32-64], XMM2; 392 movdqa [ESI+48-64], XMM3; 393 cmp ESI, EDI; 394 jb startaddlsse2a; 395 396 mov aptr, ESI; 397 mov bptr, EAX; 398 mov cptr, ECX; 399 } 400 } 401 } 402 else 403 // MMX version is 4428% faster 404 if (mmx && a.length >= 32) 405 { 406 version (log) printf("\tmmx\n"); 407 auto n = aptr + (a.length & ~31); 408 409 asm 410 { 411 mov ESI, aptr; 412 mov EDI, n; 413 mov EAX, bptr; 414 mov ECX, cptr; 415 416 align 4; 417 startaddlmmx: 418 add ESI, 32; 419 movq MM0, [EAX]; 420 movq MM1, [EAX+8]; 421 movq MM2, [EAX+16]; 422 movq MM3, [EAX+24]; 423 add EAX, 32; 424 movq MM4, [ECX]; 425 movq MM5, [ECX+8]; 426 movq MM6, [ECX+16]; 427 movq MM7, [ECX+24]; 428 add ECX, 32; 429 paddb MM0, MM4; 430 paddb MM1, MM5; 431 paddb MM2, MM6; 432 paddb MM3, MM7; 433 movq [ESI -32], MM0; 434 movq [ESI+8 -32], MM1; 435 movq [ESI+16-32], MM2; 436 movq [ESI+24-32], MM3; 437 cmp ESI, EDI; 438 jb startaddlmmx; 439 440 emms; 441 mov aptr, ESI; 442 mov bptr, EAX; 443 mov cptr, ECX; 444 } 445 } 446 } 447 448 version (log) if (aptr < aend) printf("\tbase\n"); 449 while (aptr < aend) 450 *aptr++ = cast(T)(*bptr++ + *cptr++); 451 452 return a; 453} 454 455unittest 456{ 457 debug(PRINTF) 
printf("_arraySliceSliceAddSliceAssign_g unittest\n"); 458 459 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 460 { 461 version (log) printf(" cpuid %d\n", cpuid); 462 463 for (int j = 0; j < 2; j++) 464 { 465 const int dim = 67; 466 T[] a = new T[dim + j]; // aligned on 16 byte boundary 467 a = a[j .. dim + j]; // misalign for second iteration 468 T[] b = new T[dim + j]; 469 b = b[j .. dim + j]; 470 T[] c = new T[dim + j]; 471 c = c[j .. dim + j]; 472 473 for (int i = 0; i < dim; i++) 474 { a[i] = cast(T)i; 475 b[i] = cast(T)(i + 7); 476 c[i] = cast(T)(i * 2); 477 } 478 479 c[] = a[] + b[]; 480 481 for (int i = 0; i < dim; i++) 482 { 483 if (c[i] != cast(T)(a[i] + b[i])) 484 { 485 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); 486 assert(0); 487 } 488 } 489 } 490 } 491} 492 493 494/* ======================================================================== */ 495 496/*********************** 497 * Computes: 498 * a[] += value 499 */ 500 501T[] _arrayExpSliceAddass_a(T[] a, T value) 502{ 503 return _arrayExpSliceAddass_g(a, value); 504} 505 506T[] _arrayExpSliceAddass_h(T[] a, T value) 507{ 508 return _arrayExpSliceAddass_g(a, value); 509} 510 511T[] _arrayExpSliceAddass_g(T[] a, T value) 512{ 513 //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); 514 auto aptr = a.ptr; 515 auto aend = aptr + a.length; 516 517 version (D_InlineAsm_X86) 518 { 519 // SSE2 aligned version is 1578% faster 520 if (sse2 && a.length >= 64) 521 { 522 auto n = aptr + (a.length & ~63); 523 524 uint l = cast(ubyte)value * 0x01010101; 525 526 if (((cast(uint) aptr) & 15) != 0) 527 { 528 asm // unaligned case 529 { 530 mov ESI, aptr; 531 mov EDI, n; 532 movd XMM4, l; 533 pshufd XMM4, XMM4, 0; 534 535 align 8; 536 startaddasssse2u: 537 movdqu XMM0, [ESI]; 538 movdqu XMM1, [ESI+16]; 539 movdqu XMM2, [ESI+32]; 540 movdqu XMM3, [ESI+48]; 541 add ESI, 64; 542 paddb XMM0, XMM4; 543 paddb XMM1, XMM4; 544 paddb XMM2, XMM4; 545 paddb XMM3, XMM4; 546 movdqu 
[ESI -64], XMM0; 547 movdqu [ESI+16-64], XMM1; 548 movdqu [ESI+32-64], XMM2; 549 movdqu [ESI+48-64], XMM3; 550 cmp ESI, EDI; 551 jb startaddasssse2u; 552 553 mov aptr, ESI; 554 } 555 } 556 else 557 { 558 asm // aligned case 559 { 560 mov ESI, aptr; 561 mov EDI, n; 562 movd XMM4, l; 563 pshufd XMM4, XMM4, 0; 564 565 align 8; 566 startaddasssse2a: 567 movdqa XMM0, [ESI]; 568 movdqa XMM1, [ESI+16]; 569 movdqa XMM2, [ESI+32]; 570 movdqa XMM3, [ESI+48]; 571 add ESI, 64; 572 paddb XMM0, XMM4; 573 paddb XMM1, XMM4; 574 paddb XMM2, XMM4; 575 paddb XMM3, XMM4; 576 movdqa [ESI -64], XMM0; 577 movdqa [ESI+16-64], XMM1; 578 movdqa [ESI+32-64], XMM2; 579 movdqa [ESI+48-64], XMM3; 580 cmp ESI, EDI; 581 jb startaddasssse2a; 582 583 mov aptr, ESI; 584 } 585 } 586 } 587 else 588 // MMX version is 1721% faster 589 if (mmx && a.length >= 32) 590 { 591 592 auto n = aptr + (a.length & ~31); 593 594 uint l = cast(ubyte)value * 0x0101; 595 596 asm 597 { 598 mov ESI, aptr; 599 mov EDI, n; 600 movd MM4, l; 601 pshufw MM4, MM4, 0; 602 603 align 8; 604 startaddassmmx: 605 movq MM0, [ESI]; 606 movq MM1, [ESI+8]; 607 movq MM2, [ESI+16]; 608 movq MM3, [ESI+24]; 609 add ESI, 32; 610 paddb MM0, MM4; 611 paddb MM1, MM4; 612 paddb MM2, MM4; 613 paddb MM3, MM4; 614 movq [ESI -32], MM0; 615 movq [ESI+8 -32], MM1; 616 movq [ESI+16-32], MM2; 617 movq [ESI+24-32], MM3; 618 cmp ESI, EDI; 619 jb startaddassmmx; 620 621 emms; 622 mov aptr, ESI; 623 } 624 } 625 } 626 627 while (aptr < aend) 628 *aptr++ += value; 629 630 return a; 631} 632 633unittest 634{ 635 debug(PRINTF) printf("_arrayExpSliceAddass_g unittest\n"); 636 637 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 638 { 639 version (log) printf(" cpuid %d\n", cpuid); 640 641 for (int j = 0; j < 2; j++) 642 { 643 const int dim = 67; 644 T[] a = new T[dim + j]; // aligned on 16 byte boundary 645 a = a[j .. dim + j]; // misalign for second iteration 646 T[] b = new T[dim + j]; 647 b = b[j .. dim + j]; 648 T[] c = new T[dim + j]; 649 c = c[j .. 
dim + j]; 650 651 for (int i = 0; i < dim; i++) 652 { a[i] = cast(T)i; 653 b[i] = cast(T)(i + 7); 654 c[i] = cast(T)(i * 2); 655 } 656 657 a[] = c[]; 658 c[] += 6; 659 660 for (int i = 0; i < dim; i++) 661 { 662 if (c[i] != cast(T)(a[i] + 6)) 663 { 664 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); 665 assert(0); 666 } 667 } 668 } 669 } 670} 671 672 673/* ======================================================================== */ 674 675/*********************** 676 * Computes: 677 * a[] += b[] 678 */ 679 680T[] _arraySliceSliceAddass_a(T[] a, T[] b) 681{ 682 return _arraySliceSliceAddass_g(a, b); 683} 684 685T[] _arraySliceSliceAddass_h(T[] a, T[] b) 686{ 687 return _arraySliceSliceAddass_g(a, b); 688} 689 690T[] _arraySliceSliceAddass_g(T[] a, T[] b) 691in 692{ 693 assert (a.length == b.length); 694 assert (disjoint(a, b)); 695} 696body 697{ 698 //printf("_arraySliceSliceAddass_g()\n"); 699 auto aptr = a.ptr; 700 auto aend = aptr + a.length; 701 auto bptr = b.ptr; 702 703 version (D_InlineAsm_X86) 704 { 705 // SSE2 aligned version is 4727% faster 706 if (sse2 && a.length >= 64) 707 { 708 auto n = aptr + (a.length & ~63); 709 710 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) 711 { 712 asm // unaligned case 713 { 714 mov ESI, aptr; 715 mov EDI, n; 716 mov ECX, bptr; 717 718 align 8; 719 startaddasslsse2u: 720 movdqu XMM0, [ESI]; 721 movdqu XMM1, [ESI+16]; 722 movdqu XMM2, [ESI+32]; 723 movdqu XMM3, [ESI+48]; 724 add ESI, 64; 725 movdqu XMM4, [ECX]; 726 movdqu XMM5, [ECX+16]; 727 movdqu XMM6, [ECX+32]; 728 movdqu XMM7, [ECX+48]; 729 add ECX, 64; 730 paddb XMM0, XMM4; 731 paddb XMM1, XMM5; 732 paddb XMM2, XMM6; 733 paddb XMM3, XMM7; 734 movdqu [ESI -64], XMM0; 735 movdqu [ESI+16-64], XMM1; 736 movdqu [ESI+32-64], XMM2; 737 movdqu [ESI+48-64], XMM3; 738 cmp ESI, EDI; 739 jb startaddasslsse2u; 740 741 mov aptr, ESI; 742 mov bptr, ECX; 743 } 744 } 745 else 746 { 747 asm // aligned case 748 { 749 mov ESI, aptr; 750 mov EDI, n; 751 mov ECX, bptr; 752 753 align 8; 
754 startaddasslsse2a: 755 movdqa XMM0, [ESI]; 756 movdqa XMM1, [ESI+16]; 757 movdqa XMM2, [ESI+32]; 758 movdqa XMM3, [ESI+48]; 759 add ESI, 64; 760 movdqa XMM4, [ECX]; 761 movdqa XMM5, [ECX+16]; 762 movdqa XMM6, [ECX+32]; 763 movdqa XMM7, [ECX+48]; 764 add ECX, 64; 765 paddb XMM0, XMM4; 766 paddb XMM1, XMM5; 767 paddb XMM2, XMM6; 768 paddb XMM3, XMM7; 769 movdqa [ESI -64], XMM0; 770 movdqa [ESI+16-64], XMM1; 771 movdqa [ESI+32-64], XMM2; 772 movdqa [ESI+48-64], XMM3; 773 cmp ESI, EDI; 774 jb startaddasslsse2a; 775 776 mov aptr, ESI; 777 mov bptr, ECX; 778 } 779 } 780 } 781 else 782 // MMX version is 3059% faster 783 if (mmx && a.length >= 32) 784 { 785 786 auto n = aptr + (a.length & ~31); 787 788 asm 789 { 790 mov ESI, aptr; 791 mov EDI, n; 792 mov ECX, bptr; 793 794 align 8; 795 startaddasslmmx: 796 movq MM0, [ESI]; 797 movq MM1, [ESI+8]; 798 movq MM2, [ESI+16]; 799 movq MM3, [ESI+24]; 800 add ESI, 32; 801 movq MM4, [ECX]; 802 movq MM5, [ECX+8]; 803 movq MM6, [ECX+16]; 804 movq MM7, [ECX+24]; 805 add ECX, 32; 806 paddb MM0, MM4; 807 paddb MM1, MM5; 808 paddb MM2, MM6; 809 paddb MM3, MM7; 810 movq [ESI -32], MM0; 811 movq [ESI+8 -32], MM1; 812 movq [ESI+16-32], MM2; 813 movq [ESI+24-32], MM3; 814 cmp ESI, EDI; 815 jb startaddasslmmx; 816 817 emms; 818 mov aptr, ESI; 819 mov bptr, ECX; 820 } 821 } 822 } 823 824 while (aptr < aend) 825 *aptr++ += *bptr++; 826 827 return a; 828} 829 830unittest 831{ 832 debug(PRINTF) printf("_arraySliceSliceAddass_g unittest\n"); 833 834 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 835 { 836 version (log) printf(" cpuid %d\n", cpuid); 837 838 for (int j = 0; j < 2; j++) 839 { 840 const int dim = 67; 841 T[] a = new T[dim + j]; // aligned on 16 byte boundary 842 a = a[j .. dim + j]; // misalign for second iteration 843 T[] b = new T[dim + j]; 844 b = b[j .. dim + j]; 845 T[] c = new T[dim + j]; 846 c = c[j .. 
dim + j]; 847 848 for (int i = 0; i < dim; i++) 849 { a[i] = cast(T)i; 850 b[i] = cast(T)(i + 7); 851 c[i] = cast(T)(i * 2); 852 } 853 854 a[] = c[]; 855 c[] += b[]; 856 857 for (int i = 0; i < dim; i++) 858 { 859 if (c[i] != cast(T)(a[i] + b[i])) 860 { 861 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); 862 assert(0); 863 } 864 } 865 } 866 } 867} 868 869 870/* ======================================================================== */ 871 872 873/*********************** 874 * Computes: 875 * a[] = b[] - value 876 */ 877 878T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b) 879{ 880 return _arraySliceExpMinSliceAssign_g(a, value, b); 881} 882 883T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b) 884{ 885 return _arraySliceExpMinSliceAssign_g(a, value, b); 886} 887 888T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b) 889in 890{ 891 assert(a.length == b.length); 892 assert(disjoint(a, b)); 893} 894body 895{ 896 //printf("_arraySliceExpMinSliceAssign_g()\n"); 897 auto aptr = a.ptr; 898 auto aend = aptr + a.length; 899 auto bptr = b.ptr; 900 901 version (D_InlineAsm_X86) 902 { 903 // SSE2 aligned version is 1189% faster 904 if (sse2 && a.length >= 64) 905 { 906 auto n = aptr + (a.length & ~63); 907 908 uint l = cast(ubyte)value * 0x01010101; 909 910 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) 911 { 912 asm // unaligned case 913 { 914 mov ESI, aptr; 915 mov EDI, n; 916 mov EAX, bptr; 917 movd XMM4, l; 918 pshufd XMM4, XMM4, 0; 919 920 align 8; 921 startsubsse2u: 922 add ESI, 64; 923 movdqu XMM0, [EAX]; 924 movdqu XMM1, [EAX+16]; 925 movdqu XMM2, [EAX+32]; 926 movdqu XMM3, [EAX+48]; 927 add EAX, 64; 928 psubb XMM0, XMM4; 929 psubb XMM1, XMM4; 930 psubb XMM2, XMM4; 931 psubb XMM3, XMM4; 932 movdqu [ESI -64], XMM0; 933 movdqu [ESI+16-64], XMM1; 934 movdqu [ESI+32-64], XMM2; 935 movdqu [ESI+48-64], XMM3; 936 cmp ESI, EDI; 937 jb startsubsse2u; 938 939 mov aptr, ESI; 940 mov bptr, EAX; 941 } 942 } 943 else 944 { 945 asm // aligned case 
946 { 947 mov ESI, aptr; 948 mov EDI, n; 949 mov EAX, bptr; 950 movd XMM4, l; 951 pshufd XMM4, XMM4, 0; 952 953 align 8; 954 startsubsse2a: 955 add ESI, 64; 956 movdqa XMM0, [EAX]; 957 movdqa XMM1, [EAX+16]; 958 movdqa XMM2, [EAX+32]; 959 movdqa XMM3, [EAX+48]; 960 add EAX, 64; 961 psubb XMM0, XMM4; 962 psubb XMM1, XMM4; 963 psubb XMM2, XMM4; 964 psubb XMM3, XMM4; 965 movdqa [ESI -64], XMM0; 966 movdqa [ESI+16-64], XMM1; 967 movdqa [ESI+32-64], XMM2; 968 movdqa [ESI+48-64], XMM3; 969 cmp ESI, EDI; 970 jb startsubsse2a; 971 972 mov aptr, ESI; 973 mov bptr, EAX; 974 } 975 } 976 } 977 else 978 // MMX version is 1079% faster 979 if (mmx && a.length >= 32) 980 { 981 auto n = aptr + (a.length & ~31); 982 983 uint l = cast(ubyte)value * 0x0101; 984 985 asm 986 { 987 mov ESI, aptr; 988 mov EDI, n; 989 mov EAX, bptr; 990 movd MM4, l; 991 pshufw MM4, MM4, 0; 992 993 align 4; 994 startsubmmx: 995 add ESI, 32; 996 movq MM0, [EAX]; 997 movq MM1, [EAX+8]; 998 movq MM2, [EAX+16]; 999 movq MM3, [EAX+24]; 1000 add EAX, 32; 1001 psubb MM0, MM4; 1002 psubb MM1, MM4; 1003 psubb MM2, MM4; 1004 psubb MM3, MM4; 1005 movq [ESI -32], MM0; 1006 movq [ESI+8 -32], MM1; 1007 movq [ESI+16-32], MM2; 1008 movq [ESI+24-32], MM3; 1009 cmp ESI, EDI; 1010 jb startsubmmx; 1011 1012 emms; 1013 mov aptr, ESI; 1014 mov bptr, EAX; 1015 } 1016 } 1017 // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There's not enough registers, really. 
1018 else 1019 if (a.length >= 4) 1020 { 1021 auto n = aptr + (a.length & ~3); 1022 asm 1023 { 1024 mov ESI, aptr; 1025 mov EDI, n; 1026 mov EAX, bptr; 1027 mov CL, value; 1028 1029 align 4; 1030 startsub386: 1031 add ESI, 4; 1032 mov DX, [EAX]; 1033 mov BX, [EAX+2]; 1034 add EAX, 4; 1035 sub BL, CL; 1036 sub BH, CL; 1037 sub DL, CL; 1038 sub DH, CL; 1039 mov [ESI -4], DX; 1040 mov [ESI+2 -4], BX; 1041 cmp ESI, EDI; 1042 jb startsub386; 1043 1044 mov aptr, ESI; 1045 mov bptr, EAX; 1046 } 1047 } 1048 } 1049 1050 while (aptr < aend) 1051 *aptr++ = cast(T)(*bptr++ - value); 1052 1053 return a; 1054} 1055 1056unittest 1057{ 1058 debug(PRINTF) printf("_arraySliceExpMinSliceAssign_g unittest\n"); 1059 1060 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 1061 { 1062 version (log) printf(" cpuid %d\n", cpuid); 1063 1064 for (int j = 0; j < 2; j++) 1065 { 1066 const int dim = 67; 1067 T[] a = new T[dim + j]; // aligned on 16 byte boundary 1068 a = a[j .. dim + j]; // misalign for second iteration 1069 T[] b = new T[dim + j]; 1070 b = b[j .. dim + j]; 1071 T[] c = new T[dim + j]; 1072 c = c[j .. 
dim + j]; 1073 1074 for (int i = 0; i < dim; i++) 1075 { a[i] = cast(T)i; 1076 b[i] = cast(T)(i + 7); 1077 c[i] = cast(T)(i * 2); 1078 } 1079 1080 a[] = c[]; 1081 c[] = b[] - 6; 1082 1083 for (int i = 0; i < dim; i++) 1084 { 1085 if (c[i] != cast(T)(b[i] - 6)) 1086 { 1087 printf("[%d]: %d != %d - 6\n", i, c[i], b[i]); 1088 assert(0); 1089 } 1090 } 1091 } 1092 } 1093} 1094 1095 1096/* ======================================================================== */ 1097 1098/*********************** 1099 * Computes: 1100 * a[] = value - b[] 1101 */ 1102 1103T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value) 1104{ 1105 return _arrayExpSliceMinSliceAssign_g(a, b, value); 1106} 1107 1108T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value) 1109{ 1110 return _arrayExpSliceMinSliceAssign_g(a, b, value); 1111} 1112 1113T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value) 1114in 1115{ 1116 assert(a.length == b.length); 1117 assert(disjoint(a, b)); 1118} 1119body 1120{ 1121 //printf("_arrayExpSliceMinSliceAssign_g()\n"); 1122 auto aptr = a.ptr; 1123 auto aend = aptr + a.length; 1124 auto bptr = b.ptr; 1125 1126 version (D_InlineAsm_X86) 1127 { 1128 // SSE2 aligned version is 8748% faster 1129 if (sse2 && a.length >= 64) 1130 { 1131 auto n = aptr + (a.length & ~63); 1132 1133 uint l = cast(ubyte)value * 0x01010101; 1134 1135 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) 1136 { 1137 asm // unaligned case 1138 { 1139 mov ESI, aptr; 1140 mov EDI, n; 1141 mov EAX, bptr; 1142 movd XMM4, l; 1143 pshufd XMM4, XMM4, 0; 1144 1145 align 8; 1146 startsubrsse2u: 1147 add ESI, 64; 1148 movdqa XMM5, XMM4; 1149 movdqa XMM6, XMM4; 1150 movdqu XMM0, [EAX]; 1151 movdqu XMM1, [EAX+16]; 1152 psubb XMM5, XMM0; 1153 psubb XMM6, XMM1; 1154 movdqu [ESI -64], XMM5; 1155 movdqu [ESI+16-64], XMM6; 1156 movdqa XMM5, XMM4; 1157 movdqa XMM6, XMM4; 1158 movdqu XMM2, [EAX+32]; 1159 movdqu XMM3, [EAX+48]; 1160 add EAX, 64; 1161 psubb XMM5, XMM2; 1162 psubb XMM6, XMM3; 1163 movdqu 
[ESI+32-64], XMM5; 1164 movdqu [ESI+48-64], XMM6; 1165 cmp ESI, EDI; 1166 jb startsubrsse2u; 1167 1168 mov aptr, ESI; 1169 mov bptr, EAX; 1170 } 1171 } 1172 else 1173 { 1174 asm // aligned case 1175 { 1176 mov ESI, aptr; 1177 mov EDI, n; 1178 mov EAX, bptr; 1179 movd XMM4, l; 1180 pshufd XMM4, XMM4, 0; 1181 1182 align 8; 1183 startsubrsse2a: 1184 add ESI, 64; 1185 movdqa XMM5, XMM4; 1186 movdqa XMM6, XMM4; 1187 movdqa XMM0, [EAX]; 1188 movdqa XMM1, [EAX+16]; 1189 psubb XMM5, XMM0; 1190 psubb XMM6, XMM1; 1191 movdqa [ESI -64], XMM5; 1192 movdqa [ESI+16-64], XMM6; 1193 movdqa XMM5, XMM4; 1194 movdqa XMM6, XMM4; 1195 movdqa XMM2, [EAX+32]; 1196 movdqa XMM3, [EAX+48]; 1197 add EAX, 64; 1198 psubb XMM5, XMM2; 1199 psubb XMM6, XMM3; 1200 movdqa [ESI+32-64], XMM5; 1201 movdqa [ESI+48-64], XMM6; 1202 cmp ESI, EDI; 1203 jb startsubrsse2a; 1204 1205 mov aptr, ESI; 1206 mov bptr, EAX; 1207 } 1208 } 1209 } 1210 else 1211 // MMX version is 7397% faster 1212 if (mmx && a.length >= 32) 1213 { 1214 auto n = aptr + (a.length & ~31); 1215 1216 uint l = cast(ubyte)value * 0x0101; 1217 1218 asm 1219 { 1220 mov ESI, aptr; 1221 mov EDI, n; 1222 mov EAX, bptr; 1223 movd MM4, l; 1224 pshufw MM4, MM4, 0; 1225 1226 align 4; 1227 startsubrmmx: 1228 add ESI, 32; 1229 movq MM5, MM4; 1230 movq MM6, MM4; 1231 movq MM0, [EAX]; 1232 movq MM1, [EAX+8]; 1233 psubb MM5, MM0; 1234 psubb MM6, MM1; 1235 movq [ESI -32], MM5; 1236 movq [ESI+8 -32], MM6; 1237 movq MM5, MM4; 1238 movq MM6, MM4; 1239 movq MM2, [EAX+16]; 1240 movq MM3, [EAX+24]; 1241 add EAX, 32; 1242 psubb MM5, MM2; 1243 psubb MM6, MM3; 1244 movq [ESI+16-32], MM5; 1245 movq [ESI+24-32], MM6; 1246 cmp ESI, EDI; 1247 jb startsubrmmx; 1248 1249 emms; 1250 mov aptr, ESI; 1251 mov bptr, EAX; 1252 } 1253 } 1254 1255 } 1256 1257 while (aptr < aend) 1258 *aptr++ = cast(T)(value - *bptr++); 1259 1260 return a; 1261} 1262 1263unittest 1264{ 1265 debug(PRINTF) printf("_arrayExpSliceMinSliceAssign_g unittest\n"); 1266 1267 for (cpuid = 0; cpuid < 
CPUID_MAX; cpuid++) 1268 { 1269 version (log) printf(" cpuid %d\n", cpuid); 1270 1271 for (int j = 0; j < 2; j++) 1272 { 1273 const int dim = 67; 1274 T[] a = new T[dim + j]; // aligned on 16 byte boundary 1275 a = a[j .. dim + j]; // misalign for second iteration 1276 T[] b = new T[dim + j]; 1277 b = b[j .. dim + j]; 1278 T[] c = new T[dim + j]; 1279 c = c[j .. dim + j]; 1280 1281 for (int i = 0; i < dim; i++) 1282 { a[i] = cast(T)i; 1283 b[i] = cast(T)(i + 7); 1284 c[i] = cast(T)(i * 2); 1285 } 1286 1287 a[] = c[]; 1288 c[] = 6 - b[]; 1289 1290 for (int i = 0; i < dim; i++) 1291 { 1292 if (c[i] != cast(T)(6 - b[i])) 1293 { 1294 printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]); 1295 assert(0); 1296 } 1297 } 1298 } 1299 } 1300} 1301 1302 1303/* ======================================================================== */ 1304 1305/*********************** 1306 * Computes: 1307 * a[] = b[] - c[] 1308 */ 1309 1310T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b) 1311{ 1312 return _arraySliceSliceMinSliceAssign_g(a, c, b); 1313} 1314 1315T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b) 1316{ 1317 return _arraySliceSliceMinSliceAssign_g(a, c, b); 1318} 1319 1320T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b) 1321in 1322{ 1323 assert(a.length == b.length && b.length == c.length); 1324 assert(disjoint(a, b)); 1325 assert(disjoint(a, c)); 1326 assert(disjoint(b, c)); 1327} 1328body 1329{ 1330 auto aptr = a.ptr; 1331 auto aend = aptr + a.length; 1332 auto bptr = b.ptr; 1333 auto cptr = c.ptr; 1334 1335 version (D_InlineAsm_X86) 1336 { 1337 // SSE2 aligned version is 5756% faster 1338 if (sse2 && a.length >= 64) 1339 { 1340 auto n = aptr + (a.length & ~63); 1341 1342 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) 1343 { 1344 asm // unaligned case 1345 { 1346 mov ESI, aptr; 1347 mov EDI, n; 1348 mov EAX, bptr; 1349 mov ECX, cptr; 1350 1351 align 8; 1352 startsublsse2u: 1353 add ESI, 64; 1354 movdqu XMM0, [EAX]; 1355 movdqu XMM1, 
[EAX+16]; 1356 movdqu XMM2, [EAX+32]; 1357 movdqu XMM3, [EAX+48]; 1358 add EAX, 64; 1359 movdqu XMM4, [ECX]; 1360 movdqu XMM5, [ECX+16]; 1361 movdqu XMM6, [ECX+32]; 1362 movdqu XMM7, [ECX+48]; 1363 add ECX, 64; 1364 psubb XMM0, XMM4; 1365 psubb XMM1, XMM5; 1366 psubb XMM2, XMM6; 1367 psubb XMM3, XMM7; 1368 movdqu [ESI -64], XMM0; 1369 movdqu [ESI+16-64], XMM1; 1370 movdqu [ESI+32-64], XMM2; 1371 movdqu [ESI+48-64], XMM3; 1372 cmp ESI, EDI; 1373 jb startsublsse2u; 1374 1375 mov aptr, ESI; 1376 mov bptr, EAX; 1377 mov cptr, ECX; 1378 } 1379 } 1380 else 1381 { 1382 asm // aligned case 1383 { 1384 mov ESI, aptr; 1385 mov EDI, n; 1386 mov EAX, bptr; 1387 mov ECX, cptr; 1388 1389 align 8; 1390 startsublsse2a: 1391 add ESI, 64; 1392 movdqa XMM0, [EAX]; 1393 movdqa XMM1, [EAX+16]; 1394 movdqa XMM2, [EAX+32]; 1395 movdqa XMM3, [EAX+48]; 1396 add EAX, 64; 1397 movdqa XMM4, [ECX]; 1398 movdqa XMM5, [ECX+16]; 1399 movdqa XMM6, [ECX+32]; 1400 movdqa XMM7, [ECX+48]; 1401 add ECX, 64; 1402 psubb XMM0, XMM4; 1403 psubb XMM1, XMM5; 1404 psubb XMM2, XMM6; 1405 psubb XMM3, XMM7; 1406 movdqa [ESI -64], XMM0; 1407 movdqa [ESI+16-64], XMM1; 1408 movdqa [ESI+32-64], XMM2; 1409 movdqa [ESI+48-64], XMM3; 1410 cmp ESI, EDI; 1411 jb startsublsse2a; 1412 1413 mov aptr, ESI; 1414 mov bptr, EAX; 1415 mov cptr, ECX; 1416 } 1417 } 1418 } 1419 else 1420 // MMX version is 4428% faster 1421 if (mmx && a.length >= 32) 1422 { 1423 auto n = aptr + (a.length & ~31); 1424 1425 asm 1426 { 1427 mov ESI, aptr; 1428 mov EDI, n; 1429 mov EAX, bptr; 1430 mov ECX, cptr; 1431 1432 align 8; 1433 startsublmmx: 1434 add ESI, 32; 1435 movq MM0, [EAX]; 1436 movq MM1, [EAX+8]; 1437 movq MM2, [EAX+16]; 1438 movq MM3, [EAX+24]; 1439 add EAX, 32; 1440 movq MM4, [ECX]; 1441 movq MM5, [ECX+8]; 1442 movq MM6, [ECX+16]; 1443 movq MM7, [ECX+24]; 1444 add ECX, 32; 1445 psubb MM0, MM4; 1446 psubb MM1, MM5; 1447 psubb MM2, MM6; 1448 psubb MM3, MM7; 1449 movq [ESI -32], MM0; 1450 movq [ESI+8 -32], MM1; 1451 movq [ESI+16-32], MM2; 
// (tail of _arraySliceSliceMinSliceAssign_g: the MMX inner loop and scalar
//  cleanup end here; the function's head lies above this chunk)
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;
                mov aptr, ESI;          // publish loop progress so the scalar tail resumes correctly
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Scalar cleanup: handles the 0..31/0..63 byte remainder the vector loops
    // did not cover (and the whole array when no SIMD path was taken).
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    // NOTE(review): `cpuid < CPUID_MAX` iterates 0..3, so the amd3dnow
    // variant (cpuid == 4) is never exercised -- confirm intent.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // j == 0: arrays 16-byte aligned; j == 1: slices misaligned by one
        // byte, forcing the unaligned SIMD code path.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;         // deliberately not a multiple of 32/64 so the scalar tail runs
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            // Verify element-wise against the scalar reference computation.
            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

// char ('a') variant: byte-wise subtraction is identical for all 8-bit
// types, so it forwards to the byte ('g') implementation.
T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

// ubyte ('h') variant: forwards to the byte ('g') implementation.
T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

// byte ('g') implementation of a[] -= value.  Uses SSE2 (64 bytes/iter) or
// MMX (32 bytes/iter) on x86 when available, then a scalar loop for the
// remainder.  Returns the input slice `a`.
T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2 && a.length >= 64)
        {
            // n = end of the portion processed in whole 64-byte chunks.
            auto n = aptr + (a.length & ~63);

            // Splat the byte operand into all four bytes of a 32-bit value;
            // pshufd below then replicates it across the full XMM register.
            uint l = cast(ubyte)value * 0x01010101;

            // Only the destination pointer's alignment matters here; pick
            // movdqu vs movdqa accordingly.
            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;   // broadcast splatted byte to all 16 lanes

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    mov aptr, ESI;      // publish progress for the scalar tail
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;   // broadcast splatted byte to all 16 lanes

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;      // publish progress for the scalar tail
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx && a.length >= 32)
        {

            // n = end of the portion processed in whole 32-byte chunks.
            auto n = aptr + (a.length & ~31);

            // Splat the byte into both bytes of the low word; pshufw below
            // replicates that word across the whole MM register.
            uint l = cast(ubyte)value * 0x0101;

            // NOTE(review): pshufw is an SSE-era extension to MMX, but this
            // path is guarded only by `mmx` -- confirm it cannot be reached
            // on a pre-SSE, MMX-only CPU.
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;                   // leave MMX state so FPU code works afterwards
                mov aptr, ESI;          // publish progress for the scalar tail
            }
        }
    }

    // Scalar cleanup for whatever the vector loops left over.
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
// Exercises _arrayExpSliceMinass_g (a[] -= value) across CPU variants and
// both the aligned and misaligned code paths.
unittest
{
    debug(PRINTF) printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        // j == 0: aligned; j == 1: misaligned slices to hit the movdqu path.
        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;         // non-multiple of 32/64 so the scalar tail runs too
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Keep a pristine copy in `a`, subtract in place on `c`, then
            // compare against the scalar reference below.
            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

// char ('a') variant: byte-wise subtraction is identical for all 8-bit
// types, so it forwards to the byte ('g') implementation.
T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

// ubyte ('h') variant: forwards to the byte ('g') implementation.
T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

// byte ('g') implementation of a[] -= b[].  Requires equal lengths and
// non-overlapping slices (checked in the `in` contract).
T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2 && a.length >= 64)
        {
            // n = end of the portion processed in whole 64-byte chunks.
            auto n = aptr + (a.length & ~63);

            // Both pointers must be 16-byte aligned to use movdqa; OR-ing the
            // addresses tests them in one step.
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
// (loop continues below this chunk)
1739 add ESI, 64; 1740 movdqu XMM4, [ECX]; 1741 movdqu XMM5, [ECX+16]; 1742 movdqu XMM6, [ECX+32]; 1743 movdqu XMM7, [ECX+48]; 1744 add ECX, 64; 1745 psubb XMM0, XMM4; 1746 psubb XMM1, XMM5; 1747 psubb XMM2, XMM6; 1748 psubb XMM3, XMM7; 1749 movdqu [ESI -64], XMM0; 1750 movdqu [ESI+16-64], XMM1; 1751 movdqu [ESI+32-64], XMM2; 1752 movdqu [ESI+48-64], XMM3; 1753 cmp ESI, EDI; 1754 jb startsubasslsse2u; 1755 1756 mov aptr, ESI; 1757 mov bptr, ECX; 1758 } 1759 } 1760 else 1761 { 1762 asm // aligned case 1763 { 1764 mov ESI, aptr; 1765 mov EDI, n; 1766 mov ECX, bptr; 1767 1768 align 8; 1769 startsubasslsse2a: 1770 movdqa XMM0, [ESI]; 1771 movdqa XMM1, [ESI+16]; 1772 movdqa XMM2, [ESI+32]; 1773 movdqa XMM3, [ESI+48]; 1774 add ESI, 64; 1775 movdqa XMM4, [ECX]; 1776 movdqa XMM5, [ECX+16]; 1777 movdqa XMM6, [ECX+32]; 1778 movdqa XMM7, [ECX+48]; 1779 add ECX, 64; 1780 psubb XMM0, XMM4; 1781 psubb XMM1, XMM5; 1782 psubb XMM2, XMM6; 1783 psubb XMM3, XMM7; 1784 movdqa [ESI -64], XMM0; 1785 movdqa [ESI+16-64], XMM1; 1786 movdqa [ESI+32-64], XMM2; 1787 movdqa [ESI+48-64], XMM3; 1788 cmp ESI, EDI; 1789 jb startsubasslsse2a; 1790 1791 mov aptr, ESI; 1792 mov bptr, ECX; 1793 } 1794 } 1795 } 1796 else 1797 // MMX version is 3107% faster 1798 if (mmx && a.length >= 32) 1799 { 1800 1801 auto n = aptr + (a.length & ~31); 1802 1803 asm 1804 { 1805 mov ESI, aptr; 1806 mov EDI, n; 1807 …
Large files files are truncated, but you can click here to view the full file