
/src/rt/arraybyte.d

http://github.com/AlexeyProkhin/druntime

/**
 * Contains SSE2 and MMX versions of certain operations for char, byte, and
 * ubyte ('a', 'g' and 'h' suffixes).
 *
 * Copyright: Copyright Digital Mars 2008 - 2010.
 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
 * Authors:   Walter Bright, based on code originally written by Burton Radons
 */

/*          Copyright Digital Mars 2008 - 2010.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module rt.arraybyte;

import core.cpuid;

// debug=PRINTF

version (unittest)
{
    private import core.stdc.stdio : printf;
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    @property bool mmx()      { return cpuid == 1 && core.cpuid.mmx; }
    @property bool sse()      { return cpuid == 2 && core.cpuid.sse; }
    @property bool sse2()     { return cpuid == 3 && core.cpuid.sse2; }
    @property bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow; }
}
else
{
    alias core.cpuid.mmx mmx;
    alias core.cpuid.sse sse;
    alias core.cpuid.sse2 sse2;
    alias core.cpuid.amd3dnow amd3dnow;
}

//version = log;

@trusted pure nothrow
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
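
/* For example, two slices that merely touch are disjoint, while any shared
 * byte makes them overlap:
 *
 *     auto buf = new byte[8];
 *     assert( disjoint(buf[0 .. 4], buf[4 .. 8]));   // adjacent: disjoint
 *     assert(!disjoint(buf[0 .. 5], buf[4 .. 8]));   // one byte shared
 */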

alias byte T;

extern (C) @trusted nothrow:

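/* These extern (C) entry points are runtime hooks: the compiler lowers array
 * vector expressions to calls into them, e.g. `a[] = b[] + 3` for byte[]
 * operands becomes a call to _arraySliceExpAddSliceAssign_g.
 */
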
/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] + value
 */

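/* The char ('a') and ubyte ('h') entry points simply forward to the byte
 * ('g') implementation: the byte-wise operation is identical for all 8-bit
 * element types, so one body serves all three.
 */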
T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2 && a.length >= 64)
        {
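            // Round the length down to a multiple of 64 so the unrolled loop
            // consumes whole 64-byte (4 x 16-byte) blocks; the scalar loop at
            // the end of the function picks up the remainder.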
            auto n = aptr + (a.length & ~63);

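            // Multiplying the zero-extended byte by 0x01010101 replicates it
            // into all four bytes of l; movd/pshufd below then broadcast that
            // dword across the whole XMM register.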
            uint l = cast(ubyte)value * 0x01010101;

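            // OR the pointers together and test the low four bits: if either
            // operand is not 16-byte aligned, take the movdqu (unaligned)
            // path, otherwise the faster movdqa (aligned) one.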
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

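            // 0x0101 replicates the byte into both bytes of the low word;
            // pshufw then broadcasts that word across MM4.  (pshufw is an
            // SSE-era extension to MMX, assumed available whenever this mmx
            // path is taken.)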
            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;

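                // emms empties the MMX state (x87 tag word) so later
                // floating-point code starts with a clean FP stack.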
                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat a normal 32-bit CPU the same way as we
         * do the SIMD units, with unrolled asm.  There aren't enough
         * registers, really.
         */
        else
        if (a.length >= 4)
        {

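            // Process four bytes per iteration: load them as two 16-bit
            // words (DX, BX) and add the value to each byte through the
            // 8-bit register halves DL/DH and BL/BH.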
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;

                mov aptr, ESI;
                mov bptr, EAX;
            }

        }
    }

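    // Scalar tail: finish whatever the unrolled loops above did not cover
    // (or the whole array when no asm path was taken).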
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    debug(PRINTF) printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        // trying to be fair and treat a normal 32-bit CPU the same way as we do the SIMD units, with unrolled asm; there aren't enough registers, really
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

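            // The result here is value - b[]; psubb overwrites its first
            // operand, so each loop iteration below re-copies the broadcast
            // value from XMM4 (or MM4) into scratch registers before
            // subtracting.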
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI   -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI   -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI   -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }

    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    debug(PRINTF) printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte)value * 0x01010101;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte)value * 0x0101;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    debug(PRINTF) printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2 && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;

(File truncated here; the remainder is available in the repository linked above.)