/innative-env/memcpy.x86.asm

https://github.com/innative-sdk/innative · Assembly · 725 lines · 464 code · 94 blank · 167 comment · 1 complexity · 3d6a803d788cd6a5ece8e31d7ee0bf40 MD5 · raw file

  1. page ,132
  2. title _innative_internal_env_memcpy - Copy source memory bytes to destination
  3. ;***
  4. ;memcpy.asm - contains memcpy and memmove routines
  5. ;
  6. ; Copyright (c) Microsoft Corporation. All rights reserved.
  7. ;
  8. ;Purpose:
  9. ; memcpy() copies a source memory buffer to a destination buffer.
  10. ; Overlapping buffers are not treated specially, so propagation may occur.
  11. ; memmove() copies a source memory buffer to a destination buffer.
  12. ; Overlapping buffers are treated specially, to avoid propagation.
  13. ;
  14. ;*******************************************************************************
  15. .xlist
  16. include vcruntime.x86.inc
  17. .list
  18. .xmm
  19. M_EXIT macro ; return sequence shared by every exit path of the copy routine
  20. ret ; _cdecl return: plain ret with no operand, so the arguments stay on the stack for the caller
  21. endm ; M_EXIT
  22. PALIGN_memcpy macro d ; d = byte shift passed to palignr; the macro is expanded below with 12, 8 and 4
  23. MovPalign&d&: ; entry: esi = src, edi = dst (stored to with movdqa), ecx = byte count
  24. movdqa xmm1,xmmword ptr [esi-d] ; prime the pipeline with the block that starts d bytes before src
  25. lea esi, byte ptr [esi-d] ; step esi back by d so the aligned loads below line up
  26. align @WordSize
  27. PalignLoop&d&: ; each pass copies 30h bytes; xmm1 carries the previous block into the next pass
  28. movdqa xmm3,xmmword ptr [esi+10h] ; load block 1
  29. sub ecx,30h ; account for the 30h bytes written by this pass
  30. movdqa xmm0,xmmword ptr [esi+20h] ; load block 2
  31. movdqa xmm5,xmmword ptr [esi+30h] ; load block 3
  32. lea esi, xmmword ptr [esi+30h] ; advance the source by 30h
  33. cmp ecx,30h ; sets the flags consumed by the jae at the bottom of the loop
  34. movdqa xmm2,xmm3 ; keep an unshifted copy of block 1 for the next palignr
  35. palignr xmm3,xmm1,d ; xmm3 = bytes d..d+15 of the pair xmm3:xmm1
  36. movdqa xmmword ptr [edi],xmm3 ; store output block 0
  37. movdqa xmm4,xmm0 ; keep an unshifted copy of block 2
  38. palignr xmm0,xmm2,d ; combine block 2 with block 1
  39. movdqa xmmword ptr [edi+10h],xmm0 ; store output block 1
  40. movdqa xmm1,xmm5 ; block 3 becomes the carried-in block of the next pass
  41. palignr xmm5,xmm4,d ; combine block 3 with block 2
  42. movdqa xmmword ptr [edi+20h],xmm5 ; store output block 2
  43. lea edi, xmmword ptr [edi+30h] ; advance the destination by 30h (lea leaves the flags alone)
  44. jae PalignLoop&d& ; loop while at least 30h bytes remain (flags from the cmp above)
  45. lea esi, xmmword ptr [esi+d] ; undo the initial step back so esi is the true source position again
  46. endm ; PALIGN_memcpy
  47. CODESEG
  48. extrn _innative_internal_env__isa_enabled:dword ; bit-tested against __ISA_AVAILABLE_SSE2 to pick the XMM copy paths
  49. extrn _innative_internal_env__favor:dword ; bit-tested against __FAVOR_ENFSTRG and __FAVOR_ATOM to pick a copy strategy
  50. page
  51. ;***
  52. ;memcpy - Copy source buffer to destination buffer
  53. ;
  54. ;Purpose:
  55. ; memcpy() copies a source memory buffer to a destination memory buffer.
  56. ; This routine does NOT recognize overlapping buffers, and thus can lead
  57. ; to propagation.
  58. ; For cases where propagation must be avoided, memmove() must be used.
  59. ;
  60. ; Algorithm:
  61. ;
  62. ; Same as memmove. See Below
  63. ;
  64. ;
  65. ;memmove - Copy source buffer to destination buffer
  66. ;
  67. ;Purpose:
  68. ; memmove() copies a source memory buffer to a destination memory buffer.
  69. ; This routine recognizes overlapping buffers to avoid propagation.
  70. ; For cases where propagation is not a problem, memcpy() can be used.
  71. ;
  72. ; Algorithm:
  73. ;
  74. ; void * memmove(void * dst, void * src, size_t count)
  75. ; {
  76. ; void * ret = dst;
  77. ;
  78. ; if (dst <= src || dst >= (src + count)) {
  79. ; /*
  80. ; * Non-Overlapping Buffers
  81. ; * copy from lower addresses to higher addresses
  82. ; */
  83. ; while (count--)
  84. ; *dst++ = *src++;
  85. ; }
  86. ; else {
  87. ; /*
  88. ; * Overlapping Buffers
  89. ; * copy from higher addresses to lower addresses
  90. ; */
  91. ; dst += count - 1;
  92. ; src += count - 1;
  93. ;
  94. ; while (count--)
  95. ; *dst-- = *src--;
  96. ; }
  97. ;
  98. ; return(ret);
  99. ; }
  100. ;
  101. ;
  102. ;Entry:
  103. ; void *dst = pointer to destination buffer
  104. ; const void *src = pointer to source buffer
  105. ; size_t count = number of bytes to copy
  106. ;
  107. ;Exit:
  108. ; Returns a pointer to the destination buffer in EAX
  109. ;
  110. ;Uses:
  111. ; ECX, EDX (the SSE2 paths also use XMM0-XMM7)
  112. ;
  113. ;Exceptions:
  114. ;*******************************************************************************
  115. ifdef MEM_MOVE
  116. _MEM_ equ <_innative_internal_env_memmove> ; MEM_MOVE defined: the procedure below is assembled under the memmove name
  117. else ; MEM_MOVE
  118. _MEM_ equ <_innative_internal_env_memcpy> ; otherwise the same procedure is assembled under the memcpy name
  119. endif ; MEM_MOVE
  120. % public _MEM_
  121. _MEM_ proc \
  122. dst:ptr byte, \
  123. src:ptr byte, \
  124. count:IWORD
  125. ; Copies count bytes from src to dst and returns the original dst in eax.
  126. ; The arguments are read straight off the stack: after the two pushes below,
  127. ; dst = [esp+0Ch], src = [esp+10h], count = [esp+14h].
  128. OPTION PROLOGUE:NONE, EPILOGUE:NONE ; no assembler-generated frame; esi/edi are saved and restored by hand
  129. push edi ; save edi
  130. push esi ; save esi
  131. ; size param/4 prolog byte #reg saved
  132. .FPO ( 0, 3 , $-_MEM_ , 2, 0, 0 )
  133. mov esi,[esp + 010h] ; esi = source
  134. mov ecx,[esp + 014h] ; ecx = number of bytes to move
  135. mov edi,[esp + 0Ch] ; edi = dest
  136. ;
  137. ; Check for overlapping buffers:
  138. ; If (dst <= src) Or (dst >= src + Count) Then
  139. ; Do normal (Upwards) Copy
  140. ; Else
  141. ; Do Downwards Copy to avoid propagation
  142. ;
  143. mov eax,ecx ; eax = byte count
  144. mov edx,ecx ; edx = byte count
  145. add eax,esi ; eax = point past source end
  146. cmp edi,esi ; dst <= src ?
  147. jbe short CopyUp ; yes: copy toward higher addresses
  148. cmp edi,eax ; dst < (src + count) ?
  149. jb CopyDown ; overlap: copy toward lower addresses
  150. ;
  151. ; Copy toward higher addresses (dst <= src, or dst lies at or past the source end).
  152. ;
  153. CopyUp:
  154. cmp ecx, 020h
  155. jb CopyUpDwordMov ; size smaller than 32 bytes, use dwords
  156. cmp ecx, 080h
  157. jae CopyUpLargeMov ; 128 bytes or more: try Enhanced Fast Strings first
  158. bt _innative_internal_env__isa_enabled, __ISA_AVAILABLE_SSE2
  159. jc XmmCopySmallTest ; 32..127 bytes with SSE2: unaligned 32-byte XMM loop
  160. jmp Dword_align ; 32..127 bytes without SSE2: dword-align dst, then copy dwords
  161. CopyUpLargeMov:
  162. bt _innative_internal_env__favor, __FAVOR_ENFSTRG ; check if Enhanced Fast Strings is supported
  163. jnc CopyUpSSE2Check ; if not, check for SSE2 support
  164. rep movsb ; copy all ecx bytes with a single string instruction
  165. mov eax,[esp + 0Ch] ; return original destination pointer
  166. pop esi
  167. pop edi
  168. M_EXIT
  169. ;
  170. ; Check if source and destination are equally aligned.
  171. ;
  172. CopyUpSSE2Check:
  173. mov eax,edi
  174. xor eax,esi ; eax = bits in which the two addresses differ
  175. test eax,15
  176. jne AtomChk ; low 4 address bits differ: src and dst cannot both be 16-byte aligned
  177. bt _innative_internal_env__isa_enabled, __ISA_AVAILABLE_SSE2
  178. jc XmmCopy ; yes, go SSE2 copy (params already set)
  179. AtomChk:
  180. ; Is the __FAVOR_ATOM bit set?
  181. bt _innative_internal_env__favor, __FAVOR_ATOM
  182. jnc Dword_align ; no,jump
  183. ; check if dst is 4 byte aligned
  184. test edi, 3
  185. jne Dword_align ; dst is not dword aligned: take the generic dword path
  186. ; check if src is 4 byte aligned
  187. test esi, 3
  188. jne Dword_align_Ok ; dst is dword aligned but src is not: go straight to rep movsd
  189. ; A software pipelining vectorized memcpy loop using PALIGN instructions
  190. ; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
  191. ; 4 byte align -> 12 byte copy, 8 byte align -> 8 byte copy, 12 byte align -> 4 byte copy
  192. PalignHead4:
  193. bt edi, 2 ; CF = bit 2 of dst
  194. jae PalignHead8 ; bit clear: no 4-byte head copy needed
  195. mov eax, dword ptr [esi]
  196. sub ecx, 4
  197. lea esi, byte ptr [esi+4]
  198. mov dword ptr [edi], eax
  199. lea edi, byte ptr [edi+4]
  200. PalignHead8:
  201. bt edi, 3 ; CF = bit 3 of dst
  202. jae PalignLoop ; bit clear: no 8-byte head copy needed
  203. movq xmm1, qword ptr [esi]
  204. sub ecx, 8
  205. lea esi, byte ptr [esi+8]
  206. movq qword ptr [edi], xmm1
  207. lea edi, byte ptr [edi+8]
  208. ;(2) Use SSE palign loop
  209. PalignLoop:
  210. test esi, 7
  211. je MovPalign8 ; low three bits of src are zero: use the shift-by-8 loop
  212. bt esi, 3
  213. jae MovPalign4 ; bit 3 of src clear: use the shift-by-4 loop
  214. PALIGN_memcpy 12
  215. jmp PalignTail
  216. PALIGN_memcpy 8
  217. jmp PalignTail
  218. PALIGN_memcpy 4
  219. ;(3) Copy the trailing bytes.
  220. PalignTail:
  221. cmp ecx,10h
  222. jb PalignTail4 ; fewer than 16 bytes left
  223. movdqu xmm1,xmmword ptr [esi] ; unaligned load, aligned store
  224. sub ecx, 10h
  225. lea esi, xmmword ptr [esi+10h]
  226. movdqa xmmword ptr [edi],xmm1
  227. lea edi, xmmword ptr [edi+10h]
  228. jmp PalignTail
  229. PalignTail4:
  230. bt ecx, 2 ; is the 4s bit of the remaining length set?
  231. jae PalignTail8
  232. mov eax, dword ptr [esi]
  233. sub ecx,4
  234. lea esi, byte ptr [esi+4]
  235. mov dword ptr [edi], eax
  236. lea edi, byte ptr [edi+4]
  237. PalignTail8:
  238. bt ecx, 3 ; is the 8s bit of the remaining length set?
  239. jae PalignTailLE3
  240. movq xmm1, qword ptr [esi]
  241. sub ecx,8
  242. lea esi, byte ptr [esi+8]
  243. movq qword ptr [edi], xmm1
  244. lea edi, byte ptr [edi+8]
  245. PalignTailLE3:
  246. mov eax, dword ptr TrailingUpVec[ecx*4] ; ecx = 0..3 bytes left; fetch the matching tail handler
  247. jmp eax
  248. ; The algorithm for forward moves is to align the destination to a dword
  249. ; boundary and so we can move dwords with an aligned destination. This
  250. ; occurs in 3 steps.
  251. ;
  252. ; - move x = ((4 - Dest & 3) & 3) bytes
  253. ; - move y = ((L-x) >> 2) dwords
  254. ; - move (L - x - y*4) bytes
  255. ;
  256. Dword_align:
  257. test edi,11b ; check if destination is dword aligned
  258. jz short Dword_align_Ok ; already dword aligned: skip the byte loop
  259. Dword_up_align_loop: ; copy single bytes until the destination is dword aligned
  260. mov al, byte ptr [esi]
  261. mov byte ptr [edi], al
  262. dec ecx
  263. add esi, 1
  264. add edi, 1
  265. test edi, 11b
  266. jnz Dword_up_align_loop
  267. Dword_align_Ok:
  268. mov edx, ecx ; edx = remaining byte count, kept for the trailing-byte dispatch
  269. cmp ecx, 32
  270. jb CopyUpDwordMov ; fewer than 32 bytes left: use the simple dword/byte loops
  271. shr ecx,2 ; ecx = dword count
  272. rep movsd ; move all of our dwords
  273. and edx,11b ; trailing byte count
  274. jmp dword ptr TrailingUpVec[edx*4] ; process trailing bytes
  275. ;
  276. ; Code to do optimal memory copies for non-dword-aligned destinations.
  277. ;
  278. ; The following length check is done for two reasons:
  279. ;
  280. ; 1. to ensure that the actual move length is greater than any possible
  281. ; alignment move, and
  282. ;
  283. ; 2. to skip the multiple move logic for small moves where it would
  284. ; be faster to move the bytes with one instruction.
  285. ;
  286. align @WordSize
  287. ByteCopyUp: ; NOTE(review): no branch in this procedure targets this label, and +16 indexes past the 4-entry table - confirm it is a leftover
  288. jmp dword ptr TrailingUpVec[ecx*4+16] ; process just bytes
  289. ;-----------------------------------------------------------------------------
  290. align @WordSize
  291. TrailingUpVec dd TrailingUp0, TrailingUp1, TrailingUp2, TrailingUp3 ; tail handlers indexed by the 0..3 leftover bytes
  292. align @WordSize
  293. TrailingUp0:
  294. mov eax,[esp + 0Ch] ; return original destination pointer
  295. pop esi ; restore esi
  296. pop edi ; restore edi
  297. ; spare
  298. M_EXIT
  299. align @WordSize
  300. TrailingUp1:
  301. mov al,[esi] ; get byte from source
  302. ; spare
  303. mov [edi],al ; put byte in destination
  304. mov eax,[esp + 0Ch] ; return original destination pointer
  305. pop esi ; restore esi
  306. pop edi ; restore edi
  307. M_EXIT
  308. align @WordSize
  309. TrailingUp2:
  310. mov al,[esi] ; get first byte from source
  311. ; spare
  312. mov [edi],al ; put first byte into destination
  313. mov al,[esi+1] ; get second byte from source
  314. mov [edi+1],al ; put second byte into destination
  315. mov eax,[esp + 0Ch] ; return original destination pointer
  316. pop esi ; restore esi
  317. pop edi ; restore edi
  318. M_EXIT
  319. align @WordSize
  320. TrailingUp3:
  321. mov al,[esi] ; get first byte from source
  322. ; spare
  323. mov [edi],al ; put first byte into destination
  324. mov al,[esi+1] ; get second byte from source
  325. mov [edi+1],al ; put second byte into destination
  326. mov al,[esi+2] ; get third byte from source
  327. mov [edi+2],al ; put third byte into destination
  328. mov eax,[esp + 0Ch] ; return original destination pointer
  329. pop esi ; restore esi
  330. pop edi ; restore edi
  331. M_EXIT
  332. ;-----------------------------------------------------------------------------
  333. ;-----------------------------------------------------------------------------
  334. ;-----------------------------------------------------------------------------
  335. ; Copy down to avoid propagation in overlapping buffers.
  336. align @WordSize
  337. CopyDown:
  338. ; size check: for < 32 bytes, use dwords without checking for alignment
  339. lea esi, [esi+ecx] ; esi, edi pointing to the end of the buffer
  340. lea edi, [edi+ecx]
  341. cmp ecx, 32
  342. jb CopyDownSmall
  343. bt _innative_internal_env__isa_enabled, __ISA_AVAILABLE_SSE2
  344. jc XmmMovLargeAlignTest ; SSE2 available: use the XMM backward copy
  345. ; See if the destination end (edi = dst + count here) is dword aligned
  346. test edi,11b ; Test if dword aligned
  347. jz CopyDownAligned ; already dword aligned: skip the byte loop
  348. CopyDownNotAligned:
  349. mov edx,edi ; get destination offset
  350. and edx, 11b ; edx = 1..3 bytes to copy before edi is dword aligned
  351. sub ecx, edx ; take those bytes off the remaining count
  352. CopyDownAlignLoop:
  353. mov al, byte ptr [esi-1]
  354. mov byte ptr[edi-1], al
  355. dec esi
  356. dec edi
  357. sub edx, 1
  358. jnz CopyDownAlignLoop
  359. CopyDownAligned:
  360. cmp ecx,32 ; test if small enough for unwind copy
  361. jb CopyDownSmall ; if so, then jump
  362. mov edx, ecx
  363. shr ecx,2 ; shift down to dword count
  364. and edx,11b ; trailing byte count
  365. sub esi, 4
  366. sub edi, 4 ; setting up src, dest registers to address the last dword
  367. std ; set direction flag
  368. rep movsd ; move all of dwords at once
  369. cld ; clear direction flag back
  370. jmp dword ptr TrailingDownVec[edx*4]; process trailing bytes
  371. ;-----------------------------------------------------------------------------
  372. align @WordSize
  373. TrailingDownVec dd TrailingDown0, TrailingDown1, TrailingDown2, TrailingDown3 ; tail handlers indexed by the 0..3 leftover bytes
  374. align @WordSize
  375. TrailingDown0:
  376. mov eax,[esp + 0Ch] ; return original destination pointer
  377. ; spare
  378. pop esi ; restore esi
  379. pop edi ; restore edi
  380. M_EXIT
  381. align @WordSize
  382. TrailingDown1:
  383. mov al,[esi+3] ; get byte from source (esi/edi sit 4 below the last dword moved)
  384. ; spare
  385. mov [edi+3],al ; put byte in destination
  386. mov eax,[esp + 0Ch] ; return original destination pointer
  387. pop esi ; restore esi
  388. pop edi ; restore edi
  389. M_EXIT
  390. align @WordSize
  391. TrailingDown2:
  392. mov al,[esi+3] ; get first byte from source
  393. ; spare
  394. mov [edi+3],al ; put first byte into destination
  395. mov al,[esi+2] ; get second byte from source
  396. mov [edi+2],al ; put second byte into destination
  397. mov eax,[esp + 0Ch] ; return original destination pointer
  398. pop esi ; restore esi
  399. pop edi ; restore edi
  400. M_EXIT
  401. align @WordSize
  402. TrailingDown3:
  403. mov al,[esi+3] ; get first byte from source
  404. ; spare
  405. mov [edi+3],al ; put first byte into destination
  406. mov al,[esi+2] ; get second byte from source
  407. mov [edi+2],al ; put second byte into destination
  408. mov al,[esi+1] ; get third byte from source
  409. mov [edi+1],al ; put third byte into destination
  410. mov eax,[esp + 0Ch] ; return original destination pointer
  411. pop esi ; restore esi
  412. pop edi ; restore edi
  413. M_EXIT
  414. ; Copy overlapping buffers using XMM registers
  415. XmmMovLargeAlignTest:
  416. test edi, 0Fh ; check if it's 16-byte aligned
  417. jz XmmMovLargeLoop
  418. XmmMovAlignLoop: ; copy single bytes downward until edi is 16-byte aligned
  419. dec ecx
  420. dec esi
  421. dec edi
  422. mov al, [esi]
  423. mov [edi], al
  424. test edi, 0Fh
  425. jnz XmmMovAlignLoop
  426. XmmMovLargeLoop:
  427. cmp ecx, 128
  428. jb XmmMovSmallTest
  429. sub esi, 128 ; step both pointers down to the start of the next 128-byte block
  430. sub edi, 128
  431. movdqu xmm0, xmmword ptr[esi] ; load all 128 bytes before any store is issued
  432. movdqu xmm1, xmmword ptr[esi+16]
  433. movdqu xmm2, xmmword ptr[esi+32]
  434. movdqu xmm3, xmmword ptr[esi+48]
  435. movdqu xmm4, xmmword ptr[esi+64]
  436. movdqu xmm5, xmmword ptr[esi+80]
  437. movdqu xmm6, xmmword ptr[esi+96]
  438. movdqu xmm7, xmmword ptr[esi+112]
  439. movdqu xmmword ptr[edi], xmm0
  440. movdqu xmmword ptr[edi+16], xmm1
  441. movdqu xmmword ptr[edi+32], xmm2
  442. movdqu xmmword ptr[edi+48], xmm3
  443. movdqu xmmword ptr[edi+64], xmm4
  444. movdqu xmmword ptr[edi+80], xmm5
  445. movdqu xmmword ptr[edi+96], xmm6
  446. movdqu xmmword ptr[edi+112], xmm7
  447. sub ecx, 128
  448. test ecx, 0FFFFFF80h ; any bit at or above 128 still set?
  449. jnz XmmMovLargeLoop
  450. XmmMovSmallTest:
  451. cmp ecx, 32 ; if less than 32, use dwords
  452. jb CopyDownSmall
  453. XmmMovSmallLoop: ; copy 32 bytes per pass, moving downward
  454. sub esi, 32
  455. sub edi, 32
  456. movdqu xmm0, xmmword ptr[esi]
  457. movdqu xmm1, xmmword ptr[esi+16]
  458. movdqu xmmword ptr[edi], xmm0
  459. movdqu xmmword ptr[edi+16], xmm1
  460. sub ecx, 32
  461. test ecx, 0FFFFFFE0h ; any bit at or above 32 still set?
  462. jnz XmmMovSmallLoop
  463. CopyDownSmall:
  464. test ecx, 0FFFFFFFCh ; mask the bytes
  465. jz CopyDownByteTest ; fewer than 4 bytes left: no whole dword to copy
  466. CopyDownDwordLoop:
  467. sub edi, 4
  468. sub esi, 4
  469. mov eax, [esi]
  470. mov [edi], eax
  471. sub ecx, 4
  472. test ecx, 0FFFFFFFCh
  473. jnz CopyDownDwordLoop
  474. CopyDownByteTest:
  475. test ecx, ecx
  476. jz CopyDownReturn
  477. CopyDownByteLoop:
  478. sub edi, 1
  479. sub esi, 1
  480. mov al, [esi]
  481. mov [edi], al
  482. sub ecx, 1
  483. jnz CopyDownByteLoop
  484. CopyDownReturn:
  485. mov eax,[esp + 0Ch] ; return original destination pointer
  486. ; spare
  487. pop esi ; restore esi
  488. pop edi ; restore edi
  489. M_EXIT
  490. ; Using XMM registers for the upward copy (reached from CopyUpSSE2Check)
  491. align 16
  492. XmmCopy:
  493. mov eax, esi
  494. and eax, 0Fh
  495. ; eax = src and dst alignment (src mod 16)
  496. test eax, eax
  497. jne XmmCopyUnaligned
  498. ; in:
  499. ; edi = dst (16 byte aligned)
  500. ; esi = src (16 byte aligned)
  501. ; ecx = len is >= (128 - head alignment bytes)
  502. ; do block copy using SSE2 stores
  503. XmmCopyAligned:
  504. mov edx, ecx
  505. and ecx, 7Fh
  506. shr edx, 7
  507. je XmmCopySmallTest ; ZF from the shift: no whole 128-byte block to copy
  508. ; edx = loop count (number of 128-byte blocks)
  509. ; ecx = remaining copy length after those blocks (len mod 128)
  510. ; Copy greater than or equal to 128 bytes using XMM registers
  511. align 16
  512. XmmCopyLargeLoop:
  513. movdqa xmm0,xmmword ptr [esi]
  514. movdqa xmm1,xmmword ptr [esi + 10h]
  515. movdqa xmm2,xmmword ptr [esi + 20h]
  516. movdqa xmm3,xmmword ptr [esi + 30h]
  517. movdqa xmmword ptr [edi],xmm0
  518. movdqa xmmword ptr [edi + 10h],xmm1
  519. movdqa xmmword ptr [edi + 20h],xmm2
  520. movdqa xmmword ptr [edi + 30h],xmm3
  521. movdqa xmm4,xmmword ptr [esi + 40h]
  522. movdqa xmm5,xmmword ptr [esi + 50h]
  523. movdqa xmm6,xmmword ptr [esi + 60h]
  524. movdqa xmm7,xmmword ptr [esi + 70h]
  525. movdqa xmmword ptr [edi + 40h],xmm4
  526. movdqa xmmword ptr [edi + 50h],xmm5
  527. movdqa xmmword ptr [edi + 60h],xmm6
  528. movdqa xmmword ptr [edi + 70h],xmm7
  529. lea esi,[esi + 80h]
  530. lea edi,[edi + 80h]
  531. dec edx
  532. jne XmmCopyLargeLoop
  533. ; Copy fewer than 128 bytes
  534. XmmCopySmallTest:
  535. test ecx, ecx
  536. je CopyUpReturn
  537. ; ecx = length (< 128 bytes)
  538. mov edx, ecx
  539. shr edx, 5 ; edx = number of whole 32-byte blocks
  540. test edx, edx
  541. je CopyUpDwordMov ; none: go copy the last dwords and bytes
  542. ; at least 32 bytes remain: loop copying 32 bytes per iteration
  543. ; edx - loop count
  544. ; edi = dst
  545. ; esi = src
  546. align 16
  547. XmmCopySmallLoop:
  548. movdqu xmm0, xmmword ptr [esi]
  549. movdqu xmm1, xmmword ptr [esi + 10h]
  550. movdqu xmmword ptr [edi], xmm0
  551. movdqu xmmword ptr [edi + 10h], xmm1
  552. lea esi, [esi + 20h]
  553. lea edi, [edi + 20h]
  554. dec edx
  555. jne XmmCopySmallLoop
  556. CopyUpDwordMov:
  557. ; last 0-31 bytes: copy whole dwords first, then any leftover bytes one at a time
  558. ; esi = src
  559. ; eax = scratch (holds the remaining length across the dword loop)
  560. ; edi = dst
  561. ; ecx = remaining len (only its low 5 bits are still to be copied)
  562. and ecx, 1Fh
  563. je CopyUpReturn
  564. CopyUpDwordTest:
  565. mov eax, ecx ; save remaining len and calc number of dwords
  566. shr ecx, 2
  567. je CopyUpByteTest ; if none try bytes
  568. CopyUpDwordLoop:
  569. mov edx, dword ptr [esi]
  570. mov dword ptr [edi], edx
  571. add edi, 4
  572. add esi, 4
  573. sub ecx, 1
  574. jne CopyUpDwordLoop
  575. CopyUpByteTest:
  576. mov ecx, eax
  577. and ecx, 03h ; ecx = leftover byte count (0..3)
  578. je CopyUpReturn ; if none return
  579. CopyUpByteLoop:
  580. mov al, byte ptr [esi]
  581. mov byte ptr [edi], al
  582. inc esi
  583. inc edi
  584. dec ecx
  585. jne CopyUpByteLoop
  586. align 16
  587. CopyUpReturn:
  588. ; return dst
  589. mov eax,[esp + 0Ch] ; return original destination pointer
  590. pop esi
  591. pop edi
  592. M_EXIT
  593. ; src and dst share the same alignment but are not 16 byte aligned
  594. align 16
  595. XmmCopyUnaligned:
  596. ; copy the first 1-15 bytes to align both src and dst up to the nearest 16-byte boundary:
  597. ; in
  598. ; esi = src
  599. ; edi = dst
  600. ; eax = src and dst alignment
  601. ; ecx = length
  602. mov edx, 010h
  603. sub edx, eax ; calculate number of bytes to get it aligned
  604. sub ecx, edx ; calc new length and save it
  605. push ecx
  606. mov eax, edx ; save alignment byte count for dwords
  607. mov ecx, eax ; set ecx to rep count
  608. and ecx, 03h ; ecx = head bytes to copy singly (0..3)
  609. je XmmAlignDwordTest ; if no bytes go do dwords
  610. XmmAlignByte:
  611. mov dl, byte ptr [esi] ; move the bytes
  612. mov byte ptr [edi], dl
  613. inc esi ; increment the addresses
  614. inc edi
  615. dec ecx ; decrement the counter
  616. jne XmmAlignByte
  617. XmmAlignDwordTest:
  618. shr eax, 2 ; get dword count
  619. je XmmAlignAdjustCnt ; if none go to main loop
  620. XmmAlignDwordLoop:
  621. mov edx, dword ptr [esi] ; move the dwords
  622. mov dword ptr [edi], edx
  623. lea esi, [esi+4] ; increment the addresses
  624. lea edi, [edi+4]
  625. dec eax ; decrement the counter
  626. jne XmmAlignDwordLoop
  627. XmmAlignAdjustCnt:
  628. pop ecx ; retrieve the adjusted length
  629. jmp XmmCopyAligned
  630. _MEM_ endp
  631. end