/arch/arm/cpu/arm64/cpu_memcpy.S

https://gitlab.com/hussain6001/xvisor

/**
 * Copyright (c) 2014 Anup Patel.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * @file cpu_memcpy.S
 * @author Anup Patel (anup@brainfault.org)
 * @brief Low-level implementation of memcpy function
 *
 * This source code has been largely adapted from Linux source:
 * linux-xxx/arch/arm64/lib/memcpy.S
 *
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * The original code is licensed under the GPL.
 */
/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
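
/*
 * Note: dst (x6) is used as the working destination pointer so that
 * dstin (x0) can be returned unchanged, as memcpy requires. The A/B/C/D
 * register pairs hold up to 64 bytes of in-flight data in the bulk-copy
 * loop below.
 */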
	.global	memcpy
memcpy:
	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15
	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This eliminates the risk of overwriting the source data
	 * when the distance between src and dst is less than 16. The
	 * memory accesses here are aligned.
	 */
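	/*
	 * tmp2 = (-src) & 15 is the number of bytes needed to 16-byte
	 * align src; its bits 0, 1, 2 and 3 select the 1-, 2-, 4- and
	 * 8-byte copies below, which together advance src by exactly
	 * tmp2 bytes.
	 */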
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
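	/* Fewer than 64 bytes remain here, so fall through into .Ltail63. */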
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
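	/*
	 * tmp1 is 0x30, 0x20 or 0x10 here: fall through to copy 48
	 * bytes, or enter at label 1 (32 bytes) or label 2 (16 bytes).
	 */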
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
1:
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
2:
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores so that
	 * memory is accessed in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) after
	 * backing src up to an aligned address, as the original Cortex
	 * memcpy does. If the original scheme were kept here, memmove
	 * would have to guarantee that src is at least 16 bytes above
	 * dst, otherwise some source data would be overwritten when
	 * memmove calls memcpy directly. To keep memmove simple and to
	 * decouple memcpy from memmove, the original scheme was dropped.
	 */
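	/*
	 * Bits 3..0 of count select the final 8-, 4-, 2- and 1-byte
	 * copies, covering the remaining 0..15 bytes.
	 */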
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]

.Lexitfunc:
	ret
.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
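	/* count is n - 128 here (negative); .Ltail63 uses only its low 6 bits. */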
	ldp	A_l, A_h, [src], #16
	stp	A_l, A_h, [dst], #16
	ldp	B_l, B_h, [src], #16
	ldp	C_l, C_h, [src], #16
	stp	B_l, B_h, [dst], #16
	stp	C_l, C_h, [dst], #16
	ldp	D_l, D_h, [src], #16
	stp	D_l, D_h, [dst], #16

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	6
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp	A_l, A_h, [src], #16
	ldp	B_l, B_h, [src], #16
	ldp	C_l, C_h, [src], #16
	ldp	D_l, D_h, [src], #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores
	 * of the previously loaded 64 bytes.
	 */
	stp	A_l, A_h, [dst], #16
	ldp	A_l, A_h, [src], #16
	stp	B_l, B_h, [dst], #16
	ldp	B_l, B_h, [src], #16
	stp	C_l, C_h, [dst], #16
	ldp	C_l, C_h, [src], #16
	stp	D_l, D_h, [dst], #16
	ldp	D_l, D_h, [src], #16
	subs	count, count, #64
	b.ge	1b
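	/*
	 * The loop exits with the last 64 loaded bytes still in A/B/C/D,
	 * so drain them here before handling the tail.
	 */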
	stp	A_l, A_h, [dst], #16
	stp	B_l, B_h, [dst], #16
	stp	C_l, C_h, [dst], #16
	stp	D_l, D_h, [dst], #16

	tst	count, #0x3f
	b.ne	.Ltail63
	ret