/arch/xtensa/lib/checksum.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>
/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *      a2 = buf
 *      a3 = len
 *      a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
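/*
 * Illustrative sketch only (not part of this file or of the build): how a
 * 32-bit partial sum returned by csum_partial is typically folded down to
 * the final 16-bit Internet checksum.  fold_csum() below is a hypothetical
 * helper written for this comment, not a kernel interface.
 *
 *      static inline unsigned short fold_csum(unsigned int sum)
 *      {
 *              sum = (sum & 0xffff) + (sum >> 16);     // add carries into the low half
 *              sum = (sum & 0xffff) + (sum >> 16);     // absorb the carry of that add
 *              return (unsigned short)~sum;            // ones' complement of the result
 *      }
 *
 *      unsigned int partial = csum_partial(buf, len, 0);
 *      unsigned short check = fold_csum(partial);
 */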
/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)        \
        add     sum, sum, val   ; \
        bgeu    sum, val, 99f   ; \
        addi    sum, sum, 1     ; \
99:                             ;
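/*
 * Illustrative sketch only: a minimal C model of what ONES_ADD computes,
 * assuming 32-bit unsigned arithmetic.  The value is added into the running
 * sum and, if the addition wrapped around, the end-around carry is added
 * back in -- one step of ones'-complement addition.
 *
 *      static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *      {
 *              sum += val;
 *              if (sum < val)          // unsigned wrap-around == carry out
 *                      sum += 1;       // fold the carry back in
 *              return sum;
 *      }
 */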
.text
ENTRY(csum_partial)

        /*
         * Experiments with Ethernet and SLIP connections show that buf
         * is aligned on either a 2-byte or 4-byte boundary.
         */
        abi_entry_default
        extui   a5, a2, 0, 2
        bnez    a5, 8f          /* branch if 2-byte aligned */

        /* Fall-through on common case, 4-byte alignment */
1:
        srli    a5, a3, 5       /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 5
        add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
.Loop1:
#endif
        l32i    a6, a2, 0
        l32i    a7, a2, 4
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 8
        l32i    a7, a2, 12
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 16
        l32i    a7, a2, 20
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        l32i    a6, a2, 24
        l32i    a7, a2, 28
        ONES_ADD(a4, a6)
        ONES_ADD(a4, a7)
        addi    a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop1
#endif
2:
        extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 3f
#else
        beqz    a5, 3f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop2:
#endif
        l32i    a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop2
#endif
3:
        _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
        l16ui   a6, a2, 0
        ONES_ADD(a4, a6)
        addi    a2, a2, 2
5:
        _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
6:      l8ui    a6, a2, 0
#ifdef __XTENSA_EB__
        slli    a6, a6, 8       /* load byte into bits 8..15 */
#endif
        ONES_ADD(a4, a6)
7:
        mov     a2, a4
        abi_ret_default

        /* uncommon case, buf is 2-byte aligned */
8:
        beqz    a3, 7b          /* branch if len == 0 */
        beqi    a3, 1, 6b       /* branch if len == 1 */

        extui   a5, a2, 0, 1
        bnez    a5, 8f          /* branch if 1-byte aligned */

        l16ui   a6, a2, 0       /* common case, len >= 2 */
        ONES_ADD(a4, a6)
        addi    a2, a2, 2       /* adjust buf */
        addi    a3, a3, -2      /* adjust len */
        j       1b              /* now buf is 4-byte aligned */
/* case: odd-byte aligned, len > 1
 * This case is dog slow, so don't give us an odd address.
 * (I don't think this ever happens, but just in case.)
 */
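/*
 * Illustrative sketch only: a minimal C model of one iteration of the loop
 * below on a little-endian core (the !__XTENSA_EB__ branch), assuming p is
 * the odd-aligned source pointer.  One byte, one aligned halfword and one
 * more byte are loaded separately and merged into a single 32-bit value
 * before being fed to ONES_ADD.
 *
 *      static inline unsigned int word_from_odd_le(const unsigned char *p)
 *      {
 *              unsigned int lo  = p[0];                                // bits  0.. 7
 *              unsigned int mid = *(const unsigned short *)(p + 1);    // aligned 16-bit load
 *              unsigned int hi  = p[3];
 *
 *              return lo | (mid << 8) | (hi << 24);
 *      }
 */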
8:
        srli    a5, a3, 2       /* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a5, 2f
#else
        beqz    a5, 2f
        slli    a5, a5, 2
        add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
.Loop3:
#endif
        l8ui    a6, a2, 0       /* bits 24..31 */
        l16ui   a7, a2, 1       /* bits  8..23 */
        l8ui    a8, a2, 3       /* bits  0.. 7 */
#ifdef __XTENSA_EB__
        slli    a6, a6, 24
#else
        slli    a8, a8, 24
#endif
        slli    a7, a7, 8
        or      a7, a7, a6
        or      a7, a7, a8
        ONES_ADD(a4, a7)
        addi    a2, a2, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a5, .Loop3
#endif
2:
        _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
        l8ui    a6, a2, 0
        l8ui    a7, a2, 1
#ifdef __XTENSA_EB__
        slli    a6, a6, 8
#else
        slli    a7, a7, 8
#endif
        or      a7, a7, a6
        ONES_ADD(a4, a7)
        addi    a2, a2, 2
3:
        j       5b              /* branch to handle the remaining byte */

ENDPROC(csum_partial)
/*
 * Copy from ds while checksumming, otherwise like csum_partial
 */

/*
 * unsigned int csum_partial_copy_generic(const char *src, char *dst, int len,
 *                                         int sum, int *src_err_ptr, int *dst_err_ptr)
 *      a2  = src
 *      a3  = dst
 *      a4  = len
 *      a5  = sum
 *      a6  = src_err_ptr
 *      a7  = dst_err_ptr
 *      a8  = temp
 *      a9  = temp
 *      a10 = temp
 *      a11 = original len for exception handling
 *      a12 = original dst for exception handling
 *
 * This function is optimized for 4-byte aligned addresses.  Other
 * alignments work, but not nearly as efficiently.
 */
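/*
 * Illustrative sketch only: how the calling contract described above looks
 * from C, assuming caller-provided error slots.  On success the routine
 * behaves like memcpy plus csum_partial; if a load from src or a store to
 * dst faults, the fixup code at the end of this file reports the failure by
 * writing -EFAULT through the corresponding error pointer.
 *
 *      int src_err = 0, dst_err = 0;
 *      unsigned int sum;
 *
 *      sum = csum_partial_copy_generic(src, dst, len, 0, &src_err, &dst_err);
 *      if (src_err || dst_err)
 *              return -EFAULT;         // a fault was reported via the error slots
 */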
ENTRY(csum_partial_copy_generic)

        abi_entry_default
        mov     a12, a3
        mov     a11, a4
        or      a10, a2, a3

        /* We optimize the following alignment tests for the 4-byte
         * aligned case.  Two bbsi.l instructions might seem more optimal
         * (commented out below).  However, both labels 5: and 3: are out
         * of the imm8 range, so the assembler relaxes them into
         * equivalent bbci.l, j combinations, which is actually
         * slower.
         */
        extui   a9, a10, 0, 2
        beqz    a9, 1f          /* branch if both are 4-byte aligned */
        bbsi.l  a10, 0, 5f      /* branch if one address is odd */
        j       3f              /* one address is 2-byte aligned */

/*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
/*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */

1:
        /* src and dst are both 4-byte aligned */
        srli    a10, a4, 5      /* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 2f
#else
        beqz    a10, 2f
        slli    a10, a10, 5
        add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f) l32i    a9, a2, 0
EX(10f) l32i    a8, a2, 4
EX(11f) s32i    a9, a3, 0
EX(11f) s32i    a8, a3, 4
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 8
EX(10f) l32i    a8, a2, 12
EX(11f) s32i    a9, a3, 8
EX(11f) s32i    a8, a3, 12
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 16
EX(10f) l32i    a8, a2, 20
EX(11f) s32i    a9, a3, 16
EX(11f) s32i    a8, a3, 20
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
EX(10f) l32i    a9, a2, 24
EX(10f) l32i    a8, a2, 28
EX(11f) s32i    a9, a3, 24
EX(11f) s32i    a8, a3, 28
        ONES_ADD(a5, a9)
        ONES_ADD(a5, a8)
        addi    a2, a2, 32
        addi    a3, a3, 32
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop5
#endif
2:
        extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
        extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 3f
#else
        beqz    a10, 3f
        slli    a10, a10, 2
        add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f) l32i    a9, a2, 0
EX(11f) s32i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 4
        addi    a3, a3, 4
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop6
#endif
3:
        /*
         * Control reaches this point in two cases: (1) it falls through
         * from the 4-byte-aligned case above to process, at most, one
         * 2-byte chunk; (2) it branches here from above when either src
         * or dst is 2-byte aligned, in which case all remaining bytes are
         * processed here, except for perhaps a trailing odd byte.  This
         * path is inefficient, so align your addresses to 4-byte
         * boundaries.
         *
         *      a2 = src
         *      a3 = dst
         *      a4 = len
         *      a5 = sum
         */
        srli    a10, a4, 1      /* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 4f
#else
        beqz    a10, 4f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f) l16ui   a9, a2, 0
EX(11f) s16i    a9, a3, 0
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop7
#endif
4:
        /* This section processes a possible trailing odd byte. */
        _bbci.l a4, 0, 8f       /* 1-byte chunk */

EX(10f) l8ui    a9, a2, 0
EX(11f) s8i     a9, a3, 0
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* shift byte to bits 8..15 */
#endif
        ONES_ADD(a5, a9)
8:
        mov     a2, a5
        abi_ret_default
5:
        /* Control branches to here when either src or dst is odd.  We
         * process all bytes using 8-bit accesses.  Grossly inefficient,
         * so don't feed us an odd address.
         */
        srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
        loopgtz a10, 6f
#else
        beqz    a10, 6f
        slli    a10, a10, 1
        add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f) l8ui    a9, a2, 0
EX(10f) l8ui    a8, a2, 1
EX(11f) s8i     a9, a3, 0
EX(11f) s8i     a8, a3, 1
#ifdef __XTENSA_EB__
        slli    a9, a9, 8       /* combine into a single 16-bit value */
#else                           /* for checksum computation */
        slli    a8, a8, 8
#endif
        or      a9, a9, a8
        ONES_ADD(a5, a9)
        addi    a2, a2, 2
        addi    a3, a3, 2
#if !XCHAL_HAVE_LOOPS
        blt     a2, a10, .Loop8
#endif
6:
        j       4b              /* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)
# Exception handler:
.section .fixup, "ax"
/*
 *      a6  = src_err_ptr
 *      a7  = dst_err_ptr
 *      a11 = original len for exception handling
 *      a12 = original dst for exception handling
 */
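/*
 * Illustrative sketch only: a minimal C model of the two fixup entries
 * below, assuming the register roles listed in the comment just above.  A
 * faulting source access reports -EFAULT and wipes the whole destination
 * rather than computing a partial result; a faulting destination access
 * only reports -EFAULT.
 *
 *      // source fault (label 10)
 *      *src_err_ptr = -EFAULT;
 *      for (int i = 0; i < original_len; i++)
 *              original_dst[i] = 0;    // clearing is cheaper than resuming
 *
 *      // destination fault (label 11)
 *      *dst_err_ptr = -EFAULT;
 */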
10:
        _movi   a2, -EFAULT
        s32i    a2, a6, 0       /* src_err_ptr */

        # clear the complete destination - computing the rest
        # is too much work
        movi    a2, 0
#if XCHAL_HAVE_LOOPS
        loopgtz a11, 2f
#else
        beqz    a11, 2f
        add     a11, a11, a12   /* a11 = ending address */
.Leloop:
#endif
        s8i     a2, a12, 0
        addi    a12, a12, 1
#if !XCHAL_HAVE_LOOPS
        blt     a12, a11, .Leloop
#endif
2:
        abi_ret_default

11:
        movi    a2, -EFAULT
        s32i    a2, a7, 0       /* dst_err_ptr */
        movi    a2, 0
        abi_ret_default

.previous