PageRenderTime 122ms CodeModel.GetById 29ms RepoModel.GetById 9ms app.codeStats 0ms

/webkit-efl/Source/WebCore/platform/graphics/arm/ShadowBlurNEON.cpp

https://review.tizen.org/git/
C++ | 460 lines | 425 code | 8 blank | 27 comment | 0 complexity | 5013c73cd0c376aae6943f6e084143da MD5 | raw file
Possible License(s): GPL-3.0, AGPL-3.0, GPL-2.0, MPL-2.0, JSON, WTFPL, CC-BY-SA-4.0, CC-BY-3.0, BSD-3-Clause, LGPL-2.0, MPL-2.0-no-copyleft-exception, AGPL-1.0, 0BSD, Zlib, Unlicense, BSD-2-Clause, Apache-2.0, LGPL-3.0, ISC, MIT, CC-BY-SA-3.0, CC0-1.0, LGPL-2.1
  1. /*
  2. This file includes NEON optimization codes for 3 BoxBlurs filter.
  3. It is 2.2-2.3 times faster than C implementation in ContextShadow.cpp.
  4. Copyright (C) 2011 Hyunki Baik <hyunki.baik@samsung.com>
  5. */
  6. #include "config.h"
  7. #include "ShadowBlurNEON.h"
  8. #if ENABLE(TIZEN_CONTEXTSHADOW_BLUR_NEON)
  9. #define ASSTRING(str) #str
  10. #define TOSTRING(value) ASSTRING(value)
  11. #define DMAX_OFFSET TOSTRING(0)
  12. #define DMIN_OFFSET TOSTRING(4)
  13. #define STRIDE_OFFSET TOSTRING(8)
  14. #define STRIDE_WIDTH_OFFSET TOSTRING(12)
  15. #define DELTA_OFFSET TOSTRING(16)
  16. #define SOURCE_LINE_END_OFFSET TOSTRING(20)
  17. #define DIM_OFFSET TOSTRING(24)
  18. #define REMAINING_STRIDES_OFFSET TOSTRING(28)
  19. #define STEP_COUNT_OFFSET TOSTRING(32)
  20. #define STEP_READALPHA_CONSTANTS_OFFSET TOSTRING(36)
  21. #define NL "\n"
  22. // Register allocation.
  23. #define SOURCE_R "r0"
  24. #define LIMIT_R "r1"
  25. #define SIDE1_R "r2"
  26. #define SIDE2_R "r3"
  27. #define SOURCE_END_R "r4"
  28. #define DMAX_R "r5"
  29. #define DMIN_R "r6"
  30. #define STRIDE_R "r7"
  31. #define DELTA_R "r8"
  32. #define STEP_COUNT_R "r9"
  33. #define SOURCE_LINE_END_R "r10"
  34. #define DIM_R "r11"
  35. #define STEP_R "r12"
  36. #define PIXELCOUNT_R "lr"
  37. // Alternate names.
  38. #define INVCOUNT_R LIMIT_R
  39. #define REMAINING_STRIDES_R SOURCE_LINE_END_R
  40. #define INIT_STEP_READALPHA_R LIMIT_R
  41. #define INIT_STEP_STOREALPHA_R PIXELCOUNT_R
  42. #define SIDE2_PLUS_ONE_R DMAX_R
  43. #define ALPHA_INDEX_R DMIN_R
  44. #define INIT_SUM_R PIXELCOUNT_R
  45. #define LOOP_INDEX_R PIXELCOUNT_R
  46. #define LOOP_TEMP_R SIDE1_R
  47. // NEON register allocation
  48. #define INVCOUNT_Q "q0"
  49. #define SUM_Q "q1"
  50. #define PIXEL_Q "q2"
  51. #define PIXEL_D0 "d4"
  52. #define PIXEL_D1 "d5"
  53. #define PIXEL_D00 "d4[0]"
  54. #define PIXEL_D01 "d4[1]"
  55. #define PIXEL_S1 "s9"
  56. #define PIXEL_D10 "d5[0]"
  57. #define PIXEL_S2 "s10"
  58. #define PIXEL_D11 "d5[1]"
  59. #define LOCAL_PIXEL_Q "q3"
  60. #define LOCAL_PIXEL_D0 "d6"
  61. #define LOCAL_PIXEL_D1 "d7"
  62. #define LOCAL_PIXEL_D00 "d6[0]"
  63. #define LOCAL_PIXEL_D01 "d6[1]"
  64. #define LOCAL_PIXEL_D10 "d7[0]"
  65. #define LOCAL_PIXEL_D11 "d7[1]"
  66. #define REMAINING_STRIDES_S "s16"
  67. #define STRIDE_WIDTH_S "s17"
  68. #define DMAX_S "s18"
  69. #define DMIN_S "s19"
  70. #define TEMP_Q "q5"
  71. #define ALPHA1_Q "q6"
  72. #define ALPHA1_D0 "d12"
  73. #define ALPHA1_D1 "d13"
  74. #define ALPHA2_Q "q7"
  75. #define ALPHA2_D0 "d14"
  76. #define ALPHA2_D1 "d15"
  77. #define REMAP_STEP_1_LOADALPHA_Q "d16"
  78. #define REMAP_STEP_2_LOADALPHA_Q "d17"
  79. #define REMAP_STEP_1_STOREALPHA_Q "d18"
  80. #define REMAP_STEP_2_STOREALPHA_Q "d19"
  81. #define READALPHA_RANGE "d20-d25"
  82. #define REMAP_STEP00_LOADALPHA_Q "d20"
  83. #define REMAP_STEP01_LOADALPHA_Q "d21"
  84. #define REMAP_STEP10_LOADALPHA_Q "d22"
  85. #define REMAP_STEP11_LOADALPHA_Q "d23"
  86. #define REMAP_STEP20_LOADALPHA_Q "d24"
  87. #define REMAP_STEP21_LOADALPHA_Q "d25"
  88. #define STOREALPHA_RANGE "d26-31"
  89. #define REMAP_STEP00_STOREALPHA_Q "d26"
  90. #define REMAP_STEP01_STOREALPHA_Q "d27"
  91. #define REMAP_STEP10_STOREALPHA_Q "d28"
  92. #define REMAP_STEP11_STOREALPHA_Q "d29"
  93. #define REMAP_STEP20_STOREALPHA_Q "d30"
  94. #define REMAP_STEP21_STOREALPHA_Q "d31"
  95. #define DATA_TRANSFER4(command, base) \
  96. command " " PIXEL_D00 ", [" base "], " DELTA_R NL \
  97. command " " PIXEL_D01 ", [" base "], " DELTA_R NL \
  98. command " " PIXEL_D10 ", [" base "], " DELTA_R NL \
  99. command " " PIXEL_D11 ", [" base "], " DELTA_R NL \
  100. "sub " base ", " base ", " DELTA_R ", lsl #2" NL
  101. // The number of reads depend on REMAINING_STRIDES_R, but it is always >= 1 and <= 3
  102. #define CONDITIONAL_DATA_TRANSFER4(command1, command2, base) \
  103. command1 " " PIXEL_D00 ", [" base "], " DELTA_R NL \
  104. "cmp " REMAINING_STRIDES_R ", #2" NL \
  105. command2 "cs " PIXEL_S1 ", [" base "]" NL \
  106. "add " base ", " base ", " DELTA_R NL \
  107. "cmp " REMAINING_STRIDES_R ", #3" NL \
  108. command2 "cs " PIXEL_S2 ", [" base "]" NL \
  109. "sub " base ", " base ", " DELTA_R ", lsl #1" NL
// boxBlurNeon(source /* r0 */, params /* r1 */)
//
// One box-blur invocation, processing four image strides in parallel (one
// 32-bit pixel per stride per line in the four lanes of PIXEL_Q), running
// STEP_COUNT_R blur passes per group, then handling the trailing 1-3 strides
// via the conditional transfer macro.
// NOTE(review): the exact C prototype and parameter-struct layout are declared
// in ShadowBlurNEON.h, which is not visible here; field roles below are read
// off the offset names -- confirm against the header.
asm ( // NOLINT
".globl " TOSTRING(boxBlurNeon) NL
TOSTRING(boxBlurNeon) ":" NL
".fpu neon" NL
// Prologue: r4-r12/lr and d8-d15 are callee-saved under AAPCS.
// NOTE(review): d7 is caller-saved, so including it in the vpush looks
// redundant (harmless; it keeps the pushed block contiguous).
"stmdb sp!, {r4-r12, lr}" NL
"vpush {d7-d15}" NL
// Unpack the parameter block (r1) into registers; offsets per the
// *_OFFSET macros above. Values needed after r1 is reused (it aliases
// LIMIT_R/INVCOUNT_R) are parked in NEON s-registers.
"vldr.u32 " DMAX_S ", [r1, #" DMAX_OFFSET "]" NL
"vldr.u32 " DMIN_S ", [r1, #" DMIN_OFFSET "]" NL
"ldr " STRIDE_R ", [r1, #" STRIDE_OFFSET "]" NL
"vldr.u32 " STRIDE_WIDTH_S ", [r1, #" STRIDE_WIDTH_OFFSET "]" NL
"ldr " DELTA_R ", [r1, #" DELTA_OFFSET "]" NL
"ldr " SOURCE_LINE_END_R ", [r1, #" SOURCE_LINE_END_OFFSET "]" NL
"ldr " DIM_R ", [r1, #" DIM_OFFSET "]" NL
"vldr.u32 " REMAINING_STRIDES_S ", [r1, #" REMAINING_STRIDES_OFFSET "]" NL
"ldr " STEP_COUNT_R ", [r1, #" STEP_COUNT_OFFSET "]" NL
"ldr " INIT_STEP_READALPHA_R ", [r1, #" STEP_READALPHA_CONSTANTS_OFFSET "]" NL
// sourceLineEnd = source + sourceLineEnd * delta; if no full groups of 4
// strides exist, skip straight to the remainder handling.
"mla " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " DELTA_R ", " SOURCE_R NL
"cmp " SOURCE_LINE_END_R ", " SOURCE_R NL
"beq .EarlyLeave" NL
// Preload the per-step vtbl remap tables (12 d-registers, d20-d31) from the
// constants the caller passed at STEP_READALPHA_CONSTANTS_OFFSET.
"vld1.u32 { d20-d22 }, [" INIT_STEP_READALPHA_R "]!" NL
"vld1.u32 { d23-d25 }, [" INIT_STEP_READALPHA_R "]!" NL
"vld1.u32 { d26-d28 }, [" INIT_STEP_READALPHA_R "]!" NL
"vld1.u32 { d29-d31 }, [" INIT_STEP_READALPHA_R "]!" NL
".MainLoop:" NL
// Processing 4 strides parallelly.
"mov " STEP_R ", #0" NL
".StepLoop:" NL
// Per-step setup: choose the leading/trailing window sizes (side1/side2 from
// dmin/dmax) and the active vtbl remap tables for this pass.
"vmov.u32 " DMAX_R ", " DMAX_S NL
"vmov.u32 " DMIN_R ", " DMIN_S NL
"cmp " STEP_R ", #2" NL
"beq .InitStep2" NL
"cmp " STEP_R ", #1" NL
"beq .InitStep1" NL
".InitStep0:" NL
// Step 0: side1 = dmin, side2 = dmax.
"mov " SIDE1_R ", " DMIN_R NL
"mov " SIDE2_R ", " DMAX_R NL
"vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP00_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP01_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP00_STOREALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP01_STOREALPHA_Q NL
"bal .EndInitStep" NL
".InitStep1:" NL
// Step 1: sides swapped relative to step 0.
"mov " SIDE1_R ", " DMAX_R NL
"mov " SIDE2_R ", " DMIN_R NL
"vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP10_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP11_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP10_STOREALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP11_STOREALPHA_Q NL
"bal .EndInitStep" NL
".InitStep2:" NL
// Step 2: symmetric window, both sides = dmax.
"mov " SIDE1_R ", " DMAX_R NL
"mov " SIDE2_R ", " DMAX_R NL
"vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP20_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP21_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP20_STOREALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP21_STOREALPHA_Q NL
".EndInitStep:" NL
// pixelCount = side1 + side2 + 1 (sliding-window width), then
// invCount = ceil((1 << 15) / pixelCount): INVCOUNT_R is first set to
// pixelCount + (1 << 15) - 1 and divided by pixelCount below, giving a Q15
// fixed-point reciprocal used to replace the per-pixel division.
"add " PIXELCOUNT_R ", " SIDE1_R ", " SIDE2_R NL
"add " PIXELCOUNT_R ", " PIXELCOUNT_R ", #1" NL
"mov " INVCOUNT_R ", #1" NL
"add " INVCOUNT_R ", " PIXELCOUNT_R ", " INVCOUNT_R ", lsl #15" NL
"subs " INVCOUNT_R ", " INVCOUNT_R ", #1" NL
//////////////////////////////////
// Shift-and-subtract unsigned division (no udiv on this target).
// integer div code from http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204h/CEGECDGD.html
// DIV: DMAX_R(r5), BOT: PIXELCOUNT_R(lr), TOP: INVCOUNT_R(r1), TEMP: DMIN_R(r6)
"mov r6, lr" NL
"cmp r6, r1, LSR #1" NL
".DivLable1:" NL
"movls r6, r6, LSL #1" NL
"cmp r6, r1, LSR #1" NL
"bls .DivLable1" NL
"mov r5, #0" NL
".DivLable2:" NL
"cmp r1, r6" NL
"subcs r1, r1, r6" NL
"adc r5, r5, r5" NL
"mov r6, r6, LSR #1" NL
"cmp r6, lr" NL
"bhs .DivLable2" NL
// Broadcast the Q15 reciprocal to all four lanes.
"vdup.u32 " INVCOUNT_Q ", " DMAX_R NL
".EndDiv:" NL
// ALPHA1 = per-lane alpha of line 0; reused for samples that would fall
// before the start of the line. The vtbl pair extracts/expands the alpha
// bytes according to the per-step remap tables.
DATA_TRANSFER4("vld1.u32", SOURCE_R)
"vtbl.8 " ALPHA1_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " ALPHA1_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
// ALPHA2 = per-lane alpha of the last line (index dim - 1); reused for
// samples past the end of the line.
"sub " ALPHA_INDEX_R ", " DIM_R ", #1" NL
"mla " ALPHA_INDEX_R ", " ALPHA_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
DATA_TRANSFER4("vld1.u32", ALPHA_INDEX_R)
"vtbl.8 " ALPHA2_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " ALPHA2_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
// Seed the running window sum: (side1 + 1) copies of ALPHA1 ...
"vdup.u32 " TEMP_Q ", " SIDE1_R NL
"vmul.u32 " TEMP_Q ", " TEMP_Q ", " ALPHA1_Q NL
"vadd.u32 " SUM_Q ", " TEMP_Q ", " ALPHA1_Q NL
// ... plus the first min(side2 + 1, dim) - 1 in-bounds lines below.
"add " SIDE2_PLUS_ONE_R ", " SIDE2_R ", #1" NL
"cmp " DIM_R ", " SIDE2_PLUS_ONE_R NL
"bcc .DimSet" NL
".Side2PlusOneSet:" NL
"mov " LIMIT_R ", " SIDE2_PLUS_ONE_R NL
"bal .EndSetLimit1" NL
".DimSet:" NL
"mov " LIMIT_R ", " DIM_R NL
".EndSetLimit1:" NL
"mov " INIT_SUM_R ", " SOURCE_R NL
"mla " SOURCE_END_R ", " LIMIT_R ", " STRIDE_R ", " SOURCE_R NL
"add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
"cmp " INIT_SUM_R ", " SOURCE_END_R NL
"bcs .InitSumDone" NL
".InitSum:" NL
// Accumulate the alpha of each in-bounds line into the window sum.
DATA_TRANSFER4("vld1.u32", INIT_SUM_R)
"vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
"add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
"cmp " INIT_SUM_R ", " SOURCE_END_R NL
"bcc .InitSum" NL
".InitSumDone:" NL
// If the trailing window edge extends past the line, pad the sum with
// (side2 + 1 - limit) copies of the last-line alpha.
"cmp " SIDE2_R ", " LIMIT_R NL
"bcc .EndSetSum" NL
".SetSum:" NL
"sub " INIT_SUM_R ", " SIDE2_PLUS_ONE_R ", " LIMIT_R NL
"vdup.u32 " TEMP_Q ", " INIT_SUM_R NL
"vmul.u32 " TEMP_Q ", " ALPHA2_Q ", " TEMP_Q NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " TEMP_Q NL
".EndSetSum:" NL
// limit = min(side1, dim): index below which the outgoing sample is the
// replicated first-line alpha rather than a real pixel.
"cmp " SIDE1_R ", " DIM_R NL
"bcc .SetLimit2" NL
"mov " LIMIT_R ", " DIM_R NL
"bal .EndSetLimit2" NL
".SetLimit2:" NL
"mov " LIMIT_R ", " SIDE1_R NL
".EndSetLimit2:" NL
// Blurring.
"mov " LOOP_INDEX_R ", #0" NL
".Blur:" NL
// blurred alpha = (sum * invCount) >> 15, i.e. sum / pixelCount in Q15.
"vmul.u32 " LOCAL_PIXEL_Q ", " SUM_Q ", " INVCOUNT_Q NL
"vshr.u32 " LOCAL_PIXEL_Q ", " LOCAL_PIXEL_Q ", #15" NL
// Reload the current pixels and merge the blurred alpha back in via a
// 4-register vtbl over {PIXEL_Q, LOCAL_PIXEL_Q} (d4-d7), then store.
"mla " LOOP_TEMP_R ", " LOOP_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
DATA_TRANSFER4("vld1.u32", LOOP_TEMP_R)
"vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_1_STOREALPHA_Q NL
"vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_2_STOREALPHA_Q NL
DATA_TRANSFER4("vst1.u32", LOOP_TEMP_R)
// Slide the window: subtract the outgoing sample on the leading side.
// While index < limit the outgoing sample is the replicated ALPHA1;
// otherwise load the real pixel at (index - limit).
"cmp " LOOP_INDEX_R ", " LIMIT_R NL
"bcc .SubtractAlpha1" NL
"sub " LOOP_TEMP_R ", " LOOP_INDEX_R ", " LIMIT_R NL
"mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
DATA_TRANSFER4("vld1.u32", LOOP_TEMP_R)
"vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"vsub.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
"bal .EndLeft" NL
".SubtractAlpha1: " NL
"vsub.u32 " SUM_Q ", " SUM_Q ", " ALPHA1_Q NL
// NOTE(review): the ".EndLeft: " literal below has no NL, so the label and
// the following "add" end up on one assembly line. GAS accepts
// "label: insn", so this assembles correctly, but it looks unintentional.
".EndLeft: "
// Add the incoming sample on the trailing side: the pixel at
// (index + side2 + 1) while in bounds, else the replicated ALPHA2.
"add " LOOP_TEMP_R ", " LOOP_INDEX_R ", " SIDE2_R NL
"add " LOOP_TEMP_R ", " LOOP_TEMP_R ", #1" NL
"cmp " LOOP_TEMP_R ", " DIM_R NL
"bcc .SetRight" NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " ALPHA2_Q NL
"bal .EndRight" NL
".SetRight: " NL
"mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
DATA_TRANSFER4("vld1.u32", LOOP_TEMP_R)
"vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
".EndRight: " NL
"add " LOOP_INDEX_R ", " LOOP_INDEX_R ", #1" NL
"cmp " LOOP_INDEX_R ", " DIM_R NL
"bcc .Blur" NL
".EndBlurLine: " NL
// 3 step check
"add " STEP_R ", " STEP_R ", #1" NL
"cmp " STEP_R ", " STEP_COUNT_R NL
"bcc .StepLoop" NL
".EndStepLoop: " NL
// Line check
"add " SOURCE_R ", " SOURCE_R ", " DELTA_R ", lsl #2" NL // next 4 lines
"cmp " SOURCE_R ", " SOURCE_LINE_END_R NL // check all lines are processed
"bcc .MainLoop" NL
/////////////////////////////////////////////////////////////////////////////////////////////////
// Processing the remaining strides (0 - 3). Same algorithm as above, but every
// DATA_TRANSFER4 becomes a CONDITIONAL_DATA_TRANSFER4 gated on
// REMAINING_STRIDES_R, so only the live lanes are loaded/stored.
".EarlyLeave:" NL
"vmov.u32 " REMAINING_STRIDES_R ", " REMAINING_STRIDES_S NL
// Early return for 0 strides.
"cmp " REMAINING_STRIDES_R ", #1" NL
"bcs .SecondStepLoopStart" NL
"vpop {d7-d15}" NL
"ldmia sp!, {r4-r12, pc}" NL
".SecondStepLoopStart:" NL
// initialize step variable
"mov " STEP_R ", #0" NL
".SecondStepLoop:" NL
// Per-step setup (mirror of .StepLoop).
"vmov.u32 " DMAX_R ", " DMAX_S NL
"vmov.u32 " DMIN_R ", " DMIN_S NL
"cmp " STEP_R ", #2" NL
"beq .SecondInitStep2" NL
"cmp " STEP_R ", #1" NL
"beq .SecondInitStep1" NL
".SecondInitStep0:" NL
"mov " SIDE1_R ", " DMIN_R NL
"mov " SIDE2_R ", " DMAX_R NL
"vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP00_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP01_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP00_STOREALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP01_STOREALPHA_Q NL
"bal .SecondEndInitStep" NL
".SecondInitStep1:" NL
"mov " SIDE1_R ", " DMAX_R NL
"mov " SIDE2_R ", " DMIN_R NL
"vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP10_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP11_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP10_STOREALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP11_STOREALPHA_Q NL
"bal .SecondEndInitStep" NL
".SecondInitStep2:" NL
"mov " SIDE1_R ", " DMAX_R NL
"mov " SIDE2_R ", " DMAX_R NL
"vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP20_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP21_LOADALPHA_Q NL
"vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP20_STOREALPHA_Q NL
"vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP21_STOREALPHA_Q NL
".SecondEndInitStep:" NL
// pixelCount and Q15 reciprocal, as in the main path.
"add " PIXELCOUNT_R ", " SIDE1_R ", " SIDE2_R NL
"add " PIXELCOUNT_R ", " PIXELCOUNT_R ", #1" NL
"mov " INVCOUNT_R ", #1" NL
"add " INVCOUNT_R ", " PIXELCOUNT_R ", " INVCOUNT_R ", lsl #15" NL
"subs " INVCOUNT_R ", " INVCOUNT_R ", #1" NL
//////////////////////////////////
// integer div code from http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204h/CEGECDGD.html
// DIV: DMAX_R(r5), BOT: PIXELCOUNT_R(lr), TOP: INVCOUNT_R(r1), TEMP: DMIN_R(r6)
"mov r6, lr" NL
"cmp r6, r1, LSR #1" NL
".SecondDivLable1:" NL
"movls r6, r6, LSL #1" NL
"cmp r6, r1, LSR #1" NL
"bls .SecondDivLable1" NL
"mov r5, #0" NL
".SecondDivLable2:" NL
"cmp r1, r6" NL
"subcs r1, r1, r6" NL
"adc r5, r5, r5" NL
"mov r6, r6, LSR #1" NL
"cmp r6, lr" NL
"bhs .SecondDivLable2" NL
"vdup.u32 " INVCOUNT_Q ", " DMAX_R NL
".SecondEndDiv:" NL
// Edge alphas (first and last line), as in the main path.
CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", SOURCE_R)
"vtbl.8 " ALPHA1_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " ALPHA1_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"sub " ALPHA_INDEX_R ", " DIM_R ", #1" NL
"mla " ALPHA_INDEX_R ", " ALPHA_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", ALPHA_INDEX_R)
"vtbl.8 " ALPHA2_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " ALPHA2_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
// Seed the running window sum.
"vdup.u32 " TEMP_Q ", " SIDE1_R NL
"vmul.u32 " TEMP_Q ", " TEMP_Q ", " ALPHA1_Q NL
"vadd.u32 " SUM_Q ", " TEMP_Q ", " ALPHA1_Q NL
"add " SIDE2_PLUS_ONE_R ", " SIDE2_R ", #1" NL
"cmp " DIM_R ", " SIDE2_PLUS_ONE_R NL
"bcc .SecondDimSet" NL
".SecondSide2PlusOneSet:" NL
"mov " LIMIT_R ", " SIDE2_PLUS_ONE_R NL
"bal .SecondEndSetLimit1" NL
".SecondDimSet:" NL
"mov " LIMIT_R ", " DIM_R NL
".SecondEndSetLimit1:" NL
"mov " INIT_SUM_R ", " SOURCE_R NL
"mla " SOURCE_END_R ", " LIMIT_R ", " STRIDE_R ", " SOURCE_R NL
"add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
"cmp " INIT_SUM_R ", " SOURCE_END_R NL
"bcs .SecondInitSumDone" NL
".SecondInitSum:" NL
CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", INIT_SUM_R)
"vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
"add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
"cmp " INIT_SUM_R ", " SOURCE_END_R NL
"bcc .SecondInitSum" NL
".SecondInitSumDone:" NL
"cmp " SIDE2_R ", " LIMIT_R NL
"bcc .SecondEndSetSum" NL
".SecondSetSum:" NL
"sub " INIT_SUM_R ", " SIDE2_PLUS_ONE_R ", " LIMIT_R NL
"vdup.u32 " TEMP_Q ", " INIT_SUM_R NL
"vmul.u32 " TEMP_Q ", " ALPHA2_Q ", " TEMP_Q NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " TEMP_Q NL
".SecondEndSetSum:" NL
"cmp " SIDE1_R ", " DIM_R NL
"bcc .SecondSetLimit2" NL
"mov " LIMIT_R ", " DIM_R NL
"bal .SecondEndSetLimit2" NL
".SecondSetLimit2:" NL
"mov " LIMIT_R ", " SIDE1_R NL
".SecondEndSetLimit2:" NL
// Blurring.
"mov " LOOP_INDEX_R ", #0" NL
".SecondBlur:" NL
"vmul.u32 " LOCAL_PIXEL_Q ", " SUM_Q ", " INVCOUNT_Q NL
"vshr.u32 " LOCAL_PIXEL_Q ", " LOCAL_PIXEL_Q ", #15" NL
"mla " LOOP_TEMP_R ", " LOOP_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LOOP_TEMP_R)
"vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_1_STOREALPHA_Q NL
"vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_2_STOREALPHA_Q NL
CONDITIONAL_DATA_TRANSFER4("vst1.u32", "vstr", LOOP_TEMP_R)
"cmp " LOOP_INDEX_R ", " LIMIT_R NL
"bcc .SecondSubtractAlpha1" NL
"sub " LOOP_TEMP_R ", " LOOP_INDEX_R ", " LIMIT_R NL
"mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LOOP_TEMP_R)
"vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"vsub.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
"bal .SecondEndLeft" NL
".SecondSubtractAlpha1: " NL
"vsub.u32 " SUM_Q ", " SUM_Q ", " ALPHA1_Q NL
// NOTE(review): same missing NL as ".EndLeft: " above -- still valid GAS.
".SecondEndLeft: "
"add " LOOP_TEMP_R ", " LOOP_INDEX_R ", " SIDE2_R NL
"add " LOOP_TEMP_R ", " LOOP_TEMP_R ", #1" NL
"cmp " LOOP_TEMP_R ", " DIM_R NL
"bcc .SecondSetRight" NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " ALPHA2_Q NL
"bal .SecondEndRight" NL
".SecondSetRight: " NL
"mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LOOP_TEMP_R)
"vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
"vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
"vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
".SecondEndRight: " NL
"add " LOOP_INDEX_R ", " LOOP_INDEX_R ", #1" NL
"cmp " LOOP_INDEX_R ", " DIM_R NL
"bcc .SecondBlur" NL
".SecondEndBlurLine: " NL
// step check
"add " STEP_R ", " STEP_R ", #1" NL
"cmp " STEP_R ", " STEP_COUNT_R NL
"bcc .SecondStepLoop" NL
// Epilogue: restore VFP/NEON and core registers; loading pc returns.
"vpop {d7-d15}" NL
"ldmia sp!, {r4-r12, pc}" NL
);
  450. #endif