/webkit-efl/Source/WebCore/platform/graphics/arm/ShadowBlurNEON.cpp
C++ | 460 lines | 425 code | 8 blank | 27 comment | 0 complexity | 5013c73cd0c376aae6943f6e084143da MD5 | raw file
Possible License(s): GPL-3.0, AGPL-3.0, GPL-2.0, MPL-2.0, JSON, WTFPL, CC-BY-SA-4.0, CC-BY-3.0, BSD-3-Clause, LGPL-2.0, MPL-2.0-no-copyleft-exception, AGPL-1.0, 0BSD, Zlib, Unlicense, BSD-2-Clause, Apache-2.0, LGPL-3.0, ISC, MIT, CC-BY-SA-3.0, CC0-1.0, LGPL-2.1
- /*
- This file contains NEON-optimized code for the filter made of 3 successive box blurs.
- It is 2.2-2.3 times faster than the C implementation in ContextShadow.cpp.
- Copyright (C) 2011 Hyunki Baik <hyunki.baik@samsung.com>
- */
- #include "config.h"
- #include "ShadowBlurNEON.h"
- #if ENABLE(TIZEN_CONTEXTSHADOW_BLUR_NEON)
- #define ASSTRING(str) #str // Stringify the argument exactly as written.
- #define TOSTRING(value) ASSTRING(value) // Macro-expand the argument first, then stringify it.
- #define DMAX_OFFSET TOSTRING(0) // Byte offset (from the pointer passed in r1) of the word loaded into DMAX_S.
- #define DMIN_OFFSET TOSTRING(4) // Byte offset of the word loaded into DMIN_S.
- #define STRIDE_OFFSET TOSTRING(8) // Byte offset of the word loaded into STRIDE_R.
- #define STRIDE_WIDTH_OFFSET TOSTRING(12) // Byte offset of the word loaded into STRIDE_WIDTH_S.
- #define DELTA_OFFSET TOSTRING(16) // Byte offset of the word loaded into DELTA_R.
- #define SOURCE_LINE_END_OFFSET TOSTRING(20) // Byte offset of the count that is scaled by DELTA_R to form the main-loop end address.
- #define DIM_OFFSET TOSTRING(24) // Byte offset of the word loaded into DIM_R.
- #define REMAINING_STRIDES_OFFSET TOSTRING(28) // Byte offset of the word loaded into REMAINING_STRIDES_S.
- #define STEP_COUNT_OFFSET TOSTRING(32) // Byte offset of the word loaded into STEP_COUNT_R.
- #define STEP_READALPHA_CONSTANTS_OFFSET TOSTRING(36) // Byte offset of the pointer to the 96 bytes of vtbl index tables loaded into d20-d31.
- #define NL "\n" // Terminates each assembler statement inside the concatenated asm string.
- // Core (ARM) register allocation. Each macro is the register name spliced into the asm string.
- #define SOURCE_R "r0" // Base address of the lines currently processed; taken as-is from r0 on entry.
- #define LIMIT_R "r1" // On entry: pointer to the parameter block. Reused afterwards (see the alternate names).
- #define SIDE1_R "r2" // Window extent before the current element for the current step.
- #define SIDE2_R "r3" // Window extent after the current element for the current step.
- #define SOURCE_END_R "r4" // End address of the initial-sum loop.
- #define DMAX_R "r5" // Core-register copy of DMAX_S, refreshed at the start of every step.
- #define DMIN_R "r6" // Core-register copy of DMIN_S, refreshed at the start of every step.
- #define STRIDE_R "r7" // Address step between consecutive elements of one line.
- #define DELTA_R "r8" // Address step between the lines that are processed together.
- #define STEP_COUNT_R "r9" // Number of blur steps; the step loops are do-while, so at least one step runs.
- #define SOURCE_LINE_END_R "r10" // Address at which the 4-lines-at-a-time main loop stops.
- #define DIM_R "r11" // Number of elements in one line.
- #define STEP_R "r12" // Index of the current blur step.
- #define PIXELCOUNT_R "lr" // side1 + side2 + 1, the divisor of the reciprocal computation.
- // Alternate names: registers whose live ranges do not overlap are shared under several names.
- #define INVCOUNT_R LIMIT_R // Dividend of the reciprocal computation.
- #define REMAINING_STRIDES_R SOURCE_LINE_END_R // Number of leftover lines (tail loop only).
- #define INIT_STEP_READALPHA_R LIMIT_R // Pointer used to load the vtbl index tables.
- #define INIT_STEP_STOREALPHA_R PIXELCOUNT_R // Not referenced by the code visible in this file.
- #define SIDE2_PLUS_ONE_R DMAX_R // side2 + 1.
- #define ALPHA_INDEX_R DMIN_R // Address of the last element of the line.
- #define INIT_SUM_R PIXELCOUNT_R // Cursor of the initial-sum loop, then the edge replication count.
- #define LOOP_INDEX_R PIXELCOUNT_R // Element index of the blur loop.
- #define LOOP_TEMP_R SIDE1_R // Scratch address/index inside the blur loop (side1 is dead by then).
- // NEON register allocation.
- // NOTE: the REMAP_* macros carry a _Q suffix but name 64-bit D registers; the names are kept
- // because the asm block refers to them.
- #define INVCOUNT_Q "q0" // Fixed-point (<< 15) reciprocal of the window size, same value in every lane.
- #define SUM_Q "q1" // Running window sums, one 32-bit lane per line.
- #define PIXEL_Q "q2" // Raw 32-bit pixels of the lines being processed, one lane per line.
- #define PIXEL_D0 "d4"
- #define PIXEL_D1 "d5"
- #define PIXEL_D00 "d4[0]"
- #define PIXEL_D01 "d4[1]"
- #define PIXEL_S1 "s9" // Same storage as d4[1]; lets vldr/vstr be conditionally executed.
- #define PIXEL_D10 "d5[0]"
- #define PIXEL_S2 "s10" // Same storage as d5[0]; lets vldr/vstr be conditionally executed.
- #define PIXEL_D11 "d5[1]"
- #define LOCAL_PIXEL_Q "q3" // Channel values extracted by vtbl, and the blurred results.
- #define LOCAL_PIXEL_D0 "d6"
- #define LOCAL_PIXEL_D1 "d7"
- #define LOCAL_PIXEL_D00 "d6[0]"
- #define LOCAL_PIXEL_D01 "d6[1]"
- #define LOCAL_PIXEL_D10 "d7[0]"
- #define LOCAL_PIXEL_D11 "d7[1]"
- // s16-s19 (d8-d9) are used as scalar storage for values that do not fit in core registers.
- #define REMAINING_STRIDES_S "s16"
- #define STRIDE_WIDTH_S "s17"
- #define DMAX_S "s18"
- #define DMIN_S "s19"
- #define TEMP_Q "q5" // Scratch for the edge replication products.
- #define ALPHA1_Q "q6" // Extracted values of the first element of each line.
- #define ALPHA1_D0 "d12"
- #define ALPHA1_D1 "d13"
- #define ALPHA2_Q "q7" // Extracted values of the last element of each line.
- #define ALPHA2_D0 "d14"
- #define ALPHA2_D1 "d15"
- // vtbl index tables selected for the current step (copied from the per-step tables below).
- #define REMAP_STEP_1_LOADALPHA_Q "d16"
- #define REMAP_STEP_2_LOADALPHA_Q "d17"
- #define REMAP_STEP_1_STOREALPHA_Q "d18"
- #define REMAP_STEP_2_STOREALPHA_Q "d19"
- // Per-step load tables: REMAP_STEP<step><half>, loaded once at function entry.
- #define READALPHA_RANGE "d20-d25"
- #define REMAP_STEP00_LOADALPHA_Q "d20"
- #define REMAP_STEP01_LOADALPHA_Q "d21"
- #define REMAP_STEP10_LOADALPHA_Q "d22"
- #define REMAP_STEP11_LOADALPHA_Q "d23"
- #define REMAP_STEP20_LOADALPHA_Q "d24"
- #define REMAP_STEP21_LOADALPHA_Q "d25"
- // Per-step store tables. The range used to read "d26-31", which is not a valid register range
- // and did not match READALPHA_RANGE; both ends now carry the register prefix.
- #define STOREALPHA_RANGE "d26-d31"
- #define REMAP_STEP00_STOREALPHA_Q "d26"
- #define REMAP_STEP01_STOREALPHA_Q "d27"
- #define REMAP_STEP10_STOREALPHA_Q "d28"
- #define REMAP_STEP11_STOREALPHA_Q "d29"
- #define REMAP_STEP20_STOREALPHA_Q "d30"
- #define REMAP_STEP21_STOREALPHA_Q "d31"
- #define DATA_TRANSFER4(command, base) /* Emits `command` (vld1.u32 or vst1.u32 at the call sites) once per lane of PIXEL_Q, advancing `base` by DELTA_R after each lane. */ \
- command " " PIXEL_D00 ", [" base "], " DELTA_R NL \
- command " " PIXEL_D01 ", [" base "], " DELTA_R NL \
- command " " PIXEL_D10 ", [" base "], " DELTA_R NL \
- command " " PIXEL_D11 ", [" base "], " DELTA_R NL \
- "sub " base ", " base ", " DELTA_R ", lsl #2" NL // Undo the four post-increments so `base` is unchanged on exit.
- // The number of transfers depends on REMAINING_STRIDES_R, which is always >= 1 and <= 3: lane 0 is always transferred with command1, lanes 1 and 2 only when at least 2 resp. 3 lines remain. Clobbers the condition flags.
- #define CONDITIONAL_DATA_TRANSFER4(command1, command2, base) /* command2 is vldr/vstr so that it can take the "cs" condition suffix; d5[1] is never transferred. */ \
- command1 " " PIXEL_D00 ", [" base "], " DELTA_R NL \
- "cmp " REMAINING_STRIDES_R ", #2" NL \
- command2 "cs " PIXEL_S1 ", [" base "]" NL \
- "add " base ", " base ", " DELTA_R NL \
- "cmp " REMAINING_STRIDES_R ", #3" NL \
- command2 "cs " PIXEL_S2 ", [" base "]" NL \
- "sub " base ", " base ", " DELTA_R ", lsl #1" NL // Undo the two DELTA_R advances so `base` is unchanged on exit.
- // boxBlurNeon: ARM NEON implementation of up to three successive box blur steps.
- //
- // On entry:
- //   r0 - address of the first line to blur (SOURCE_R).
- //   r1 - pointer to a parameter block that is read at the *_OFFSET byte offsets.
- //        (The C prototype is declared in ShadowBlurNEON.h, which is not visible here.)
- //
- // Layout of the work:
- //   * Main loop: four lines, DELTA_R apart, are blurred together (one 32-bit NEON lane each)
- //     until SOURCE_R reaches SOURCE_R + count * DELTA_R.
- //   * Tail loop (.EarlyLeave onwards): the remaining 1-3 lines, using conditional transfers.
- //   Both loops execute STEP_COUNT_R steps per group (do-while: at least one step).
- //
- // Per step:
- //   step 1: side1 = dmax, side2 = dmin; step 2: side1 = side2 = dmax;
- //   any other step index: side1 = dmin, side2 = dmax.
- //   invCount = ((1 << 15) + count - 1) / count with count = side1 + side2 + 1, and every
- //   output value is (windowSum * invCount) >> 15.
- //   Elements outside the line are replaced by the first (ALPHA1_Q) or last (ALPHA2_Q) element.
- //   The LOADALPHA vtbl tables pick the blurred bytes out of the loaded pixels and the
- //   STOREALPHA tables merge the result back (presumably the alpha channel, judging by the
- //   names; the table contents come from the caller).
- //
- // r4-r12, lr and d7-d15 are saved and restored (d7 is saved in addition to the d8-d15 range
- // that is modified here). r0-r3, q0-q3, d16-d31 and the flags are clobbered.
- asm ( // NOLINT
- ".globl " TOSTRING(boxBlurNeon) NL
- TOSTRING(boxBlurNeon) ":" NL
- ".fpu neon" NL
- // Prologue.
- "stmdb sp!, {r4-r12, lr}" NL
- "vpush {d7-d15}" NL
- // Unpack the parameter block. Values without a free core register are parked in s16-s19.
- "vldr.u32 " DMAX_S ", [r1, #" DMAX_OFFSET "]" NL
- "vldr.u32 " DMIN_S ", [r1, #" DMIN_OFFSET "]" NL
- "ldr " STRIDE_R ", [r1, #" STRIDE_OFFSET "]" NL
- "vldr.u32 " STRIDE_WIDTH_S ", [r1, #" STRIDE_WIDTH_OFFSET "]" NL
- "ldr " DELTA_R ", [r1, #" DELTA_OFFSET "]" NL
- "ldr " SOURCE_LINE_END_R ", [r1, #" SOURCE_LINE_END_OFFSET "]" NL
- "ldr " DIM_R ", [r1, #" DIM_OFFSET "]" NL
- "vldr.u32 " REMAINING_STRIDES_S ", [r1, #" REMAINING_STRIDES_OFFSET "]" NL
- "ldr " STEP_COUNT_R ", [r1, #" STEP_COUNT_OFFSET "]" NL
- // This load overwrites r1, so it has to be the last access to the parameter block.
- "ldr " INIT_STEP_READALPHA_R ", [r1, #" STEP_READALPHA_CONSTANTS_OFFSET "]" NL
- // Load the vtbl tables (d20-d31) BEFORE the early-out below: the tail loop reached through
- // .EarlyLeave needs them as well, and used to run with uninitialised tables whenever there
- // was no complete group of four lines.
- "vld1.u32 { d20-d22 }, [" INIT_STEP_READALPHA_R "]!" NL
- "vld1.u32 { d23-d25 }, [" INIT_STEP_READALPHA_R "]!" NL
- "vld1.u32 { d26-d28 }, [" INIT_STEP_READALPHA_R "]!" NL
- "vld1.u32 { d29-d31 }, [" INIT_STEP_READALPHA_R "]!" NL
- // SOURCE_LINE_END_R = SOURCE_R + count * DELTA_R; skip the main loop when it is empty.
- "mla " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " DELTA_R ", " SOURCE_R NL
- "cmp " SOURCE_LINE_END_R ", " SOURCE_R NL
- "beq .EarlyLeave" NL
- ".MainLoop:" NL
- // Processing 4 strides in parallel.
- "mov " STEP_R ", #0" NL
- ".StepLoop:" NL
- // DMAX_R / DMIN_R are clobbered by the previous step, reload them from their NEON copies.
- "vmov.u32 " DMAX_R ", " DMAX_S NL
- "vmov.u32 " DMIN_R ", " DMIN_S NL
- "cmp " STEP_R ", #2" NL
- "beq .InitStep2" NL
- "cmp " STEP_R ", #1" NL
- "beq .InitStep1" NL
- ".InitStep0:" NL
- "mov " SIDE1_R ", " DMIN_R NL
- "mov " SIDE2_R ", " DMAX_R NL
- "vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP00_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP01_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP00_STOREALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP01_STOREALPHA_Q NL
- "bal .EndInitStep" NL
- ".InitStep1:" NL
- "mov " SIDE1_R ", " DMAX_R NL
- "mov " SIDE2_R ", " DMIN_R NL
- "vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP10_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP11_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP10_STOREALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP11_STOREALPHA_Q NL
- "bal .EndInitStep" NL
- ".InitStep2:" NL
- "mov " SIDE1_R ", " DMAX_R NL
- "mov " SIDE2_R ", " DMAX_R NL
- "vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP20_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP21_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP20_STOREALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP21_STOREALPHA_Q NL
- ".EndInitStep:" NL
- // PIXELCOUNT_R = side1 + side2 + 1; INVCOUNT_R = (1 << 15) + PIXELCOUNT_R - 1 (the dividend).
- "add " PIXELCOUNT_R ", " SIDE1_R ", " SIDE2_R NL
- "add " PIXELCOUNT_R ", " PIXELCOUNT_R ", #1" NL
- "mov " INVCOUNT_R ", #1" NL
- "add " INVCOUNT_R ", " PIXELCOUNT_R ", " INVCOUNT_R ", lsl #15" NL
- "subs " INVCOUNT_R ", " INVCOUNT_R ", #1" NL
- //////////////////////////////////
- // integer div code from http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204h/CEGECDGD.html
- // DIV: DMAX_R(r5), BOT: PIXELCOUNT_R(lr), TOP: INVCOUNT_R(r1), TEMP: DMIN_R(r6)
- // Shift-and-subtract unsigned division: r5 = r1 / lr.
- "mov r6, lr" NL
- "cmp r6, r1, LSR #1" NL
- ".DivLable1:" NL
- "movls r6, r6, LSL #1" NL
- "cmp r6, r1, LSR #1" NL
- "bls .DivLable1" NL
- "mov r5, #0" NL
- ".DivLable2:" NL
- "cmp r1, r6" NL
- "subcs r1, r1, r6" NL
- "adc r5, r5, r5" NL
- "mov r6, r6, LSR #1" NL
- "cmp r6, lr" NL
- "bhs .DivLable2" NL
- "vdup.u32 " INVCOUNT_Q ", " DMAX_R NL
- ".EndDiv:" NL
- // ALPHA1_Q = extracted values of the first element of each line.
- DATA_TRANSFER4("vld1.u32", SOURCE_R)
- "vtbl.8 " ALPHA1_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " ALPHA1_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- // ALPHA2_Q = extracted values of the last element (index dim - 1) of each line.
- "sub " ALPHA_INDEX_R ", " DIM_R ", #1" NL
- "mla " ALPHA_INDEX_R ", " ALPHA_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
- DATA_TRANSFER4("vld1.u32", ALPHA_INDEX_R)
- "vtbl.8 " ALPHA2_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " ALPHA2_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- // Initial sum, left part: (side1 + 1) * first element.
- "vdup.u32 " TEMP_Q ", " SIDE1_R NL
- "vmul.u32 " TEMP_Q ", " TEMP_Q ", " ALPHA1_Q NL
- "vadd.u32 " SUM_Q ", " TEMP_Q ", " ALPHA1_Q NL
- // LIMIT_R = min(dim, side2 + 1): number of real elements covered by the initial window.
- "add " SIDE2_PLUS_ONE_R ", " SIDE2_R ", #1" NL
- "cmp " DIM_R ", " SIDE2_PLUS_ONE_R NL
- "bcc .DimSet" NL
- ".Side2PlusOneSet:" NL
- "mov " LIMIT_R ", " SIDE2_PLUS_ONE_R NL
- "bal .EndSetLimit1" NL
- ".DimSet:" NL
- "mov " LIMIT_R ", " DIM_R NL
- ".EndSetLimit1:" NL
- // Initial sum, middle part: add elements 1 .. LIMIT_R - 1.
- "mov " INIT_SUM_R ", " SOURCE_R NL
- "mla " SOURCE_END_R ", " LIMIT_R ", " STRIDE_R ", " SOURCE_R NL
- "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
- "cmp " INIT_SUM_R ", " SOURCE_END_R NL
- "bcs .InitSumDone" NL
- ".InitSum:" NL
- DATA_TRANSFER4("vld1.u32", INIT_SUM_R)
- "vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
- "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
- "cmp " INIT_SUM_R ", " SOURCE_END_R NL
- "bcc .InitSum" NL
- ".InitSumDone:" NL
- // Initial sum, right part: when the window reaches past the line end, add
- // (side2 + 1 - LIMIT_R) * last element.
- "cmp " SIDE2_R ", " LIMIT_R NL
- "bcc .EndSetSum" NL
- ".SetSum:" NL
- "sub " INIT_SUM_R ", " SIDE2_PLUS_ONE_R ", " LIMIT_R NL
- "vdup.u32 " TEMP_Q ", " INIT_SUM_R NL
- "vmul.u32 " TEMP_Q ", " ALPHA2_Q ", " TEMP_Q NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " TEMP_Q NL
- ".EndSetSum:" NL
- // LIMIT_R = min(side1, dim): first index whose left window edge lies inside the line.
- "cmp " SIDE1_R ", " DIM_R NL
- "bcc .SetLimit2" NL
- "mov " LIMIT_R ", " DIM_R NL
- "bal .EndSetLimit2" NL
- ".SetLimit2:" NL
- "mov " LIMIT_R ", " SIDE1_R NL
- ".EndSetLimit2:" NL
- // Blurring.
- "mov " LOOP_INDEX_R ", #0" NL
- ".Blur:" NL
- // result = (sum * invCount) >> 15
- "vmul.u32 " LOCAL_PIXEL_Q ", " SUM_Q ", " INVCOUNT_Q NL
- "vshr.u32 " LOCAL_PIXEL_Q ", " LOCAL_PIXEL_Q ", #15" NL
- // Read the current pixels, merge the result through the {d4-d7} table lookup, write back.
- "mla " LOOP_TEMP_R ", " LOOP_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
- DATA_TRANSFER4("vld1.u32", LOOP_TEMP_R)
- "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_1_STOREALPHA_Q NL
- "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_2_STOREALPHA_Q NL
- DATA_TRANSFER4("vst1.u32", LOOP_TEMP_R)
- // Drop the element leaving the window on the left (the first element while index < LIMIT_R).
- "cmp " LOOP_INDEX_R ", " LIMIT_R NL
- "bcc .SubtractAlpha1" NL
- "sub " LOOP_TEMP_R ", " LOOP_INDEX_R ", " LIMIT_R NL
- "mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
- DATA_TRANSFER4("vld1.u32", LOOP_TEMP_R)
- "vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vsub.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
- "bal .EndLeft" NL
- ".SubtractAlpha1: " NL
- "vsub.u32 " SUM_Q ", " SUM_Q ", " ALPHA1_Q NL
- ".EndLeft: " NL
- // Add the element entering the window on the right (the last element once past the end).
- "add " LOOP_TEMP_R ", " LOOP_INDEX_R ", " SIDE2_R NL
- "add " LOOP_TEMP_R ", " LOOP_TEMP_R ", #1" NL
- "cmp " LOOP_TEMP_R ", " DIM_R NL
- "bcc .SetRight" NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " ALPHA2_Q NL
- "bal .EndRight" NL
- ".SetRight: " NL
- "mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
- DATA_TRANSFER4("vld1.u32", LOOP_TEMP_R)
- "vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
- ".EndRight: " NL
- "add " LOOP_INDEX_R ", " LOOP_INDEX_R ", #1" NL
- "cmp " LOOP_INDEX_R ", " DIM_R NL
- "bcc .Blur" NL
- ".EndBlurLine: " NL
- // 3 step check
- "add " STEP_R ", " STEP_R ", #1" NL
- "cmp " STEP_R ", " STEP_COUNT_R NL
- "bcc .StepLoop" NL
- ".EndStepLoop: " NL
- // Line check
- "add " SOURCE_R ", " SOURCE_R ", " DELTA_R ", lsl #2" NL // next 4 lines
- "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL // check all lines are processed
- "bcc .MainLoop" NL
- /////////////////////////////////////////////////////////////////////////////////////////////////
- // Processing the remaining strides (0 - 3).
- // Same algorithm as above with CONDITIONAL_DATA_TRANSFER4 replacing DATA_TRANSFER4.
- // REMAINING_STRIDES_R shares r10 with SOURCE_LINE_END_R, which is dead from here on.
- ".EarlyLeave:" NL
- "vmov.u32 " REMAINING_STRIDES_R ", " REMAINING_STRIDES_S NL
- // Early return for 0 strides.
- "cmp " REMAINING_STRIDES_R ", #1" NL
- "bcs .SecondStepLoopStart" NL
- "vpop {d7-d15}" NL
- "ldmia sp!, {r4-r12, pc}" NL
- ".SecondStepLoopStart:" NL
- // initialize step variable
- "mov " STEP_R ", #0" NL
- ".SecondStepLoop:" NL
- "vmov.u32 " DMAX_R ", " DMAX_S NL
- "vmov.u32 " DMIN_R ", " DMIN_S NL
- "cmp " STEP_R ", #2" NL
- "beq .SecondInitStep2" NL
- "cmp " STEP_R ", #1" NL
- "beq .SecondInitStep1" NL
- ".SecondInitStep0:" NL
- "mov " SIDE1_R ", " DMIN_R NL
- "mov " SIDE2_R ", " DMAX_R NL
- "vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP00_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP01_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP00_STOREALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP01_STOREALPHA_Q NL
- "bal .SecondEndInitStep" NL
- ".SecondInitStep1:" NL
- "mov " SIDE1_R ", " DMAX_R NL
- "mov " SIDE2_R ", " DMIN_R NL
- "vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP10_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP11_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP10_STOREALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP11_STOREALPHA_Q NL
- "bal .SecondEndInitStep" NL
- ".SecondInitStep2:" NL
- "mov " SIDE1_R ", " DMAX_R NL
- "mov " SIDE2_R ", " DMAX_R NL
- "vmov.u8 " REMAP_STEP_1_LOADALPHA_Q "," REMAP_STEP20_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_LOADALPHA_Q "," REMAP_STEP21_LOADALPHA_Q NL
- "vmov.u8 " REMAP_STEP_1_STOREALPHA_Q "," REMAP_STEP20_STOREALPHA_Q NL
- "vmov.u8 " REMAP_STEP_2_STOREALPHA_Q "," REMAP_STEP21_STOREALPHA_Q NL
- ".SecondEndInitStep:" NL
- "add " PIXELCOUNT_R ", " SIDE1_R ", " SIDE2_R NL
- "add " PIXELCOUNT_R ", " PIXELCOUNT_R ", #1" NL
- "mov " INVCOUNT_R ", #1" NL
- "add " INVCOUNT_R ", " PIXELCOUNT_R ", " INVCOUNT_R ", lsl #15" NL
- "subs " INVCOUNT_R ", " INVCOUNT_R ", #1" NL
- //////////////////////////////////
- // integer div code from http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204h/CEGECDGD.html
- // DIV: DMAX_R(r5), BOT: PIXELCOUNT_R(lr), TOP: INVCOUNT_R(r1), TEMP: DMIN_R(r6)
- "mov r6, lr" NL
- "cmp r6, r1, LSR #1" NL
- ".SecondDivLable1:" NL
- "movls r6, r6, LSL #1" NL
- "cmp r6, r1, LSR #1" NL
- "bls .SecondDivLable1" NL
- "mov r5, #0" NL
- ".SecondDivLable2:" NL
- "cmp r1, r6" NL
- "subcs r1, r1, r6" NL
- "adc r5, r5, r5" NL
- "mov r6, r6, LSR #1" NL
- "cmp r6, lr" NL
- "bhs .SecondDivLable2" NL
- "vdup.u32 " INVCOUNT_Q ", " DMAX_R NL
- ".SecondEndDiv:" NL
- CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", SOURCE_R)
- "vtbl.8 " ALPHA1_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " ALPHA1_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "sub " ALPHA_INDEX_R ", " DIM_R ", #1" NL
- "mla " ALPHA_INDEX_R ", " ALPHA_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
- CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", ALPHA_INDEX_R)
- "vtbl.8 " ALPHA2_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " ALPHA2_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vdup.u32 " TEMP_Q ", " SIDE1_R NL
- "vmul.u32 " TEMP_Q ", " TEMP_Q ", " ALPHA1_Q NL
- "vadd.u32 " SUM_Q ", " TEMP_Q ", " ALPHA1_Q NL
- "add " SIDE2_PLUS_ONE_R ", " SIDE2_R ", #1" NL
- "cmp " DIM_R ", " SIDE2_PLUS_ONE_R NL
- "bcc .SecondDimSet" NL
- ".SecondSide2PlusOneSet:" NL
- "mov " LIMIT_R ", " SIDE2_PLUS_ONE_R NL
- "bal .SecondEndSetLimit1" NL
- ".SecondDimSet:" NL
- "mov " LIMIT_R ", " DIM_R NL
- ".SecondEndSetLimit1:" NL
- "mov " INIT_SUM_R ", " SOURCE_R NL
- "mla " SOURCE_END_R ", " LIMIT_R ", " STRIDE_R ", " SOURCE_R NL
- "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
- "cmp " INIT_SUM_R ", " SOURCE_END_R NL
- "bcs .SecondInitSumDone" NL
- ".SecondInitSum:" NL
- CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", INIT_SUM_R)
- "vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
- "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
- "cmp " INIT_SUM_R ", " SOURCE_END_R NL
- "bcc .SecondInitSum" NL
- ".SecondInitSumDone:" NL
- "cmp " SIDE2_R ", " LIMIT_R NL
- "bcc .SecondEndSetSum" NL
- ".SecondSetSum:" NL
- "sub " INIT_SUM_R ", " SIDE2_PLUS_ONE_R ", " LIMIT_R NL
- "vdup.u32 " TEMP_Q ", " INIT_SUM_R NL
- "vmul.u32 " TEMP_Q ", " ALPHA2_Q ", " TEMP_Q NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " TEMP_Q NL
- ".SecondEndSetSum:" NL
- "cmp " SIDE1_R ", " DIM_R NL
- "bcc .SecondSetLimit2" NL
- "mov " LIMIT_R ", " DIM_R NL
- "bal .SecondEndSetLimit2" NL
- ".SecondSetLimit2:" NL
- "mov " LIMIT_R ", " SIDE1_R NL
- ".SecondEndSetLimit2:" NL
- // Blurring.
- "mov " LOOP_INDEX_R ", #0" NL
- ".SecondBlur:" NL
- "vmul.u32 " LOCAL_PIXEL_Q ", " SUM_Q ", " INVCOUNT_Q NL
- "vshr.u32 " LOCAL_PIXEL_Q ", " LOCAL_PIXEL_Q ", #15" NL
- "mla " LOOP_TEMP_R ", " LOOP_INDEX_R ", " STRIDE_R ", " SOURCE_R NL
- CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LOOP_TEMP_R)
- "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_1_STOREALPHA_Q NL
- "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "-" LOCAL_PIXEL_D1 "}, " REMAP_STEP_2_STOREALPHA_Q NL
- CONDITIONAL_DATA_TRANSFER4("vst1.u32", "vstr", LOOP_TEMP_R)
- "cmp " LOOP_INDEX_R ", " LIMIT_R NL
- "bcc .SecondSubtractAlpha1" NL
- "sub " LOOP_TEMP_R ", " LOOP_INDEX_R ", " LIMIT_R NL
- "mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
- CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LOOP_TEMP_R)
- "vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vsub.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
- "bal .SecondEndLeft" NL
- ".SecondSubtractAlpha1: " NL
- "vsub.u32 " SUM_Q ", " SUM_Q ", " ALPHA1_Q NL
- ".SecondEndLeft: " NL
- "add " LOOP_TEMP_R ", " LOOP_INDEX_R ", " SIDE2_R NL
- "add " LOOP_TEMP_R ", " LOOP_TEMP_R ", #1" NL
- "cmp " LOOP_TEMP_R ", " DIM_R NL
- "bcc .SecondSetRight" NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " ALPHA2_Q NL
- "bal .SecondEndRight" NL
- ".SecondSetRight: " NL
- "mla " LOOP_TEMP_R ", " LOOP_TEMP_R ", " STRIDE_R ", " SOURCE_R NL
- CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LOOP_TEMP_R)
- "vtbl.8 " LOCAL_PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_1_LOADALPHA_Q NL
- "vtbl.8 " LOCAL_PIXEL_D1 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_STEP_2_LOADALPHA_Q NL
- "vadd.u32 " SUM_Q ", " SUM_Q ", " LOCAL_PIXEL_Q NL
- ".SecondEndRight: " NL
- "add " LOOP_INDEX_R ", " LOOP_INDEX_R ", #1" NL
- "cmp " LOOP_INDEX_R ", " DIM_R NL
- "bcc .SecondBlur" NL
- ".SecondEndBlurLine: " NL
- // step check
- "add " STEP_R ", " STEP_R ", #1" NL
- "cmp " STEP_R ", " STEP_COUNT_R NL
- "bcc .SecondStepLoop" NL
- // Epilogue: restore the saved registers and return by popping the saved lr into pc.
- "vpop {d7-d15}" NL
- "ldmia sp!, {r4-r12, pc}" NL
- );
- #endif