diff --git a/codec/common/arm64/mc_aarch64_neon.S b/codec/common/arm64/mc_aarch64_neon.S
index e4e43f9986..a4dfe85466 100644
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -32,8 +32,10 @@
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"

+.rodata
 .align 4
 filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
+.previous

 .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 // {    // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
@@ -1912,7 +1914,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon
     mov x5, #16
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
-    ldr q22, filter_para
+    adrp x6, filter_para
+    ldr q22, [x6, #:lo12:filter_para]

 w17_h_mc_luma_loop:
     ld1 {v2.16b, v3.16b}, [x0], x1 //only use 22(17+5); v2=src[-2]
@@ -1946,7 +1949,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
     mov x5, #8
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
-    ldr q22, filter_para
+    adrp x6, filter_para
+    ldr q22, [x6, #:lo12:filter_para]
 w9_h_mc_luma_loop:
     ld1 {v2.16b}, [x0], x1 //only use 14(9+5); v2=src[-2]
     mov v3.d[0], v2.d[1]
@@ -2012,7 +2016,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
     movi v1.8h, #5, lsl #0
     sub x3, x3, #16
     mov x5, #16
-    ldr q29, filter_para
+    adrp x6, filter_para
+    ldr q29, [x6, #:lo12:filter_para]

     sub x4, x4, #1

@@ -2215,7 +2220,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon
     movi v1.8h, #5, lsl #0
     sub x3, x3, #8
     mov x5, #8
-    ldr q29, filter_para
+    adrp x6, filter_para
+    ldr q29, [x6, #:lo12:filter_para]

     sub x4, x4, #1
     //prfm pldl1strm, [x0]
@@ -2315,7 +2321,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_neon
     movi v1.8h, #5, lsl #0
     sub x3, x3, #4
     mov x5, #4
-    ldr q29, filter_para
+    adrp x6, filter_para
+    ldr q29, [x6, #:lo12:filter_para]

     sub x4, x4, #1
     //prfm pldl1strm, [x0]
diff --git a/codec/decoder/core/arm64/intra_pred_aarch64_neon.S b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S
index f8f9e03c7e..26394e596b 100644
--- a/codec/decoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S
@@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon
 .endr
 WELS_ASM_AARCH64_FUNC_END

+.rodata
 .align 4
 intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
 intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
+.previous

 WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon
     sxtw x1, w1
@@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon

     uxtl v1.8h, v1.8b
     uxtl v0.8h, v0.8b
-    ldr q2, intra_1_to_4
-    ldr q3, intra_m3_to_p4
+    adrp x4, intra_1_to_4
+    adrp x5, intra_m3_to_p4
+    ldr q2, [x4, #:lo12:intra_1_to_4]
+    ldr q3, [x5, #:lo12:intra_m3_to_p4]
     dup v4.8h, v0.h[3]
     dup v5.8h, v0.h[7]
     add v4.8h, v4.8h, v5.8h
@@ -456,9 +460,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon
 WELS_ASM_AARCH64_FUNC_END


+.rodata
 .align 4
 intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
 intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
+.previous

 WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
     sxtw x1, w1
@@ -492,7 +498,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
     uxtl v3.8h, v3.8b
     sub v0.8h, v1.8h, v0.8h
     sub v2.8h, v3.8h, v2.8h
-    ldr q4, intra_1_to_8
+    adrp x4, intra_1_to_8
+    ldr q4, [x4, #:lo12:intra_1_to_8]
     mul v0.8h, v0.8h, v4.8h
     mul v2.8h, v2.8h, v4.8h
     saddlv s0, v0.8h
@@ -501,8 +508,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
     sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
     sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
     shl v1.8h, v1.8h, #4 // a is in v1.h[7]
-    ldr q4, intra_m7_to_p8
-    ldr q5, intra_m7_to_p8 + 16
+    adrp x4, intra_m7_to_p8
+    add x5, x4, 16
+    ldr q4, [x4, #:lo12:intra_m7_to_p8]
+    ldr q5, [x5, #:lo12:intra_m7_to_p8]
     dup v1.8h, v1.h[7]
     dup v3.8h, v1.h[7]
     mla v1.8h, v4.8h, v0.h[0]
diff --git a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
index ef50027d64..d0ac1febce 100644
--- a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon
 .endr
 WELS_ASM_AARCH64_FUNC_END

+.rodata
 .align 4
 intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
 intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
+.previous

 WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon
     SIGN_EXTENSION x2,w2
@@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon

     uxtl v1.8h, v1.8b
     uxtl v0.8h, v0.8b
-    ldr q2, intra_1_to_4
-    ldr q3, intra_m3_to_p4
+    adrp x4, intra_1_to_4
+    adrp x5, intra_m3_to_p4
+    ldr q2, [x4, #:lo12:intra_1_to_4]
+    ldr q3, [x5, #:lo12:intra_m3_to_p4]
     dup v4.8h, v0.h[3]
     dup v5.8h, v0.h[7]
     add v4.8h, v4.8h, v5.8h
@@ -437,9 +441,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon
 WELS_ASM_AARCH64_FUNC_END

+.rodata
 .align 4
 intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
 intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
+.previous

 //void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
     SIGN_EXTENSION x2,w2
@@ -473,7 +479,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
     uxtl v3.8h, v3.8b
     sub v0.8h, v1.8h, v0.8h
     sub v2.8h, v3.8h, v2.8h
-    ldr q4, intra_1_to_8
+    adrp x4, intra_1_to_8
+    ldr q4, [x4, #:lo12:intra_1_to_8]
     mul v0.8h, v0.8h, v4.8h
     mul v2.8h, v2.8h, v4.8h
     saddlv s0, v0.8h
@@ -482,8 +489,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
     sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
     sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
     shl v1.8h, v1.8h, #4 // a is in v1.h[7]
-    ldr q4, intra_m7_to_p8
-    ldr q5, intra_m7_to_p8 + 16
+    adrp x4, intra_m7_to_p8
+    add x5, x4, 16
+    ldr q4, [x4, #:lo12:intra_m7_to_p8]
+    ldr q5, [x5, #:lo12:intra_m7_to_p8]
     dup v1.8h, v1.h[7]
     dup v3.8h, v1.h[7]
     mla v1.8h, v4.8h, v0.h[0]
diff --git a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
index 422a5f8442..6e959466fd 100644
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -283,16 +283,21 @@ _hash_assign_loop_x4_rem:
 _hash_assign_end:
 WELS_ASM_AARCH64_FUNC_END

+.rodata
 .align 4
 mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
 mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
 mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
+.previous

 WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
 // void (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
-    ldr q7, mv_x_inc_x4
-    ldr q6, mv_y_inc_x4
-    ldr q5, mx_x_offset_x4
+    adrp x4, mv_x_inc_x4
+    adrp x5, mv_y_inc_x4
+    adrp x6, mx_x_offset_x4
+    ldr q7, [x4, #:lo12:mv_x_inc_x4]
+    ldr q6, [x5, #:lo12:mv_y_inc_x4]
+    ldr q5, [x6, #:lo12:mx_x_offset_x4]
     SIGN_EXTENSION x1,w1
     SIGN_EXTENSION x2,w2
     eor v4.16b, v4.16b, v4.16b
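
---

Editor's note on the pattern applied throughout this patch (a sketch, not part of the original commit): `ldr qN, label` is a PC-relative literal load, which only reaches +/-1 MB and forces the constant tables to sit next to the code in the executable section. The replacement `adrp`/`:lo12:` pair first materializes the symbol's 4 KB page address, then loads from the low-12-bit page offset; this reaches +/-4 GB and lets the tables move into `.rodata`, with `.previous` switching the assembler back to whatever section was active before. A minimal standalone illustration, assuming an ELF target and GNU-style syntax; the symbol `my_table` and label `load_my_table` are hypothetical, and with assemblers that lack a bare `.rodata` directive, `.section .rodata` is the equivalent:

    .rodata
    .align 4                               // 2^4 = 16-byte alignment for the q-register load
    my_table: .short 0, 1, -5, 20, 0, 0, 0, 0
    .previous

    load_my_table:
        // before: ldr q0, my_table       (literal load: +/-1 MB range, data in .text)
        adrp x0, my_table                  // x0 = my_table & ~0xfff (4 KB page base)
        ldr  q0, [x0, #:lo12:my_table]     // add the low 12 bits, load 16 bytes
        ret

The two-register variant used above for `intra_m7_to_p8 + 16` is exact on any page layout: `adrp` yields `sym & ~0xfff`, so `(sym & ~0xfff) + 16 + lo12(sym) = sym + 16`. Adding the 16 before applying the `:lo12:` offset is plain address arithmetic, avoiding the need for a separate relocation against `sym + 16`.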