| diff --git a/third_party/libopenjpeg/dwt.c b/third_party/libopenjpeg/dwt.c |
| index 731e7b2c9..e17e9f90b 100644 |
| --- a/third_party/libopenjpeg/dwt.c |
| +++ b/third_party/libopenjpeg/dwt.c |
| @@ -55,6 +55,9 @@ |
| #if (defined(__AVX2__) || defined(__AVX512F__)) |
| #include <immintrin.h> |
| #endif |
| +#ifdef __ARM_NEON |
| +#include <arm_neon.h> |
| +#endif |
| |
| #if defined(__GNUC__) |
| #pragma GCC poison malloc calloc realloc free |
| @@ -3290,6 +3293,72 @@ static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w, |
| } |
| } |
| |
| +#elif defined(__ARM_NEON) |
| + |
| +static void opj_v8dwt_decode_step1_neon(opj_v8_t* w, /* NEON port of step1: scales every other opj_v8_t (8 floats) in w by c over [start, end) */ |
| + OPJ_UINT32 start, |
| + OPJ_UINT32 end, |
| + const OPJ_FLOAT32 c) /* scaling constant; callers pass opj_K or two_invK */ |
| +{ |
| + OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; |
| + OPJ_UINT32 i; |
| + float32x4_t vc = vdupq_n_f32(c); /* broadcast c into all 4 lanes */ |
| + /* To be adapted if NB_ELTS_V8 changes */ |
| + fw += 2 * NB_ELTS_V8 * start; /* advance to element index 'start' (stride is 2 opj_v8_t per step) */ |
| + for (i = start; i < end; ++i, fw += 2 * NB_ELTS_V8) { |
| + float32x4_t v0 = vld1q_f32(fw); /* one opj_v8_t = 8 floats = two 128-bit NEON vectors */ |
| + float32x4_t v1 = vld1q_f32(fw + 4); |
| + vst1q_f32(fw, vmulq_f32(v0, vc)); |
| + vst1q_f32(fw + 4, vmulq_f32(v1, vc)); |
| + } |
| +} |
| + |
| +static void opj_v8dwt_decode_step2_neon(opj_v8_t* l, opj_v8_t* w, /* NEON port of step2: lifting update fw[i-1] += c * (fl + fw[i]) over [start, min(end, m)) */ |
| + OPJ_UINT32 start, |
| + OPJ_UINT32 end, |
| + OPJ_UINT32 m, /* number of valid neighbour pairs; tail handled separately when m < end */ |
| + OPJ_FLOAT32 c) /* lifting constant (negated alpha/beta/gamma/delta at call sites) */ |
| +{ |
| + OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l; |
| + OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w; |
| + OPJ_UINT32 i; |
| + OPJ_UINT32 imax = opj_uint_min(end, m); |
| + float32x4_t vc; |
| + if (start > 0) { |
| + fw += 2 * NB_ELTS_V8 * start; /* seek fw to 'start'; left neighbour is then the preceding element */ |
| + fl = fw - 2 * NB_ELTS_V8; |
| + } |
| + /* To be adapted if NB_ELTS_V8 changes */ |
| + vc = vdupq_n_f32(c); /* broadcast lifting constant */ |
| + for (i = start; i < imax; ++i) { |
| + float32x4_t fl0 = vld1q_f32(fl); /* left neighbour, low/high halves of 8 floats */ |
| + float32x4_t fl1 = vld1q_f32(fl + 4); |
| + float32x4_t fw0 = vld1q_f32(fw); /* right neighbour */ |
| + float32x4_t fw1 = vld1q_f32(fw + 4); |
| + float32x4_t fwm8 = vld1q_f32(fw - 8); /* element being updated sits just before fw */ |
| + float32x4_t fwm4 = vld1q_f32(fw - 4); |
| + fwm8 = vmlaq_f32(fwm8, vaddq_f32(fl0, fw0), vc); /* fw[-1] += (fl + fw) * c, low half */ |
| + fwm4 = vmlaq_f32(fwm4, vaddq_f32(fl1, fw1), vc); /* high half */ |
| + vst1q_f32(fw - 8, fwm8); |
| + vst1q_f32(fw - 4, fwm4); |
| + fl = fw; /* current right neighbour becomes next iteration's left neighbour */ |
| + fw += 2 * NB_ELTS_V8; |
| + } |
| + if (m < end) { /* boundary: no right neighbour, so mirror the left one (hence c doubled) */ |
| + float32x4_t vc2, fl0, fl1, fwm8, fwm4; |
| + assert(m + 1 == end); |
| + vc2 = vaddq_f32(vc, vc); /* 2*c, since fl stands in for both neighbours */ |
| + fl0 = vld1q_f32(fl); |
| + fl1 = vld1q_f32(fl + 4); |
| + fwm8 = vld1q_f32(fw - 8); |
| + fwm4 = vld1q_f32(fw - 4); |
| + fwm8 = vmlaq_f32(fwm8, fl0, vc2); /* fw[-1] += 2 * c * fl */ |
| + fwm4 = vmlaq_f32(fwm4, fl1, vc2); |
| + vst1q_f32(fw - 8, fwm8); |
| + vst1q_f32(fw - 4, fwm4); |
| + } |
| +} |
| + |
| #else |
| |
| static void opj_v8dwt_decode_step1(opj_v8_t* w, |
| @@ -3403,6 +3472,27 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) |
| dwt->win_h_x0, dwt->win_h_x1, |
| (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), |
| _mm_set1_ps(-opj_dwt_alpha)); |
| +#elif defined(__ARM_NEON) /* NEON branch mirroring the SSE path above: scaling then four lifting passes */ |
| + opj_v8dwt_decode_step1_neon(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, |
| + opj_K); /* scale low-pass band by K */ |
| + opj_v8dwt_decode_step1_neon(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, |
| + two_invK); /* scale high-pass band by 2/K */ |
| + opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1, |
| + dwt->win_l_x0, dwt->win_l_x1, |
| + (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), |
| + -opj_dwt_delta); /* lifting pass 1 (delta) */ |
| + opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1, |
| + dwt->win_h_x0, dwt->win_h_x1, |
| + (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), |
| + -opj_dwt_gamma); /* lifting pass 2 (gamma) */ |
| + opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1, |
| + dwt->win_l_x0, dwt->win_l_x1, |
| + (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), |
| + -opj_dwt_beta); /* lifting pass 3 (beta) */ |
| + opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1, |
| + dwt->win_h_x0, dwt->win_h_x1, |
| + (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), |
| + -opj_dwt_alpha); /* lifting pass 4 (alpha) */ |
| #else |
| opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, |
| opj_K); |