blob: d45aa599a1bf39bc98dbcf7d9e8027c6b84449ea [file]
diff --git a/third_party/libopenjpeg/dwt.c b/third_party/libopenjpeg/dwt.c
index 731e7b2c9..e17e9f90b 100644
--- a/third_party/libopenjpeg/dwt.c
+++ b/third_party/libopenjpeg/dwt.c
@@ -55,6 +55,9 @@
#if (defined(__AVX2__) || defined(__AVX512F__))
#include <immintrin.h>
#endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
#if defined(__GNUC__)
#pragma GCC poison malloc calloc realloc free
@@ -3290,6 +3293,72 @@ static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
}
}
+#elif defined(__ARM_NEON)
+
+static void opj_v8dwt_decode_step1_neon(opj_v8_t* w,
+ OPJ_UINT32 start,
+ OPJ_UINT32 end,
+ const OPJ_FLOAT32 c)
+{
+ OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
+ OPJ_UINT32 i;
+ float32x4_t vc = vdupq_n_f32(c);
+ /* To be adapted if NB_ELTS_V8 changes */
+ fw += 2 * NB_ELTS_V8 * start;
+ for (i = start; i < end; ++i, fw += 2 * NB_ELTS_V8) {
+ float32x4_t v0 = vld1q_f32(fw);
+ float32x4_t v1 = vld1q_f32(fw + 4);
+ vst1q_f32(fw, vmulq_f32(v0, vc));
+ vst1q_f32(fw + 4, vmulq_f32(v1, vc));
+ }
+}
+
+static void opj_v8dwt_decode_step2_neon(opj_v8_t* l, opj_v8_t* w,
+ OPJ_UINT32 start,
+ OPJ_UINT32 end,
+ OPJ_UINT32 m,
+ OPJ_FLOAT32 c)
+{
+ OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
+ OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
+ OPJ_UINT32 i;
+ OPJ_UINT32 imax = opj_uint_min(end, m);
+ float32x4_t vc;
+ if (start > 0) {
+ fw += 2 * NB_ELTS_V8 * start;
+ fl = fw - 2 * NB_ELTS_V8;
+ }
+ /* To be adapted if NB_ELTS_V8 changes */
+ vc = vdupq_n_f32(c);
+ for (i = start; i < imax; ++i) {
+ float32x4_t fl0 = vld1q_f32(fl);
+ float32x4_t fl1 = vld1q_f32(fl + 4);
+ float32x4_t fw0 = vld1q_f32(fw);
+ float32x4_t fw1 = vld1q_f32(fw + 4);
+ float32x4_t fwm8 = vld1q_f32(fw - 8);
+ float32x4_t fwm4 = vld1q_f32(fw - 4);
+ fwm8 = vmlaq_f32(fwm8, vaddq_f32(fl0, fw0), vc);
+ fwm4 = vmlaq_f32(fwm4, vaddq_f32(fl1, fw1), vc);
+ vst1q_f32(fw - 8, fwm8);
+ vst1q_f32(fw - 4, fwm4);
+ fl = fw;
+ fw += 2 * NB_ELTS_V8;
+ }
+ if (m < end) {
+ float32x4_t vc2, fl0, fl1, fwm8, fwm4;
+ assert(m + 1 == end);
+ vc2 = vaddq_f32(vc, vc);
+ fl0 = vld1q_f32(fl);
+ fl1 = vld1q_f32(fl + 4);
+ fwm8 = vld1q_f32(fw - 8);
+ fwm4 = vld1q_f32(fw - 4);
+ fwm8 = vmlaq_f32(fwm8, fl0, vc2);
+ fwm4 = vmlaq_f32(fwm4, fl1, vc2);
+ vst1q_f32(fw - 8, fwm8);
+ vst1q_f32(fw - 4, fwm4);
+ }
+}
+
#else
static void opj_v8dwt_decode_step1(opj_v8_t* w,
@@ -3403,6 +3472,27 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
_mm_set1_ps(-opj_dwt_alpha));
+#elif defined(__ARM_NEON)
+ opj_v8dwt_decode_step1_neon(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
+ opj_K);
+ opj_v8dwt_decode_step1_neon(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
+ two_invK);
+ opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1,
+ dwt->win_l_x0, dwt->win_l_x1,
+ (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
+ -opj_dwt_delta);
+ opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1,
+ dwt->win_h_x0, dwt->win_h_x1,
+ (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
+ -opj_dwt_gamma);
+ opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1,
+ dwt->win_l_x0, dwt->win_l_x1,
+ (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
+ -opj_dwt_beta);
+ opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1,
+ dwt->win_h_x0, dwt->win_h_x1,
+ (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
+ -opj_dwt_alpha);
#else
opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
opj_K);