third_party/libopenjpeg/0049-dwt_neon_5-3_idwt.patch - pdfium.git - Git at Google

 diff --git a/third_party/libopenjpeg/dwt.c b/third_party/libopenjpeg/dwt.c
 index e17e9f90b..726f08ace 100644
 --- a/third_party/libopenjpeg/dwt.c
 +++ b/third_party/libopenjpeg/dwt.c
 @@ -73,7 +73,7 @@
  /** Number of int32 values in a AVX2 register */
  #define VREG_INT_COUNT       8
  #else
 -/** Number of int32 values in a SSE2 register */
 +/** Number of int32 values in a SSE2 or NEON register */
  #define VREG_INT_COUNT       4
  #endif

 @@ -707,7 +707,7 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
  #endif
  }

 -#if (defined(__SSE2__) || defined(__AVX2__) || defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION)
 +#if (defined(__ARM_NEON) || defined(__SSE2__) || defined(__AVX2__) || defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION)

  /* Conveniency macros to improve the readability of the formulas */
  #if defined(__AVX512F__)
 @@ -730,6 +730,16 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
  #define ADD(x,y)    _mm256_add_epi32((x),(y))
  #define SUB(x,y)    _mm256_sub_epi32((x),(y))
  #define SAR(x,y)    _mm256_srai_epi32((x),(y))
 +#elif defined(__ARM_NEON)
 +#define VREG        int32x4_t
 +#define LOAD_CST(x) vdupq_n_s32(x)
 +#define LOAD(x)     vld1q_s32((const int32_t*)(x))
 +#define LOADU(x)    vld1q_s32((const int32_t*)(x))
 +#define STORE(x,y)  vst1q_s32((int32_t*)(x),(y))
 +#define STOREU(x,y) vst1q_s32((int32_t*)(x),(y))
 +#define ADD(x,y)    vaddq_s32((x),(y))
 +#define SUB(x,y)    vsubq_s32((x),(y))
 +#define SAR(x,y)    vshrq_n_s32((x),(y))
  #else
  #define VREG        __m128i
  #define LOAD_CST(x) _mm_set1_epi32(x)
 @@ -763,9 +773,9 @@ void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
      }
  }

 -/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 - * 16 in AVX2, when top-most pixel is on even coordinate */
 -static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
 +/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
 + * or 16 in AVX2, when top-most pixel is on even coordinate */
 +static void opj_idwt53_v_cas0_mcols_SIMD(
      OPJ_INT32* tmp,
      const OPJ_INT32 sn,
      const OPJ_INT32 len,
 @@ -870,9 +880,9 @@ static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
  }


 -/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 - * 16 in AVX2, when top-most pixel is on odd coordinate */
 -static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
 +/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
 + * or 16 in AVX2, when top-most pixel is on odd coordinate */
 +static void opj_idwt53_v_cas1_mcols_SIMD(
      OPJ_INT32* tmp,
      const OPJ_INT32 sn,
      const OPJ_INT32 len,
 @@ -1112,11 +1122,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
      if (dwt->cas == 0) {
          /* If len == 1, unmodified value */

 -#if (defined(__SSE2__) || defined(__AVX2__))
 +#if (defined(__ARM_NEON) || defined(__SSE2__) || defined(__AVX2__))
          if (len > 1 && nb_cols == PARALLEL_COLS_53) {
 -            /* Same as below general case, except that thanks to SSE2/AVX2 */
 +            /* Same as below general case, except that thanks to SIMD */
              /* we can efficiently process 8/16 columns in parallel */
 -            opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 +            opj_idwt53_v_cas0_mcols_SIMD(dwt->mem, sn, len, tiledp_col, stride);
              return;
          }
  #endif
 @@ -1155,11 +1165,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
              return;
          }

 -#if (defined(__SSE2__) || defined(__AVX2__))
 +#if (defined(__ARM_NEON) || defined(__SSE2__) || defined(__AVX2__))
          if (len > 2 && nb_cols == PARALLEL_COLS_53) {
 -            /* Same as below general case, except that thanks to SSE2/AVX2 */
 +            /* Same as below general case, except that thanks to SIMD */
              /* we can efficiently process 8/16 columns in parallel */
 -            opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 +            opj_idwt53_v_cas1_mcols_SIMD(dwt->mem, sn, len, tiledp_col, stride);
              return;
          }
  #endif
	diff --git a/third_party/libopenjpeg/dwt.c b/third_party/libopenjpeg/dwt.c
	index e17e9f90b..726f08ace 100644
	--- a/third_party/libopenjpeg/dwt.c
	+++ b/third_party/libopenjpeg/dwt.c
	@@ -73,7 +73,7 @@
	/** Number of int32 values in a AVX2 register */
	#define VREG_INT_COUNT 8
	#else
	-/** Number of int32 values in a SSE2 register */
	+/** Number of int32 values in a SSE2 or NEON register */
	#define VREG_INT_COUNT 4
	#endif

	@@ -707,7 +707,7 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
	#endif
	}

	-#if (defined(__SSE2__) \|\| defined(__AVX2__) \|\| defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION)
	+#if (defined(__ARM_NEON) \|\| defined(__SSE2__) \|\| defined(__AVX2__) \|\| defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION)

	/* Conveniency macros to improve the readability of the formulas */
	#if defined(__AVX512F__)
	@@ -730,6 +730,16 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
	#define ADD(x,y) _mm256_add_epi32((x),(y))
	#define SUB(x,y) _mm256_sub_epi32((x),(y))
	#define SAR(x,y) _mm256_srai_epi32((x),(y))
	+#elif defined(__ARM_NEON)
	+#define VREG int32x4_t
	+#define LOAD_CST(x) vdupq_n_s32(x)
	+#define LOAD(x) vld1q_s32((const int32_t*)(x))
	+#define LOADU(x) vld1q_s32((const int32_t*)(x))
	+#define STORE(x,y) vst1q_s32((int32_t*)(x),(y))
	+#define STOREU(x,y) vst1q_s32((int32_t*)(x),(y))
	+#define ADD(x,y) vaddq_s32((x),(y))
	+#define SUB(x,y) vsubq_s32((x),(y))
	+#define SAR(x,y) vshrq_n_s32((x),(y))
	#else
	#define VREG __m128i
	#define LOAD_CST(x) _mm_set1_epi32(x)
	@@ -763,9 +773,9 @@ void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
	}
	}

	-/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
	- * 16 in AVX2, when top-most pixel is on even coordinate */
	-static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
	+/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
	+ * or 16 in AVX2, when top-most pixel is on even coordinate */
	+static void opj_idwt53_v_cas0_mcols_SIMD(
	OPJ_INT32* tmp,
	const OPJ_INT32 sn,
	const OPJ_INT32 len,
	@@ -870,9 +880,9 @@ static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
	}


	-/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
	- * 16 in AVX2, when top-most pixel is on odd coordinate */
	-static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
	+/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2 and NEON,
	+ * or 16 in AVX2, when top-most pixel is on odd coordinate */
	+static void opj_idwt53_v_cas1_mcols_SIMD(
	OPJ_INT32* tmp,
	const OPJ_INT32 sn,
	const OPJ_INT32 len,
	@@ -1112,11 +1122,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
	if (dwt->cas == 0) {
	/* If len == 1, unmodified value */

	-#if (defined(__SSE2__) \|\| defined(__AVX2__))
	+#if (defined(__ARM_NEON) \|\| defined(__SSE2__) \|\| defined(__AVX2__))
	if (len > 1 && nb_cols == PARALLEL_COLS_53) {
	- /* Same as below general case, except that thanks to SSE2/AVX2 */
	+ /* Same as below general case, except that thanks to SIMD */
	/* we can efficiently process 8/16 columns in parallel */
	- opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
	+ opj_idwt53_v_cas0_mcols_SIMD(dwt->mem, sn, len, tiledp_col, stride);
	return;
	}
	#endif
	@@ -1155,11 +1165,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
	return;
	}

	-#if (defined(__SSE2__) \|\| defined(__AVX2__))
	+#if (defined(__ARM_NEON) \|\| defined(__SSE2__) \|\| defined(__AVX2__))
	if (len > 2 && nb_cols == PARALLEL_COLS_53) {
	- /* Same as below general case, except that thanks to SSE2/AVX2 */
	+ /* Same as below general case, except that thanks to SIMD */
	/* we can efficiently process 8/16 columns in parallel */
	- opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
	+ opj_idwt53_v_cas1_mcols_SIMD(dwt->mem, sn, len, tiledp_col, stride);
	return;
	}
	#endif