Update openjpeg with alchemy changes

2020-09-29 14:40:25 -04:00
parent f941bc4334
commit f0dfb0b080
12 changed files with 753 additions and 220 deletions
--- a/indra/libopenjpeg/dwt.c
+++ b/indra/libopenjpeg/dwt.c
@@ -31,11 +31,16 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

+#define OPJ_SKIP_POISON
+#include "opj_includes.h"
+
 #ifdef __SSE__
 #include <xmmintrin.h>
 #endif

-#include "opj_includes.h"
+#if defined(__GNUC__)
+#pragma GCC poison malloc calloc realloc free
+#endif

 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
 /*@{*/
@@ -499,7 +504,7 @@ void dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, int prec) {
 /* <summary>                             */
 /* Determine maximum computed resolution level for inverse wavelet transform */
 /* </summary>                            */
-static int dwt_decode_max_resolution(opj_tcd_resolution_t* restrict r, int i) {
+static int dwt_decode_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, int i) {
 	int mr	= 1;
 	int w;
 	while( --i ) {
@@ -531,7 +536,7 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
 	v.mem = h.mem;

 	while( --numres) {
-		int * restrict tiledp = tilec->data;
+		int * OPJ_RESTRICT tiledp = tilec->data;
 		int j;

 		++tr;
@@ -565,48 +570,49 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
 	opj_aligned_free(h.mem);
 }

-static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, int size){
-	float* restrict bi = (float*) (w->wavelet + w->cas);
+static void v4dwt_interleave_h(v4dwt_t* OPJ_RESTRICT w, float* OPJ_RESTRICT a, int x, int size) {
+	float* OPJ_RESTRICT bi = (float*)(w->wavelet + w->cas);
 	int count = w->sn;
 	int i, k;
-	for(k = 0; k < 2; ++k){
-		if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
+	for (k = 0; k < 2; ++k) { /* pass 0 copies w->sn entries to offset w->cas, pass 1 copies w->dn entries to offset 1 - w->cas */
+		if (count + 3 * x < size && ((size_t)a & 0x0f) == 0 && ((size_t)bi & 0x0f) == 0 && (x & 0x0f) == 0) {
 			/* Fast code path */
-			for(i = 0; i < count; ++i){
+			for (i = 0; i < count; ++i) {
 				int j = i;
-				bi[i*8    ] = a[j];
+				bi[i * 8] = a[j];
 				j += x;
-				bi[i*8 + 1] = a[j];
+				bi[i * 8 + 1] = a[j];
 				j += x;
-				bi[i*8 + 2] = a[j];
+				bi[i * 8 + 2] = a[j];
 				j += x;
-				bi[i*8 + 3] = a[j];
+				bi[i * 8 + 3] = a[j];
 			}
-		} else {
+		}
+		else {
 			/* Slow code path */
-		for(i = 0; i < count; ++i){
-			int j = i;
-			bi[i*8    ] = a[j];
-			j += x;
-			if(j > size) continue;
-			bi[i*8 + 1] = a[j];
-			j += x;
-			if(j > size) continue;
-			bi[i*8 + 2] = a[j];
-			j += x;
-			if(j > size) continue;
-			bi[i*8 + 3] = a[j];
+			for (i = 0; i < count; ++i) { /* gathers up to four values spaced x apart, checking the bound before each extra read */
+				int j = i;
+				bi[i * 8] = a[j];
+				j += x;
+				if (j >= size) continue; /* size is an exclusive bound on indexes into a (cf. the fast-path test), so j == size must be skipped too */
+				bi[i * 8 + 1] = a[j];
+				j += x;
+				if (j >= size) continue; /* same exclusive-bound check for the third value */
+				bi[i * 8 + 2] = a[j];
+				j += x;
+				if (j >= size) continue; /* same exclusive-bound check for the fourth value */
+				bi[i * 8 + 3] = a[j];
+			}
 		}
-		}
-		bi = (float*) (w->wavelet + 1 - w->cas);
+		bi = (float*)(w->wavelet + 1 - w->cas);
 		a += w->sn;
 		size -= w->sn;
 		count = w->dn;
 	}
 }

-static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
-	v4* restrict bi = v->wavelet + v->cas;
+static void v4dwt_interleave_v(v4dwt_t* OPJ_RESTRICT v , float* OPJ_RESTRICT a , int x){
+	v4* OPJ_RESTRICT bi = v->wavelet + v->cas;
 	int i;
 	for(i = 0; i < v->sn; ++i){
 		memcpy(&bi[i*2], &a[i*x], 4 * sizeof(float));
@@ -621,7 +627,7 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
 #ifdef __SSE__

 static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
-	__m128* restrict vw = (__m128*) w;
+	__m128* OPJ_RESTRICT vw = (__m128*) w;
 	int i;
 	/* 4x unrolled loop */
 	for(i = 0; i < count >> 2; ++i){
@@ -642,22 +648,39 @@ static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
 }

 static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
-	__m128* restrict vl = (__m128*) l;
-	__m128* restrict vw = (__m128*) w;
+	__m128* OPJ_RESTRICT vl = (__m128*) l;
+	__m128* OPJ_RESTRICT vw = (__m128*) w;
 	int i;
 	__m128 tmp1, tmp2, tmp3;
 	tmp1 = vl[0];
-	for(i = 0; i < m; ++i){
+	for (i = 0; i < m - 3; i += 4) {
+		__m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+		tmp2 = vw[-1];
+		tmp3 = vw[0];
+		tmp4 = vw[1];
+		tmp5 = vw[2];
+		tmp6 = vw[3];
+		tmp7 = vw[4];
+		tmp8 = vw[5];
+		tmp9 = vw[6];
+		vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
+		vw[1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
+		vw[3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
+		vw[5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
+		tmp1 = tmp9;
+		vw += 8;
+	}
+	for ( ; i < m; ++i) {
 		tmp2 = vw[-1];
 		tmp3 = vw[ 0];
 		vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
 		tmp1 = tmp3;
 		vw += 2;
 	}
-	vl = vw - 2;
 	if(m >= k){
 		return;
 	}
+	vl = vw - 2;
 	c = _mm_add_ps(c, c);
 	c = _mm_mul_ps(c, vl[0]);
 	for(; m < k; ++m){
@@ -670,7 +693,7 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
 #else

 static void v4dwt_decode_step1(v4* w, int count, const float c){
-	float* restrict fw = (float*) w;
+	float* OPJ_RESTRICT fw = (float*) w;
 	int i;
 	for(i = 0; i < count; ++i){
 		float tmp1 = fw[i*8    ];
@@ -685,8 +708,8 @@ static void v4dwt_decode_step1(v4* w, int count, const float c){
 }

 static void v4dwt_decode_step2(v4* l, v4* w, int k, int m, float c){
-	float* restrict fl = (float*) l;
-	float* restrict fw = (float*) w;
+	float* OPJ_RESTRICT fl = (float*) l;
+	float* OPJ_RESTRICT fw = (float*) w;
 	int i;
 	for(i = 0; i < m; ++i){
 		float tmp1_1 = fl[0];
@@ -737,42 +760,44 @@ static void v4dwt_decode_step2(v4* l, v4* w, int k, int m, float c){
 /* <summary>                             */
 /* Inverse 9-7 wavelet transform in 1-D. */
 /* </summary>                            */
-static void v4dwt_decode(v4dwt_t* restrict dwt){
+static void v4dwt_decode(v4dwt_t* OPJ_RESTRICT dwt){
 	int a, b;
 	if(dwt->cas == 0) {
-		if(!((dwt->dn > 0) || (dwt->sn > 1))){
+		if (dwt->dn <= 0 && dwt->sn <= 1) { /* De Morgan form of the removed test: nothing to transform */
 			return;
 		}
 		a = 0;
 		b = 1;
 	}else{
-		if(!((dwt->sn > 0) || (dwt->dn > 1))) {
+		if (dwt->sn <= 0 && dwt->dn <= 1) { /* mirrored early-out with sn and dn swapped */
 			return;
 		}
 		a = 1;
 		b = 0;
 	}
+	v4* waveleta = dwt->wavelet + a; /* deliberately not restrict: shares dwt->wavelet with waveletb, and each step writes elements the other pointer reads */
+	v4* waveletb = dwt->wavelet + b; /* hoisted once so the step calls below do not recompute dwt->wavelet + b */
 #ifdef __SSE__
-	v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
-	v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
-	v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
-	v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
-	v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
-	v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
+	v4dwt_decode_step1_sse(waveleta, dwt->sn, _mm_set1_ps(K));
+	v4dwt_decode_step1_sse(waveletb, dwt->dn, _mm_set1_ps(c13318));
+	v4dwt_decode_step2_sse(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
+	v4dwt_decode_step2_sse(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
+	v4dwt_decode_step2_sse(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
+	v4dwt_decode_step2_sse(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
 #else
-	v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
-	v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
-	v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
-	v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
-	v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
-	v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
+	v4dwt_decode_step1(waveleta, dwt->sn, K);
+	v4dwt_decode_step1(waveletb, dwt->dn, c13318);
+	v4dwt_decode_step2(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
+	v4dwt_decode_step2(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
+	v4dwt_decode_step2(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
+	v4dwt_decode_step2(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
 #endif
 }

 /* <summary>                             */
 /* Inverse 9-7 wavelet transform in 2-D. */
 /* </summary>                            */
-void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
+void dwt_decode_real(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, int numres){
 	v4dwt_t h;
 	v4dwt_t v;

@@ -787,7 +812,7 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
 	v.wavelet = h.wavelet;

 	while( --numres) {
-		float * restrict aj = (float*) tilec->data;
+		float * OPJ_RESTRICT aj = (float*) tilec->data;
 		int bufsize = (tilec->x1 - tilec->x0) * (tilec->y1 - tilec->y0);
 		int j;