Update openjpeg with alchemy changes

2020-09-29 14:40:25 -04:00
parent f941bc4334
commit f0dfb0b080
12 changed files with 753 additions and 220 deletions
--- a/indra/libopenjpeg/CMakeLists.txt
+++ b/indra/libopenjpeg/CMakeLists.txt
@@ -26,6 +26,7 @@ set(openjpeg_SOURCE_FILES
    mct.c
    mqc.c
    openjpeg.c
+    opj_malloc.c
    phix_manager.c
    pi.c
    ppix_manager.c
--- a/indra/libopenjpeg/dwt.c
+++ b/indra/libopenjpeg/dwt.c
@@ -31,11 +31,16 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

+#define OPJ_SKIP_POISON
+#include "opj_includes.h"
+
 #ifdef __SSE__
 #include <xmmintrin.h>
 #endif

-#include "opj_includes.h"
+#if defined(__GNUC__)
+#pragma GCC poison malloc calloc realloc free
+#endif

 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
 /*@{*/
@@ -499,7 +504,7 @@ void dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, int prec) {
 /* <summary>                             */
 /* Determine maximum computed resolution level for inverse wavelet transform */
 /* </summary>                            */
-static int dwt_decode_max_resolution(opj_tcd_resolution_t* restrict r, int i) {
+static int dwt_decode_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, int i) {
 	int mr	= 1;
 	int w;
 	while( --i ) {
@@ -531,7 +536,7 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
 	v.mem = h.mem;

 	while( --numres) {
-		int * restrict tiledp = tilec->data;
+		int * OPJ_RESTRICT tiledp = tilec->data;
 		int j;

 		++tr;
@@ -565,48 +570,49 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
 	opj_aligned_free(h.mem);
 }

-static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, int size){
-	float* restrict bi = (float*) (w->wavelet + w->cas);
+static void v4dwt_interleave_h(v4dwt_t* OPJ_RESTRICT w, float* OPJ_RESTRICT a, int x, int size) {
+	float* OPJ_RESTRICT bi = (float*)(w->wavelet + w->cas);
 	int count = w->sn;
 	int i, k;
-	for(k = 0; k < 2; ++k){
-		if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
+	for (k = 0; k < 2; ++k) {
+		if (count + 3 * x < size && ((size_t)a & 0x0f) == 0 && ((size_t)bi & 0x0f) == 0 && (x & 0x0f) == 0) {
 			/* Fast code path */
-			for(i = 0; i < count; ++i){
+			for (i = 0; i < count; ++i) {
 				int j = i;
-				bi[i*8    ] = a[j];
+				bi[i * 8] = a[j];
 				j += x;
-				bi[i*8 + 1] = a[j];
+				bi[i * 8 + 1] = a[j];
 				j += x;
-				bi[i*8 + 2] = a[j];
+				bi[i * 8 + 2] = a[j];
 				j += x;
-				bi[i*8 + 3] = a[j];
+				bi[i * 8 + 3] = a[j];
 			}
-		} else {
+		}
+		else {
 			/* Slow code path */
-		for(i = 0; i < count; ++i){
-			int j = i;
-			bi[i*8    ] = a[j];
-			j += x;
-			if(j > size) continue;
-			bi[i*8 + 1] = a[j];
-			j += x;
-			if(j > size) continue;
-			bi[i*8 + 2] = a[j];
-			j += x;
-			if(j > size) continue;
-			bi[i*8 + 3] = a[j];
+			for (i = 0; i < count; ++i) {
+				int j = i;
+				bi[i * 8] = a[j];
+				j += x;
+				if (j > size) continue;
+				bi[i * 8 + 1] = a[j];
+				j += x;
+				if (j > size) continue;
+				bi[i * 8 + 2] = a[j];
+				j += x;
+				if (j > size) continue;
+				bi[i * 8 + 3] = a[j];
+			}
 		}
-		}
-		bi = (float*) (w->wavelet + 1 - w->cas);
+		bi = (float*)(w->wavelet + 1 - w->cas);
 		a += w->sn;
 		size -= w->sn;
 		count = w->dn;
 	}
 }

-static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
-	v4* restrict bi = v->wavelet + v->cas;
+static void v4dwt_interleave_v(v4dwt_t* OPJ_RESTRICT v , float* OPJ_RESTRICT a , int x){
+	v4* OPJ_RESTRICT bi = v->wavelet + v->cas;
 	int i;
 	for(i = 0; i < v->sn; ++i){
 		memcpy(&bi[i*2], &a[i*x], 4 * sizeof(float));
@@ -621,7 +627,7 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
 #ifdef __SSE__

 static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
-	__m128* restrict vw = (__m128*) w;
+	__m128* OPJ_RESTRICT vw = (__m128*) w;
 	int i;
 	/* 4x unrolled loop */
 	for(i = 0; i < count >> 2; ++i){
@@ -642,22 +648,39 @@ static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
 }

 static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
-	__m128* restrict vl = (__m128*) l;
-	__m128* restrict vw = (__m128*) w;
+	__m128* OPJ_RESTRICT vl = (__m128*) l;
+	__m128* OPJ_RESTRICT vw = (__m128*) w;
 	int i;
 	__m128 tmp1, tmp2, tmp3;
 	tmp1 = vl[0];
-	for(i = 0; i < m; ++i){
+	for (i = 0; i < m - 3; i += 4) {
+		__m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+		tmp2 = vw[-1];
+		tmp3 = vw[0];
+		tmp4 = vw[1];
+		tmp5 = vw[2];
+		tmp6 = vw[3];
+		tmp7 = vw[4];
+		tmp8 = vw[5];
+		tmp9 = vw[6];
+		vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
+		vw[1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
+		vw[3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
+		vw[5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
+		tmp1 = tmp9;
+		vw += 8;
+	}
+	for ( ; i < m; ++i) {
 		tmp2 = vw[-1];
 		tmp3 = vw[ 0];
 		vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
 		tmp1 = tmp3;
 		vw += 2;
 	}
-	vl = vw - 2;
 	if(m >= k){
 		return;
 	}
+	vl = vw - 2;
 	c = _mm_add_ps(c, c);
 	c = _mm_mul_ps(c, vl[0]);
 	for(; m < k; ++m){
@@ -670,7 +693,7 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
 #else

 static void v4dwt_decode_step1(v4* w, int count, const float c){
-	float* restrict fw = (float*) w;
+	float* OPJ_RESTRICT fw = (float*) w;
 	int i;
 	for(i = 0; i < count; ++i){
 		float tmp1 = fw[i*8    ];
@@ -685,8 +708,8 @@ static void v4dwt_decode_step1(v4* w, int count, const float c){
 }

 static void v4dwt_decode_step2(v4* l, v4* w, int k, int m, float c){
-	float* restrict fl = (float*) l;
-	float* restrict fw = (float*) w;
+	float* OPJ_RESTRICT fl = (float*) l;
+	float* OPJ_RESTRICT fw = (float*) w;
 	int i;
 	for(i = 0; i < m; ++i){
 		float tmp1_1 = fl[0];
@@ -737,42 +760,44 @@ static void v4dwt_decode_step2(v4* l, v4* w, int k, int m, float c){
 /* <summary>                             */
 /* Inverse 9-7 wavelet transform in 1-D. */
 /* </summary>                            */
-static void v4dwt_decode(v4dwt_t* restrict dwt){
+static void v4dwt_decode(v4dwt_t* OPJ_RESTRICT dwt){
 	int a, b;
 	if(dwt->cas == 0) {
-		if(!((dwt->dn > 0) || (dwt->sn > 1))){
+		if (dwt->dn <= 0 && dwt->sn <= 1) {
 			return;
 		}
 		a = 0;
 		b = 1;
 	}else{
-		if(!((dwt->sn > 0) || (dwt->dn > 1))) {
+		if (dwt->sn <= 0 && dwt->dn <= 1) {
 			return;
 		}
 		a = 1;
 		b = 0;
 	}
+	v4* OPJ_RESTRICT waveleta = dwt->wavelet + a;
+	v4* OPJ_RESTRICT waveletb = dwt->wavelet + b;
 #ifdef __SSE__
-	v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
-	v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
-	v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
-	v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
-	v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
-	v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
+	v4dwt_decode_step1_sse(waveleta, dwt->sn, _mm_set1_ps(K));
+	v4dwt_decode_step1_sse(waveletb, dwt->dn, _mm_set1_ps(c13318));
+	v4dwt_decode_step2_sse(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
+	v4dwt_decode_step2_sse(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
+	v4dwt_decode_step2_sse(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
+	v4dwt_decode_step2_sse(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
 #else
-	v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
-	v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
-	v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
-	v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
-	v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
-	v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
+	v4dwt_decode_step1(waveleta, dwt->sn, K);
+	v4dwt_decode_step1(waveletb, dwt->dn, c13318);
+	v4dwt_decode_step2(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
+	v4dwt_decode_step2(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
+	v4dwt_decode_step2(waveletb, waveleta + 1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
+	v4dwt_decode_step2(waveleta, waveletb + 1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
 #endif
 }

 /* <summary>                             */
 /* Inverse 9-7 wavelet transform in 2-D. */
 /* </summary>                            */
-void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
+void dwt_decode_real(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, int numres){
 	v4dwt_t h;
 	v4dwt_t v;

@@ -787,7 +812,7 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
 	v.wavelet = h.wavelet;

 	while( --numres) {
-		float * restrict aj = (float*) tilec->data;
+		float * OPJ_RESTRICT aj = (float*) tilec->data;
 		int bufsize = (tilec->x1 - tilec->x0) * (tilec->y1 - tilec->y0);
 		int j;

--- a/indra/libopenjpeg/mct.c
+++ b/indra/libopenjpeg/mct.c
@@ -29,11 +29,16 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

+#define OPJ_SKIP_POISON
+#include "opj_includes.h"
+
 #ifdef __SSE__
 #include <xmmintrin.h>
 #endif

-#include "opj_includes.h"
+#if defined(__GNUC__)
+#pragma GCC poison malloc calloc realloc free
+#endif

 /* <summary> */
 /* This table contains the norms of the basis function of the reversible MCT. */
@@ -49,17 +54,38 @@ static const double mct_norms_real[3] = { 1.732, 1.805, 1.573 };
 /* Foward reversible MCT. */
 /* </summary> */
 void mct_encode(
-		int* restrict c0,
-		int* restrict c1,
-		int* restrict c2,
+		int* OPJ_RESTRICT c0,
+		int* OPJ_RESTRICT c1,
+		int* OPJ_RESTRICT c2,
 		int n)
 {
-	int i;
-	for(i = 0; i < n; ++i) {
+	int i = 0;
+#ifdef __SSE2__
+	/* Buffers are normally aligned on 16 bytes... */
+	if (((size_t)c0 & 0xf) == 0 && ((size_t)c1 & 0xf) == 0 && ((size_t)c2 & 0xf) == 0) {
+		const int cnt = n & ~3U;
+		for (; i < cnt; i += 4) {
+			__m128i y, u, v;
+			__m128i r = _mm_load_si128((const __m128i*) & (c0[i]));
+			__m128i g = _mm_load_si128((const __m128i*) & (c1[i]));
+			__m128i b = _mm_load_si128((const __m128i*) & (c2[i]));
+			y = _mm_add_epi32(g, g);
+			y = _mm_add_epi32(y, b);
+			y = _mm_add_epi32(y, r);
+			y = _mm_srai_epi32(y, 2);
+			u = _mm_sub_epi32(b, g);
+			v = _mm_sub_epi32(r, g);
+			_mm_store_si128((__m128i*) & (c0[i]), y);
+			_mm_store_si128((__m128i*) & (c1[i]), u);
+			_mm_store_si128((__m128i*) & (c2[i]), v);
+		}
+	}
+#endif
+	for (; i < n; ++i) {
 		int r = c0[i];
 		int g = c1[i];
 		int b = c2[i];
-		int y = (r + (g * 2) + b) >> 2;
+		int y = (r + g + g + b) >> 2;
 		int u = b - g;
 		int v = r - g;
 		c0[i] = y;
@@ -72,13 +98,32 @@ void mct_encode(
 /* Inverse reversible MCT. */
 /* </summary> */
 void mct_decode(
-		int* restrict c0,
-		int* restrict c1, 
-		int* restrict c2, 
+		int* OPJ_RESTRICT c0,
+		int* OPJ_RESTRICT c1,
+		int* OPJ_RESTRICT c2,
 		int n)
 {
-	int i;
-	for (i = 0; i < n; ++i) {
+	int i = 0;
+#ifdef __SSE2__
+	/* Buffers are normally aligned on 16 bytes... */
+	if (((size_t)c0 & 0xf) == 0 && ((size_t)c1 & 0xf) == 0 && ((size_t)c2 & 0xf) == 0) {
+		const int cnt = n & ~3U;
+		for (; i < cnt; i += 4) {
+			__m128i r, g, b;
+			__m128i y = _mm_load_si128((const __m128i*) & (c0[i]));
+			__m128i u = _mm_load_si128((const __m128i*) & (c1[i]));
+			__m128i v = _mm_load_si128((const __m128i*) & (c2[i]));
+			g = y;
+			g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2));
+			r = _mm_add_epi32(v, g);
+			b = _mm_add_epi32(u, g);
+			_mm_store_si128((__m128i*) & (c0[i]), r);
+			_mm_store_si128((__m128i*) & (c1[i]), g);
+			_mm_store_si128((__m128i*) & (c2[i]), b);
+}
+	}
+#endif
+	for (; i < n; ++i) {
 		int y = c0[i];
 		int u = c1[i];
 		int v = c2[i];
@@ -102,13 +147,119 @@ double mct_getnorm(int compno) {
 /* Foward irreversible MCT. */
 /* </summary> */
 void mct_encode_real(
-		int* restrict c0,
-		int* restrict c1,
-		int* restrict c2,
+		int* OPJ_RESTRICT c0,
+		int* OPJ_RESTRICT c1,
+		int* OPJ_RESTRICT c2,
 		int n)
 {
-	int i;
-	for(i = 0; i < n; ++i) {
+	int i = 0;
+#ifdef __SSE4_1__
+	/* Buffers are normally aligned on 16 bytes... */
+	if (((size_t)c0 & 0xf) == 0 && ((size_t)c1 & 0xf) == 0 && ((size_t)c2 & 0xf) == 0) {
+		const int cnt = n & ~3U;
+		const __m128i ry = _mm_set1_epi32(2449);
+		const __m128i gy = _mm_set1_epi32(4809);
+		const __m128i by = _mm_set1_epi32(934);
+		const __m128i ru = _mm_set1_epi32(1382);
+		const __m128i gu = _mm_set1_epi32(2714);
+		const __m128i gv = _mm_set1_epi32(3430);
+		const __m128i bv = _mm_set1_epi32(666);
+		const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), _MM_SHUFFLE(1, 0, 1, 0));
+		for (; i < cnt; i += 4) {
+			__m128i lo, hi, y, u, v;
+			__m128i r = _mm_load_si128((const __m128i*) & (c0[i]));
+			__m128i g = _mm_load_si128((const __m128i*) & (c1[i]));
+			__m128i b = _mm_load_si128((const __m128i*) & (c2[i]));
+
+			hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(r, ry);
+			hi = _mm_mul_epi32(hi, ry);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			y = _mm_blend_epi16(lo, hi, 0xCC);
+
+			hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(g, gy);
+			hi = _mm_mul_epi32(hi, gy);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
+
+			hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(b, by);
+			hi = _mm_mul_epi32(hi, by);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
+			_mm_store_si128((__m128i*) & (c0[i]), y);
+
+			lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0)));
+			hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1)));
+			lo = _mm_slli_epi64(lo, 12);
+			hi = _mm_slli_epi64(hi, 12);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			u = _mm_blend_epi16(lo, hi, 0xCC);
+
+			hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(r, ru);
+			hi = _mm_mul_epi32(hi, ru);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
+
+			hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(g, gu);
+			hi = _mm_mul_epi32(hi, gu);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
+			_mm_store_si128((__m128i*) & (c1[i]), u);
+
+			lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0)));
+			hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1)));
+			lo = _mm_slli_epi64(lo, 12);
+			hi = _mm_slli_epi64(hi, 12);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			v = _mm_blend_epi16(lo, hi, 0xCC);
+
+			hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(g, gv);
+			hi = _mm_mul_epi32(hi, gv);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
+
+			hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
+			lo = _mm_mul_epi32(b, bv);
+			hi = _mm_mul_epi32(hi, bv);
+			lo = _mm_add_epi64(lo, mulround);
+			hi = _mm_add_epi64(hi, mulround);
+			lo = _mm_srli_epi64(lo, 13);
+			hi = _mm_slli_epi64(hi, 32 - 13);
+			v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
+			_mm_store_si128((__m128i*) & (c2[i]), v);
+		}
+	}
+#endif
+	for (; i < n; ++i) {
 		int r = c0[i];
 		int g = c1[i];
 		int b = c2[i];
@@ -125,19 +276,21 @@ void mct_encode_real(
 /* Inverse irreversible MCT. */
 /* </summary> */
 void mct_decode_real(
-		float* restrict c0,
-		float* restrict c1,
-		float* restrict c2,
+		float* OPJ_RESTRICT c0,
+		float* OPJ_RESTRICT c1,
+		float* OPJ_RESTRICT c2,
 		int n)
 {
 	int i;
 #ifdef __SSE__
+	int count;
 	__m128 vrv, vgu, vgv, vbu;
 	vrv = _mm_set1_ps(1.402f);
 	vgu = _mm_set1_ps(0.34413f);
 	vgv = _mm_set1_ps(0.71414f);
 	vbu = _mm_set1_ps(1.772f);
-	for (i = 0; i < (n >> 3); ++i) {
+	count = n >> 3;
+	for (i = 0; i < count; ++i) {
 		__m128 vy, vu, vv;
 		__m128 vr, vg, vb;

@@ -174,7 +327,7 @@ void mct_decode_real(
 		float u = c1[i];
 		float v = c2[i];
 		float r = y + (v * 1.402f);
-		float g = y - (u * 0.34413f) - (v * (0.71414f));
+		float g = y - (u * 0.34413f) - (v * 0.71414f);
 		float b = y + (u * 1.772f);
 		c0[i] = r;
 		c1[i] = g;
--- a/indra/libopenjpeg/openjpeg.h
+++ b/indra/libopenjpeg/openjpeg.h
@@ -40,33 +40,71 @@
 ==========================================================
 */

+/*
+The inline keyword is supported by C99 but not by C90.
+Most compilers implement their own version of this keyword ...
+*/
+#ifndef INLINE
+#if defined(_MSC_VER)
+#define INLINE __forceinline
+#elif defined(__GNUC__)
+#define INLINE __inline__
+#elif defined(__MWERKS__)
+#define INLINE inline
+#else
+/* add other compilers here ... */
+#define INLINE
+#endif /* defined(<Compiler>) */
+#endif /* INLINE */
 #if defined(OPJ_STATIC) || !defined(_WIN32)
 #define OPJ_API
 #define OPJ_CALLCONV
 #else
 #define OPJ_CALLCONV __stdcall
 /*
-The following ifdef block is the standard way of creating macros which make exporting 
+The following ifdef block is the standard way of creating macros which make exporting
 from a DLL simpler. All files within this DLL are compiled with the OPJ_EXPORTS
 symbol defined on the command line. this symbol should not be defined on any project
-that uses this DLL. This way any other project whose source files include this file see 
-OPJ_API functions as being imported from a DLL, wheras this DLL sees symbols
+that uses this DLL. This way any other project whose source files include this file see
+OPJ_API functions as being imported from a DLL, whereas this DLL sees symbols
 defined with this macro as being exported.
 */
-#if defined(OPJ_EXPORTS) || defined(DLL_EXPORT)
-#define OPJ_API __declspec(dllexport)
-#else
-#define OPJ_API __declspec(dllimport)
-#endif /* OPJ_EXPORTS */
+#   if defined(OPJ_EXPORTS) || defined(DLL_EXPORT)
+#       define OPJ_API __declspec(dllexport)
+#   else
+#       define OPJ_API __declspec(dllimport)
+#   endif /* OPJ_EXPORTS */
 #endif /* !OPJ_STATIC || !_WIN32 */

 typedef int opj_bool;
 #define OPJ_TRUE 1
 #define OPJ_FALSE 0

+typedef char          OPJ_CHAR;
+typedef float         OPJ_FLOAT32;
+typedef double        OPJ_FLOAT64;
+typedef unsigned char OPJ_BYTE;
+
+#include "opj_stdint.h"
+
+typedef int8_t   OPJ_INT8;
+typedef uint8_t  OPJ_UINT8;
+typedef int16_t  OPJ_INT16;
+typedef uint16_t OPJ_UINT16;
+typedef int32_t  OPJ_INT32;
+typedef uint32_t OPJ_UINT32;
+typedef int64_t  OPJ_INT64;
+typedef uint64_t OPJ_UINT64;
+
+typedef int64_t  OPJ_OFF_T; /* 64-bit file offset type */
+
+#include <stdio.h>
+typedef size_t   OPJ_SIZE_T;
+
 /* Avoid compile-time warning because parameter is not used */
 #define OPJ_ARG_NOT_USED(x) (void)(x)
-/* 
+
+/*
 ==========================================================
   Useful constant definitions
 ==========================================================
--- a/indra/libopenjpeg/opj_includes.h
+++ b/indra/libopenjpeg/opj_includes.h
@@ -40,6 +40,8 @@
 #include <stdio.h>
 #include <stdarg.h>
 #include <ctype.h>
+#include <assert.h>
+#include <limits.h>

 /*
 ==========================================================
@@ -54,56 +56,115 @@
 ==========================================================
 */

+/* Are restricted pointers available? (C99) */
+#if (__STDC_VERSION__ >= 199901L)
+#define OPJ_RESTRICT restrict
+#else
+/* Not a C99 compiler */
+#if defined(__GNUC__)
+#define OPJ_RESTRICT __restrict__
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+#define OPJ_RESTRICT __restrict
+#else
+#define OPJ_RESTRICT /* restrict */
+#endif
+#endif
+
 /* Ignore GCC attributes if this is not GCC */
 #ifndef __GNUC__
 	#define __attribute__(x) /* __attribute__(x) */
 #endif

-/*
-The inline keyword is supported by C99 but not by C90. 
-Most compilers implement their own version of this keyword ... 
-*/
-#ifndef INLINE
-	#if defined(_MSC_VER)
-		#define INLINE __forceinline
-	#elif defined(__GNUC__)
-		#define INLINE __inline__
-	#elif defined(__MWERKS__)
-		#define INLINE inline
-	#else 
-		/* add other compilers here ... */
-		#define INLINE 
-	#endif /* defined(<Compiler>) */
-#endif /* INLINE */

-/* Are restricted pointers available? (C99) */
-#if (__STDC_VERSION__ != 199901L)
-	/* Not a C99 compiler */
-	#ifdef __GNUC__
-		#define restrict __restrict__
-	#else
-		#define restrict /* restrict */
-	#endif
-#endif
-
-/* MSVC and Borland C do not have lrintf */
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-static INLINE long lrintf(float f){
+/* MSVC before 2013 and Borland C do not have lrintf */
+#if defined(_MSC_VER)
+#include <intrin.h>
+static INLINE long opj_lrintf(float f)
+{
 #ifdef _M_X64
-    return (long)((f>0.0f) ? (f + 0.5f):(f -0.5f));
-#else
+    return _mm_cvt_ss2si(_mm_load_ss(&f));
+
+    /* commented out line breaks many tests */
+    /* return (long)((f>0.0f) ? (f + 0.5f):(f -0.5f)); */
+#elif defined(_M_IX86)
    int i;
- 
    _asm{
        fld f
        fistp i
    };
- 
+
+    return i;
+#else
+    return (long)((f>0.0f) ? (f + 0.5f) : (f - 0.5f));
+#endif
+}
+#elif defined(__BORLANDC__)
+static INLINE long opj_lrintf(float f)
+{
+#ifdef _M_X64
+    return (long)((f > 0.0f) ? (f + 0.5f) : (f - 0.5f));
+#else
+    int i;
+
+    _asm {
+        fld f
+        fistp i
+    };
+
    return i;
 #endif
 }
+#else
+static INLINE long opj_lrintf(float f)
+{
+    return lrintf(f);
+}
 #endif

+#if defined(_MSC_VER) && (_MSC_VER < 1400)
+#define vsnprintf _vsnprintf
+#endif
+
+/* MSVC x86 is really bad at doing int64 = int32 * int32 on its own. Use intrinsic. */
+#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(__INTEL_COMPILER) && defined(_M_IX86)
+#   include <intrin.h>
+#   pragma intrinsic(__emul)
+#endif
+
+/* Apparently Visual Studio doesn't define __SSE__ / __SSE2__ macros */
+#if defined(_M_X64)
+/* Intel 64bit support SSE and SSE2 */
+#   ifndef __SSE__
+#       define __SSE__ 1
+#   endif
+#   ifndef __SSE2__
+#       define __SSE2__ 1
+#   endif
+#   if !defined(__SSE4_1__) && defined(__AVX__)
+#       define __SSE4_1__ 1
+#   endif
+#endif
+
+/* For x86, test the value of the _M_IX86_FP macro. */
+/* See https://msdn.microsoft.com/en-us/library/b0084kay.aspx */
+#if defined(_M_IX86_FP)
+#   if _M_IX86_FP >= 1
+#       ifndef __SSE__
+#           define __SSE__ 1
+#       endif
+#   endif
+#   if _M_IX86_FP >= 2
+#       ifndef __SSE2__
+#           define __SSE2__ 1
+#       endif
+#   endif
+#endif
+
+/* Type to use for bit-fields in internal headers */
+typedef unsigned int OPJ_BITFIELD;
+
+#define OPJ_UNUSED(x) (void)(x) /* argument is parenthesized so the cast covers a whole expression */
+
 #include "j2k_lib.h"
 #include "opj_malloc.h"
 #include "event.h"
--- a/indra/libopenjpeg/opj_malloc.c
+++ b/indra/libopenjpeg/opj_malloc.c
@@ -0,0 +1,249 @@
+/*
+ * The copyright in this software is being made available under the 2-clauses
+ * BSD License, included below. This software may be subject to other third
+ * party and contributor rights, including patent rights, and no such rights
+ * are granted under this license.
+ *
+ * Copyright (c) 2015, Mathieu Malaterre <mathieu.malaterre@gmail.com>
+ * Copyright (c) 2015, Matthieu Darbois
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#define OPJ_SKIP_POISON
+#include "opj_includes.h"
+
+#if defined(OPJ_HAVE_MALLOC_H) && defined(OPJ_HAVE_MEMALIGN)
+# include <malloc.h>
+#endif
+
+#ifndef SIZE_MAX
+# define SIZE_MAX ((size_t) -1)
+#endif
+
+static INLINE void *opj_aligned_alloc_n(size_t alignment, size_t size)
+{
+    void* ptr;
+    /* Allocates size bytes at an address aligned to `alignment`; NULL for size 0 or allocator failure. */
+    /* alignment shall be power of 2 */
+    assert((alignment != 0U) && ((alignment & (alignment - 1U)) == 0U));
+    /* alignment shall be at least sizeof(void*) */
+    assert(alignment >= sizeof(void*));
+
+    if (size == 0U) { /* zero-byte requests always yield NULL instead of reaching the allocator */
+        return NULL;
+    }
+
+#if defined(OPJ_HAVE_POSIX_MEMALIGN)
+    /* aligned_alloc requires c11, restrict to posix_memalign for now. Quote:
+     * This function was introduced in POSIX 1003.1d. Although this function is
+     * superseded by aligned_alloc, it is more portable to older POSIX systems
+     * that do not support ISO C11.  */
+    if (posix_memalign(&ptr, alignment, size)) { /* non-zero return value means failure */
+        ptr = NULL;
+    }
+    /* older linux */
+#elif defined(OPJ_HAVE_MEMALIGN)
+    ptr = memalign(alignment, size);
+    /* _MSC_VER */
+#elif defined(OPJ_HAVE__ALIGNED_MALLOC)
+    ptr = _aligned_malloc(size, alignment); /* note the argument order: size first, then alignment */
+#else
+    /*
+     * Generic aligned malloc implementation.
+     * Uses size_t offset for the integer manipulation of the pointer,
+     * as uintptr_t is not available in C89 to do
+     * bitwise operations on the pointer itself.
+     */
+    alignment--; /* from here on `alignment` is used as a bit mask (power of two minus one) */
+    {
+        size_t offset;
+        OPJ_UINT8 *mem;
+
+        /* Room for padding and extra pointer stored in front of allocated area */
+        size_t overhead = alignment + sizeof(void *);
+
+        /* the sum computed for overhead above must not have wrapped around */
+        assert(alignment <= (SIZE_MAX - sizeof(void *)));
+
+        /* Avoid integer overflow */
+        if (size > (SIZE_MAX - overhead)) {
+            return NULL;
+        }
+
+        mem = (OPJ_UINT8*)malloc(size + overhead);
+        if (mem == NULL) {
+            return mem; /* NULL: the underlying malloc() failed */
+        }
+        /* offset = ((alignment + 1U) - ((size_t)(mem + sizeof(void*)) & alignment)) & alignment; */
+        /* Use the fact that alignment + 1U is a power of 2 */
+        offset = ((alignment ^ ((size_t)(mem + sizeof(void*)) & alignment)) + 1U) &
+                 alignment;
+        ptr = (void *)(mem + sizeof(void*) + offset);
+        ((void**) ptr)[-1] = mem; /* keep the raw malloc() pointer just below the aligned block */
+    }
+#endif
+    return ptr;
+}
+static INLINE void *opj_aligned_realloc_n(void *ptr, size_t alignment,
+        size_t new_size)
+{
+    void *r_ptr;
+    /* Resizes an aligned block to new_size bytes, keeping `alignment`; NULL for new_size 0 or failure. */
+    /* alignment shall be power of 2 */
+    assert((alignment != 0U) && ((alignment & (alignment - 1U)) == 0U));
+    /* alignment shall be at least sizeof(void*) */
+    assert(alignment >= sizeof(void*));
+
+    if (new_size == 0U) { /* zero size yields NULL; ptr is left untouched (it is not freed here) */
+        return NULL;
+    }
+
+    /* no portable aligned realloc */
+#if defined(OPJ_HAVE_POSIX_MEMALIGN) || defined(OPJ_HAVE_MEMALIGN)
+    /* glibc doc states one can mix aligned malloc with realloc */
+    r_ptr = realloc(ptr, new_size);   /* fast path */
+    /* we simply cast to `size_t`, since we are only interested in the binary AND
+     * operator; a NULL result has no low bits set and skips the fallback below */
+    if (((size_t)r_ptr & (alignment - 1U)) != 0U) {
+        /* it is non-trivial to implement a portable aligned realloc, so use a
+         * simple approach where we do not need a function that returns the size of an
+         * allocated array (eg. _msize on Windows, malloc_size on MacOS,
+         * malloc_usable_size on systems with glibc) */
+        void *a_ptr = opj_aligned_alloc_n(alignment, new_size);
+        if (a_ptr != NULL) {
+            memcpy(a_ptr, r_ptr, new_size);
+        }
+        free(r_ptr); /* runs even when a_ptr is NULL: the block is released and NULL is returned */
+        r_ptr = a_ptr;
+    }
+    /* _MSC_VER */
+#elif defined(OPJ_HAVE__ALIGNED_MALLOC)
+    r_ptr = _aligned_realloc(ptr, new_size, alignment);
+#else
+    if (ptr == NULL) { /* nothing to resize: behave like a fresh aligned allocation */
+        return opj_aligned_alloc_n(alignment, new_size);
+    }
+    alignment--; /* from here on `alignment` is used as a bit mask (power of two minus one) */
+    {
+        void *oldmem;
+        OPJ_UINT8 *newmem;
+        size_t overhead = alignment + sizeof(void *);
+
+        /* the sum computed for overhead above must not have wrapped around */
+        assert(alignment <= (SIZE_MAX - sizeof(void *)));
+
+        /* Avoid integer overflow */
+        if (new_size > SIZE_MAX - overhead) {
+            return NULL;
+        }
+
+        oldmem = ((void**) ptr)[-1]; /* raw malloc() pointer stored just below the aligned block */
+        newmem = (OPJ_UINT8*)realloc(oldmem, new_size + overhead);
+        if (newmem == NULL) {
+            return newmem; /* NULL: realloc() failed */
+        }
+
+        if (newmem == oldmem) {
+            r_ptr = ptr; /* block was not moved, so the aligned pointer is still valid */
+        } else {
+            size_t old_offset;
+            size_t new_offset;
+
+            /* realloc created a new copy, realign the copied memory block */
+            old_offset = (size_t)((OPJ_UINT8*)ptr - (OPJ_UINT8*)oldmem);
+            /* NOTE(review): ptr and oldmem are stale here; only their difference is used */
+            /* offset = ((alignment + 1U) - ((size_t)(mem + sizeof(void*)) & alignment)) & alignment; */
+            /* Use the fact that alignment + 1U is a power of 2 */
+            new_offset  = ((alignment ^ ((size_t)(newmem + sizeof(void*)) & alignment)) +
+                           1U) & alignment;
+            new_offset += sizeof(void*);
+            r_ptr = (void *)(newmem + new_offset);
+
+            if (new_offset != old_offset) { /* payload sits at a different distance from the base */
+                memmove(newmem + new_offset, newmem + old_offset, new_size);
+            }
+            ((void**) r_ptr)[-1] = newmem; /* store the new raw pointer below the aligned block */
+        }
+    }
+#endif
+    return r_ptr;
+}
+void * opj_malloc(size_t size)
+{
+    if (size == 0U) { /* zero-byte requests return NULL (malloc(0) is implementation defined) */
+        return NULL;
+    }
+    return malloc(size); /* any other size is forwarded to malloc() unchanged */
+}
+void * opj_calloc(size_t num, size_t size)
+{
+    if (num == 0 || size == 0) {
+        /* an empty request returns NULL (a zero-byte calloc is implementation defined) */
+        return NULL;
+    }
+    return calloc(num, size); /* otherwise forwarded to calloc() unchanged */
+}
+
+void *opj_aligned_malloc(size_t size)
+{
+    return opj_aligned_alloc_n(16U, size); /* aligned allocation with a fixed alignment of 16 bytes */
+}
+void * opj_aligned_realloc(void *ptr, size_t size)
+{
+    return opj_aligned_realloc_n(ptr, 16U, size); /* aligned resize with a fixed alignment of 16 bytes */
+}
+
+void *opj_aligned_32_malloc(size_t size)
+{
+    return opj_aligned_alloc_n(32U, size); /* aligned allocation with a fixed alignment of 32 bytes */
+}
+void * opj_aligned_32_realloc(void *ptr, size_t size)
+{
+    return opj_aligned_realloc_n(ptr, 32U, size); /* aligned resize with a fixed alignment of 32 bytes */
+}
+
+void opj_aligned_free(void* ptr)
+{
+#if defined(OPJ_HAVE_POSIX_MEMALIGN) || defined(OPJ_HAVE_MEMALIGN)
+    free(ptr); /* blocks obtained via posix_memalign()/memalign() are released with free() */
+#elif defined(OPJ_HAVE__ALIGNED_MALLOC)
+    _aligned_free(ptr); /* counterpart of the _aligned_malloc()/_aligned_realloc() branch */
+#else
+    /* Generic implementation stores the raw malloc() pointer just in front of the aligned area */
+    if (ptr != NULL) { /* NULL has no stored pointer to read, so it is skipped */
+        free(((void**) ptr)[-1]);
+    }
+#endif
+}
+
+void * opj_realloc(void *ptr, size_t new_size)
+{
+    if (new_size == 0U) { /* zero size yields NULL; note that ptr is NOT freed on this path */
+        return NULL;
+    }
+    return realloc(ptr, new_size); /* any other size is forwarded to realloc() unchanged */
+}
+void opj_free(void *ptr)
+{
+    free(ptr); /* thin wrapper: hands ptr straight to the C library free() */
+}
--- a/indra/libopenjpeg/opj_malloc.h
+++ b/indra/libopenjpeg/opj_malloc.h
@@ -1,4 +1,9 @@
 /*
+ * The copyright in this software is being made available under the 2-clauses
+ * BSD License, included below. This software may be subject to other third
+ * party and contributor rights, including patent rights, and no such rights
+ * are granted under this license.
+ *
 * Copyright (c) 2005, Herve Drolon, FreeImage Team
 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
 * All rights reserved.
@@ -24,8 +29,10 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
-#ifndef __OPJ_MALLOC_H
-#define __OPJ_MALLOC_H
+#ifndef OPJ_MALLOC_H
+#define OPJ_MALLOC_H
+
+#include <stddef.h>
 /**
@file opj_malloc.h
@brief Internal functions
@@ -36,6 +43,17 @@ The functions in opj_malloc.h are internal utilities used for memory management.
 /** @defgroup MISC MISC - Miscellaneous internal functions */
 /*@{*/

+/* FIXME: These should be set with cmake tests, but we're currently not requiring use of cmake */
+#ifdef _WIN32
+#define OPJ_HAVE__ALIGNED_MALLOC /* opj_aligned_free then calls _aligned_free */
+#else /* Not _WIN32 */
+#if defined(__sun)
+#define OPJ_HAVE_MEMALIGN /* opj_aligned_free then calls plain free */
+#elif defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__)
+#define OPJ_HAVE_POSIX_MEMALIGN /* opj_aligned_free then calls plain free */
+#endif /* any other system defines no OPJ_HAVE_* macro and gets the generic fallback */
+#endif /* _WIN32 */
+
 /** @name Exported functions */
 /*@{*/
 /* ----------------------------------------------------------------------- */
@@ -45,90 +63,32 @@ Allocate an uninitialized memory block
@param size Bytes to allocate
@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available
 */
-#ifdef ALLOC_PERF_OPT
-void * OPJ_CALLCONV opj_malloc(size_t size);
-#else
-#define opj_malloc(size) malloc(size)
-#endif
+void * opj_malloc(size_t size);

 /**
 Allocate a memory block with elements initialized to 0
-@param num Blocks to allocate
-@param size Bytes per block to allocate
+@param numOfElements  Blocks to allocate
+@param sizeOfElements Bytes per block to allocate
@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available
 */
-#ifdef ALLOC_PERF_OPT
-void * OPJ_CALLCONV opj_calloc(size_t _NumOfElements, size_t _SizeOfElements);
-#else
-#define opj_calloc(num, size) calloc(num, size)
-#endif
+void * opj_calloc(size_t numOfElements, size_t sizeOfElements);

 /**
-Allocate memory aligned to a 16 byte boundry
+Allocate memory aligned to a 16 byte boundary
@param size Bytes to allocate
@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available
 */
-/* FIXME: These should be set with cmake tests, but we're currently not requiring use of cmake */
-#ifdef _WIN32
-	/* Someone should tell the mingw people that their malloc.h ought to provide _mm_malloc() */
-	#ifdef __GNUC__
-		#include <mm_malloc.h>
-		#define HAVE_MM_MALLOC
-	#else /* MSVC, Intel C++ */
-		#include <malloc.h>
-		#ifdef _mm_malloc
-			#define HAVE_MM_MALLOC
-		#endif
-	#endif
-#else /* Not _WIN32 */
-	#if defined(__sun)
-		#define HAVE_MEMALIGN
-  #elif defined(__FreeBSD__)
-    #define HAVE_POSIX_MEMALIGN
-	/* Linux x86_64 and OSX always align allocations to 16 bytes */
-	#elif !defined(__amd64__) && !defined(__APPLE__) && !defined(_AIX)
-		#define HAVE_MEMALIGN
-		#include <malloc.h>			
-	#endif
-#endif
+void * opj_aligned_malloc(size_t size);
+void * opj_aligned_realloc(void *ptr, size_t size);
+void opj_aligned_free(void* ptr);

-#define opj_aligned_malloc(size) malloc(size)
-#define opj_aligned_free(m) free(m)
-
-#ifdef HAVE_MM_MALLOC
-	#undef opj_aligned_malloc
-	#define opj_aligned_malloc(size) _mm_malloc(size, 16)
-	#undef opj_aligned_free
-	#define opj_aligned_free(m) _mm_free(m)
-#endif
-
-#ifdef HAVE_MEMALIGN
-	extern void* memalign(size_t, size_t);
-	#undef opj_aligned_malloc
-	#define opj_aligned_malloc(size) memalign(16, (size))
-	#undef opj_aligned_free
-	#define opj_aligned_free(m) free(m)
-#endif
-
-#ifdef HAVE_POSIX_MEMALIGN
-	#undef opj_aligned_malloc
-	extern int posix_memalign(void**, size_t, size_t);
-
-	static INLINE void* __attribute__ ((malloc)) opj_aligned_malloc(size_t size){
-		void* mem = NULL;
-		posix_memalign(&mem, 16, size);
-		return mem;
-	}
-	#undef opj_aligned_free
-	#define opj_aligned_free(m) free(m)
-#endif
-
-#ifdef ALLOC_PERF_OPT
-	#undef opj_aligned_malloc
-	#define opj_aligned_malloc(size) opj_malloc(size)
-	#undef opj_aligned_free
-	#define opj_aligned_free(m) opj_free(m)
-#endif
+/**
+Allocate memory aligned to a 32 byte boundary
+@param size Bytes to allocate
+@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available
+*/
+void * opj_aligned_32_malloc(size_t size);
+void * opj_aligned_32_realloc(void *ptr, size_t size);

 /**
 Reallocate memory blocks.
@@ -136,23 +96,15 @@ Reallocate memory blocks.
@param s New size in bytes
@return Returns a void pointer to the reallocated (and possibly moved) memory block
 */
-#ifdef ALLOC_PERF_OPT
-void * OPJ_CALLCONV opj_realloc(void * m, size_t s);
-#else
-#define opj_realloc(m, s) realloc(m, s)
-#endif
+void * opj_realloc(void * m, size_t s);

 /**
 Deallocates or frees a memory block.
@param m Previously allocated memory block to be freed
 */
-#ifdef ALLOC_PERF_OPT
-void OPJ_CALLCONV opj_free(void * m);
-#else
-#define opj_free(m) free(m)
-#endif
+void opj_free(void * m);

-#ifdef __GNUC__
+#if defined(__GNUC__) && !defined(OPJ_SKIP_POISON)
 #pragma GCC poison malloc calloc realloc free
 #endif

@@ -161,5 +113,5 @@ void OPJ_CALLCONV opj_free(void * m);

 /*@}*/

-#endif /* __OPJ_MALLOC_H */
+#endif /* OPJ_MALLOC_H */

--- a/indra/libopenjpeg/opj_stdint.h
+++ b/indra/libopenjpeg/opj_stdint.h
@@ -0,0 +1,51 @@
+/*
+ * The copyright in this software is being made available under the 2-clauses
+ * BSD License, included below. This software may be subject to other third
+ * party and contributor rights, including patent rights, and no such rights
+ * are granted under this license.
+ *
+ * Copyright (c) 2012, Mathieu Malaterre <mathieu.malaterre@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef OPJ_STDINT_H
+#define OPJ_STDINT_H
+
+/* Use <stdint.h> on the listed OSes, on MSVC 2015+ and on any C99-or-later compiler */
+#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || \
+    (defined(_MSC_VER) && _MSC_VER >= 1900) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#include <stdint.h>
+#elif defined(_WIN32) /* Windows toolchains not matched above (e.g. older MSVC): use the __intN types */
+typedef   signed __int8   int8_t;
+typedef unsigned __int8   uint8_t;
+typedef   signed __int16  int16_t;
+typedef unsigned __int16  uint16_t;
+typedef   signed __int32  int32_t;
+typedef unsigned __int32  uint32_t;
+typedef   signed __int64  int64_t;
+typedef unsigned __int64  uint64_t;
+#else
+#error unsupported platform
+#endif
+
+#endif /* OPJ_STDINT_H */
--- a/indra/libopenjpeg/t1.c
+++ b/indra/libopenjpeg/t1.c
@@ -1427,7 +1427,7 @@ void t1_encode_cblks(
 			opj_tcd_resolution_t *res = &tilec->resolutions[resno];

 			for (bandno = 0; bandno < res->numbands; ++bandno) {
-				opj_tcd_band_t* restrict band = &res->bands[bandno];
+				opj_tcd_band_t* OPJ_RESTRICT band = &res->bands[bandno];
        int bandconst = 8192 * 8192 / ((int) floor(band->stepsize * 8192));

 				for (precno = 0; precno < res->pw * res->ph; ++precno) {
@@ -1435,8 +1435,8 @@ void t1_encode_cblks(

 					for (cblkno = 0; cblkno < prc->cw * prc->ch; ++cblkno) {
 						opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
-						int* restrict datap;
-						int* restrict tiledp;
+						int* OPJ_RESTRICT datap;
+						int* OPJ_RESTRICT tiledp;
 						int cblk_w;
 						int cblk_h;
 						int i, j;
@@ -1517,14 +1517,14 @@ void t1_decode_cblks(
 		opj_tcd_resolution_t* res = &tilec->resolutions[resno];

 		for (bandno = 0; bandno < res->numbands; ++bandno) {
-			opj_tcd_band_t* restrict band = &res->bands[bandno];
+			opj_tcd_band_t* OPJ_RESTRICT band = &res->bands[bandno];

 			for (precno = 0; precno < res->pw * res->ph; ++precno) {
 				opj_tcd_precinct_t* precinct = &band->precincts[precno];

 				for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
 					opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
-					int* restrict datap;
+					int* OPJ_RESTRICT datap;
 					int cblk_w, cblk_h;
 					int x, y;
 					int i, j;
@@ -1566,7 +1566,7 @@ void t1_decode_cblks(
 					}

 					if (tccp->qmfbid == 1) {
-						int* restrict tiledp = &tilec->data[(y * tile_w) + x];
+						int* OPJ_RESTRICT tiledp = &tilec->data[(y * tile_w) + x];
 						for (j = 0; j < cblk_h; ++j) {
 							for (i = 0; i < cblk_w; ++i) {
 								int tmp = datap[(j * cblk_w) + i];
@@ -1574,9 +1574,9 @@ void t1_decode_cblks(
 							}
 						}
 					} else {		/* if (tccp->qmfbid == 0) */
-						float* restrict tiledp = (float*) &tilec->data[(y * tile_w) + x];
+						float* OPJ_RESTRICT tiledp = (float*) &tilec->data[(y * tile_w) + x];
 						for (j = 0; j < cblk_h; ++j) {
-							float* restrict tiledp2 = tiledp;
+							float* OPJ_RESTRICT tiledp2 = tiledp;
 							for (i = 0; i < cblk_w; ++i) {
 								float tmp = *datap * band->stepsize;
 								*tiledp2 = tmp;
--- a/indra/libopenjpeg/t1_generate_luts.c
+++ b/indra/libopenjpeg/t1_generate_luts.c
@@ -194,7 +194,7 @@ int main(){

 	printf("/* This file was automatically generated by t1_generate_luts.c */\n\n");

-	// lut_ctxno_zc
+	/* lut_ctxno_zc */
 	for (j = 0; j < 4; ++j) {
 		for (i = 0; i < 256; ++i) {
 			int orient = j;
@@ -215,7 +215,7 @@ int main(){
 	}
 	printf("%i\n};\n\n", lut_ctxno_zc[1023]);

-	// lut_ctxno_sc
+	/* lut_ctxno_sc */
 	printf("static char lut_ctxno_sc[256] = {\n  ");
 	for (i = 0; i < 255; ++i) {
 		printf("0x%x, ", t1_init_ctxno_sc(i << 4));
@@ -224,7 +224,7 @@ int main(){
 	}
 	printf("0x%x\n};\n\n", t1_init_ctxno_sc(255 << 4));

-	// lut_spb
+	/* lut_spb */
 	printf("static char lut_spb[256] = {\n  ");
 	for (i = 0; i < 255; ++i) {
 		printf("%i, ", t1_init_spb(i << 4));
--- a/indra/libopenjpeg/t2.c
+++ b/indra/libopenjpeg/t2.c
@@ -30,6 +30,7 @@
 */

 #include "opj_includes.h"
+#include <assert.h>

 /** @defgroup T2 T2 - Implementation of a tier-2 coding */
 /*@{*/
@@ -340,13 +341,15 @@ static int t2_decode_packet(opj_t2_t* t2, unsigned char *src, int len, opj_tcd_t
 	int precno = pi->precno;	/* precinct value */
 	int layno  = pi->layno;		/* quality layer value */

-	opj_tcd_resolution_t* res = &tile->comps[compno].resolutions[resno];
-
 	unsigned char *hd = NULL;
 	int present;
 	
 	opj_bio_t *bio = NULL;	/* BIO component */
-	
+
+	opj_tcd_resolution_t* res;
+	assert(&tile->comps[compno] != NULL);
+	res = &tile->comps[compno].resolutions[resno];
+
 	if (layno == 0) {
 		for (bandno = 0; bandno < res->numbands; bandno++) {
 			opj_tcd_band_t *band = &res->bands[bandno];
--- a/indra/libopenjpeg/tcd.c
+++ b/indra/libopenjpeg/tcd.c
@@ -1507,7 +1507,7 @@ opj_bool tcd_decode_tile(opj_tcd_t *tcd, unsigned char *src, int len, int tileno
 			for(j = res->y0; j < res->y1; ++j) {
 				for(i = res->x0; i < res->x1; ++i) {
 					float tmp = ((float*)tilec->data)[i - res->x0 + (j - res->y0) * tw];
-					int v = lrintf(tmp);
+					int v = opj_lrintf(tmp);
 					v += adjust;
 					imagec->data[(i - offset_x) + (j - offset_y) * w] = int_clamp(v, min, max);
 				}