#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h"

#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*static_cast<const unsigned short *>(ptr));
#endif
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*static_cast<const GInt32 *>(ptr));
#endif
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#elif defined(CPL_CPU_REQUIRES_ALIGNED_ACCESS)
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*static_cast<const GInt64 *>(ptr));
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16 *>(pDest) =
        static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}
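/* Illustrative sketch (not part of the GDAL API): the helpers above move small
   scalar payloads into/out of XMM registers without relying on unaligned
   dereferences on platforms where CPL_CPU_REQUIRES_ALIGNED_ACCESS is defined.
   A hypothetical use, assuming "pafSrc" points to two packed floats, would be:

       // load 2 floats (8 bytes) into the low half of an XMM register,
       // then widen them to 2 doubles
       __m128i packed = GDALCopyInt64ToXMM(pafSrc);               // pafSrc: const float*
       __m128d asDouble = _mm_cvtps_pd(_mm_castsi128_ps(packed));

   This is the same pattern used by XMMReg2Double::nsLoad2Val(const float*)
   below. */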
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* Sign extend the two 16-bit values to 32 bits */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm, 1); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
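/* Usage sketch (illustrative only, not part of the public GDAL API): the
   comparison factories such as Greater() return an all-ones / all-zeros bit
   pattern per lane, and Ternary() uses that mask as a branchless select,
   mirroring the _mm_and_pd/_mm_andnot_pd idiom above. Clamping two packed
   doubles to a floor value could, for instance, be written as:

       XMMReg2Double v = XMMReg2Double::Load2Val(padfVals);     // padfVals: hypothetical input
       XMMReg2Double f = XMMReg2Double::Load1ValHighAndLow(&dfFloor);
       XMMReg2Double mask = XMMReg2Double::Greater(f, v);       // lanes where floor > v
       v = XMMReg2Double::Ternary(mask, f, v);                  // take the floor there
       v.Store2Val(padfVals);
*/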
#else

/* Software emulation of the SSE2 code path */

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

#include <string.h>

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        /* Mimic SSE2 comparisons: all-ones bit pattern on true, 0 on false */
        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* (__x86_64 || _M_X64 || USE_SSE2) && !USE_SSE2_EMULATION */
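/* Illustrative sketch (hypothetical helper, not part of GDAL): because the
   hardware and emulated XMMReg2Double expose the same interface, code such as
   the following compiles unchanged whether real SSE2 or the scalar fallback
   above is selected:

       static double SumPairs(const double *padfVals, int nPairs)   // hypothetical
       {
           XMMReg2Double acc = XMMReg2Double::Zero();
           for (int i = 0; i < nPairs; ++i)
               acc += XMMReg2Double::Load2Val(padfVals + 2 * i);
           return acc.GetHorizSum();   // low + high
       }
*/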
#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        /* Zero-extended values fit in int32, so the signed conversion is safe */
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        /* Keep the lowest byte of each 32-bit lane (SSSE3) */
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};
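/* Usage sketch (illustrative, not part of the GDAL API): XMMReg4Double is the
   4-lane counterpart intended for resampling-style inner loops; with AVX it
   maps to a single __m256d. A hypothetical weighted sum of 4 byte samples:

       XMMReg4Double v = XMMReg4Double::Load4Val(pabySrc);      // 4 uint8  -> 4 double
       XMMReg4Double w = XMMReg4Double::Load4Val(padfWeights);  // 4 double weights
       double dfSum = (v * w).GetHorizSum();                    // horizontal reduction
*/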
#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#ifdef __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
#else
        /* Emulate _mm_packus_epi32 on SSE2 by biasing into the int16 range */
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);