GDAL
gdalsse_priv.h
/******************************************************************************
 * $Id$
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* This could also be used on 32bit processors, but SSE2 support would then
 * have to be checked at runtime */
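/* For illustration only: on a 32bit build, such a runtime check could be
   sketched with the GCC/Clang builtin shown below (the helper name is
   hypothetical and not part of GDAL):

   static inline bool GDALHasRuntimeSSE2()
   {
   #if defined(__GNUC__)
       return __builtin_cpu_supports("sse2") != 0;
   #else
       return false;  // would need e.g. a CPUID-based check on other compilers
   #endif
   }
*/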
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}

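/* The Store2Val() and Store4Val() methods below also rely on
   GDALCopyXMMToInt32() and GDALCopyXMMToInt64(), which are not shown in this
   listing; a minimal sketch following the same pattern as
   GDALCopyXMMToInt16() above would be: */

static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
{
    GInt32 i = _mm_cvtsi128_si32(xmm);
    memcpy(pDest, &i, 4);
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
{
#if defined(__i386__) || defined(_M_IX86)
    _mm_storel_epi64(static_cast<__m128i *>(pDest), xmm);
#else
    GInt64 i = _mm_cvtsi128_si64(xmm);
    memcpy(pDest, &i, 8);
#endif
}
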
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i,
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm,
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2!"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double()
    {
    }

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && !defined(USE_SSE2_EMULATION) */

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(
            xmm_i);  // ok to use signed conversion since we are in the ushort
                     // range, so cannot be interpreted as negative int32
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));  // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#endif

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */
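
To illustrate how these helpers are typically consumed (the function and variable names below are hypothetical, not part of GDAL), a loop that accumulates the dot product of two double arrays four elements at a time might look like the following; it compiles against any of the code paths above (SSE2, AVX, or the software emulation):

// Hypothetical example, not part of gdalsse_priv.h: sum of products of two
// arrays, processed 4 doubles at a time (nCount assumed to be a multiple of 4).
static double GDALExampleDotProduct(const double *padfX, const double *padfY,
                                    int nCount)
{
    XMMReg4Double oSum = XMMReg4Double::Zero();
    for (int i = 0; i < nCount; i += 4)
    {
        // Unaligned loads of 4 doubles from each input array
        oSum += XMMReg4Double::Load4Val(padfX + i) *
                XMMReg4Double::Load4Val(padfY + i);
    }
    // Add the 4 partial sums together
    return oSum.GetHorizSum();
}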