gdalsse_priv.h
/******************************************************************************
 * $Id$
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64-bit processors because they are guaranteed to have SSE2 */
/* It could also be used on 32-bit, but we would need to check at runtime */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}

/* Store counterparts of the loaders above, used by the Store2Val() /
   Store4Val() methods further down. */
static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
{
    GInt32 i = _mm_cvtsi128_si32(xmm);
    memcpy(pDest, &i, 4);
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
{
#if defined(__i386__) || defined(_M_IX86)
    _mm_storel_epi64(static_cast<__m128i *>(pDest), xmm);
#else
    GInt64 i = _mm_cvtsi128_si64(xmm);
    memcpy(pDest, &i, 8);
#endif
}
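
/* All of these helpers go through memcpy() or an explicitly unaligned
   intrinsic rather than dereferencing a cast pointer: an expression such
   as *(const GInt32 *)ptr would be undefined behaviour for unaligned or
   aliased input, whereas a fixed-size memcpy() compiles down to a single
   scalar load or store on the targets this header supports. */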

class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

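    /* Branchless per-lane select: the comparison helpers above return
       lanes that are either all-ones or all-zero, so OR-ing
       (cond AND true_expr) with (NOT cond AND false_expr) picks
       true_expr where the mask is set and false_expr elsewhere. */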
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

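    /* The integer loads below widen 8- or 16-bit samples to int32 before
       the int32->double conversion: SSE4.1 offers single-instruction
       zero/sign extension (pmovzx/pmovsx); the plain SSE2 fallback uses
       interleaves with zero, or an interleave-with-self followed by an
       arithmetic right shift for the signed case. */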
    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i,
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm,
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

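    /* The two unsigned integer stores below round to nearest by adding
       0.5 before the truncating conversion (round-half-up); they assume
       non-negative values within range of the target type. */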
    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2!"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double()
    {
    }

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

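    /* The SSE2 comparisons return lanes that are either all-ones or
       all-zero. The emulation reproduces that by memset'ing the bytes of
       the double to 0xFF for "true", so that And() and Ternary() behave
       like their hardware counterparts when fed comparison results. */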
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && !defined(USE_SSE2_EMULATION) */

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(
            xmm_i); // ok to use signed conversion since we are in the ushort
                    // range, so cannot be interpreted as negative int32
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

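    /* Store4Val(unsigned char*) below extracts byte 0 of each of the four
       32-bit lanes with a single _mm_shuffle_epi8 (SSSE3, which AVX
       implies) instead of the two-step pack sequence kept in comments. */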
    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i = _mm_shuffle_epi8(
            xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                     (12 << 24))); // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#endif

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */
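
For context, a minimal usage sketch (illustrative only; the function name and
loop are not part of the header): GDAL's pixel-processing kernels typically
load a batch of values with Load4Val(), accumulate with the overloaded
operators, reduce with GetHorizSum(), and handle the remainder with scalar
code.

    #include "gdalsse_priv.h"

    /* Sum nCount doubles, four at a time (hypothetical helper). */
    static double ExampleSum(const double *padfVals, size_t nCount)
    {
        XMMReg4Double oAcc = XMMReg4Double::Zero();
        size_t i = 0;
        for (; i + 4 <= nCount; i += 4)
            oAcc += XMMReg4Double::Load4Val(padfVals + i);
        double dfSum = oAcc.GetHorizSum(); /* horizontal reduction */
        for (; i < nCount; ++i)            /* scalar tail */
            dfSum += padfVals[i];
        return dfSum;
    }

The same code compiles against all three implementations (AVX, SSE2, and the
software emulation), which is the point of the wrapper classes.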