gdalsse_priv.h
1 /******************************************************************************
2  * $Id$
3  *
4  * Project: GDAL
5  * Purpose: SSE2 helper
6  * Author: Even Rouault <even dot rouault at spatialys dot com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32 
33 #ifndef DOXYGEN_SKIP
34 
35 #include "cpl_port.h"
36 
37 /* We restrict to 64 bit processors because they are guaranteed to have SSE2 */
38 /* This could also be used on 32 bit, but we would need a runtime check */
39 #if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
40  !defined(USE_SSE2_EMULATION)
41 
42 /* Requires SSE2 */
43 #include <emmintrin.h>
44 #include <string.h>
45 
46 #ifdef __SSE4_1__
47 #include <smmintrin.h>
48 #endif
49 
50 #include "gdal_priv_templates.hpp"
51 
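/* Small helpers that move 2, 4 or 8 bytes between memory and the low part of
   an XMM register. On CPUs that require aligned accesses
   (CPL_CPU_REQUIRES_ALIGNED_ACCESS), they go through a memcpy into a local
   temporary instead of dereferencing the possibly unaligned pointer. */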
52 static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
53 {
54 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
55  unsigned short s;
56  memcpy(&s, ptr, 2);
57  return _mm_cvtsi32_si128(s);
58 #else
59  return _mm_cvtsi32_si128(*static_cast<const unsigned short *>(ptr));
60 #endif
61 }
62 
63 static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
64 {
65 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
66  GInt32 i;
67  memcpy(&i, ptr, 4);
68  return _mm_cvtsi32_si128(i);
69 #else
70  return _mm_cvtsi32_si128(*static_cast<const GInt32 *>(ptr));
71 #endif
72 }
73 
74 static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
75 {
76 #if defined(__i386__) || defined(_M_IX86)
77  return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
78 #elif defined(CPL_CPU_REQUIRES_ALIGNED_ACCESS)
79  GInt64 i;
80  memcpy(&i, ptr, 8);
81  return _mm_cvtsi64_si128(i);
82 #else
83  return _mm_cvtsi64_si128(*static_cast<const GInt64 *>(ptr));
84 #endif
85 }
86 
87 static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
88 {
89 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
90  GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
91  memcpy(pDest, &i, 2);
92 #else
93  *static_cast<GInt16 *>(pDest) =
94  static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
95 #endif
96 }
97 
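/* XMMReg2Double wraps a single __m128d register holding two packed doubles.
   It provides loads/stores for double, float and several 8/16 bit integer
   element types, the usual arithmetic operators, and comparisons that
   produce per-lane all-ones/all-zeros masks for use with And()/Ternary(). */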
98 class XMMReg2Double
99 {
100  public:
101  __m128d xmm;
102 
103 #if defined(__GNUC__)
104 #pragma GCC diagnostic push
105 #pragma GCC diagnostic ignored "-Weffc++"
106 #endif
107  /* coverity[uninit_member] */
108  XMMReg2Double() = default;
109 #if defined(__GNUC__)
110 #pragma GCC diagnostic pop
111 #endif
112 
113  XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
114  {
115  }
116  XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
117  {
118  }
119 
120  static inline XMMReg2Double Zero()
121  {
122  XMMReg2Double reg;
123  reg.Zeroize();
124  return reg;
125  }
126 
127  static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
128  {
129  XMMReg2Double reg;
130  reg.nsLoad1ValHighAndLow(ptr);
131  return reg;
132  }
133 
134  static inline XMMReg2Double Load2Val(const double *ptr)
135  {
136  XMMReg2Double reg;
137  reg.nsLoad2Val(ptr);
138  return reg;
139  }
140 
141  static inline XMMReg2Double Load2Val(const float *ptr)
142  {
143  XMMReg2Double reg;
144  reg.nsLoad2Val(ptr);
145  return reg;
146  }
147 
148  static inline XMMReg2Double Load2ValAligned(const double *ptr)
149  {
150  XMMReg2Double reg;
151  reg.nsLoad2ValAligned(ptr);
152  return reg;
153  }
154 
155  static inline XMMReg2Double Load2Val(const unsigned char *ptr)
156  {
157  XMMReg2Double reg;
158  reg.nsLoad2Val(ptr);
159  return reg;
160  }
161 
162  static inline XMMReg2Double Load2Val(const short *ptr)
163  {
164  XMMReg2Double reg;
165  reg.nsLoad2Val(ptr);
166  return reg;
167  }
168 
169  static inline XMMReg2Double Load2Val(const unsigned short *ptr)
170  {
171  XMMReg2Double reg;
172  reg.nsLoad2Val(ptr);
173  return reg;
174  }
175 
176  static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
177  const XMMReg2Double &expr2)
178  {
179  XMMReg2Double reg;
180  reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
181  return reg;
182  }
183 
184  static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
185  const XMMReg2Double &expr2)
186  {
187  XMMReg2Double reg;
188  reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
189  return reg;
190  }
191 
192  static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
193  const XMMReg2Double &expr2)
194  {
195  XMMReg2Double reg;
196  reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
197  return reg;
198  }
199 
200  static inline XMMReg2Double And(const XMMReg2Double &expr1,
201  const XMMReg2Double &expr2)
202  {
203  XMMReg2Double reg;
204  reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
205  return reg;
206  }
207 
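    // cond is expected to be a per-lane all-ones / all-zeros mask, as
    // produced by Equals(), NotEquals() or Greater(): lanes where the mask
    // is set take true_expr, the others take false_expr (and/andnot/or
    // bit-select).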
208  static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
209  const XMMReg2Double &true_expr,
210  const XMMReg2Double &false_expr)
211  {
212  XMMReg2Double reg;
213  reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
214  _mm_andnot_pd(cond.xmm, false_expr.xmm));
215  return reg;
216  }
217 
218  static inline XMMReg2Double Min(const XMMReg2Double &expr1,
219  const XMMReg2Double &expr2)
220  {
221  XMMReg2Double reg;
222  reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
223  return reg;
224  }
225 
226  inline void nsLoad1ValHighAndLow(const double *ptr)
227  {
228  xmm = _mm_load1_pd(ptr);
229  }
230 
231  inline void nsLoad2Val(const double *ptr)
232  {
233  xmm = _mm_loadu_pd(ptr);
234  }
235 
236  inline void nsLoad2ValAligned(const double *ptr)
237  {
238  xmm = _mm_load_pd(ptr);
239  }
240 
241  inline void nsLoad2Val(const float *ptr)
242  {
243  xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
244  }
245 
246  inline void nsLoad2Val(const unsigned char *ptr)
247  {
248  __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
249 #ifdef __SSE4_1__
250  xmm_i = _mm_cvtepu8_epi32(xmm_i);
251 #else
252  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
253  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
254 #endif
255  xmm = _mm_cvtepi32_pd(xmm_i);
256  }
257 
258  inline void nsLoad2Val(const short *ptr)
259  {
260  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
261 #ifdef __SSE4_1__
262  xmm_i = _mm_cvtepi16_epi32(xmm_i);
263 #else
264  xmm_i = _mm_unpacklo_epi16(
265  xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
266  xmm_i = _mm_srai_epi32(
267  xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
268 #endif
269  xmm = _mm_cvtepi32_pd(xmm_i);
270  }
271 
272  inline void nsLoad2Val(const unsigned short *ptr)
273  {
274  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
275 #ifdef __SSE4_1__
276  xmm_i = _mm_cvtepu16_epi32(xmm_i);
277 #else
278  xmm_i = _mm_unpacklo_epi16(
279  xmm_i,
280  _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
281 #endif
282  xmm = _mm_cvtepi32_pd(xmm_i);
283  }
284 
285  static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
286  XMMReg2Double &high)
287  {
288  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
289 #ifdef __SSE4_1__
290  xmm_i = _mm_cvtepu8_epi32(xmm_i);
291 #else
292  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
293  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
294 #endif
295  low.xmm = _mm_cvtepi32_pd(xmm_i);
296  high.xmm =
297  _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
298  }
299 
300  static inline void Load4Val(const short *ptr, XMMReg2Double &low,
301  XMMReg2Double &high)
302  {
303  low.nsLoad2Val(ptr);
304  high.nsLoad2Val(ptr + 2);
305  }
306 
307  static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
308  XMMReg2Double &high)
309  {
310  low.nsLoad2Val(ptr);
311  high.nsLoad2Val(ptr + 2);
312  }
313 
314  static inline void Load4Val(const double *ptr, XMMReg2Double &low,
315  XMMReg2Double &high)
316  {
317  low.nsLoad2Val(ptr);
318  high.nsLoad2Val(ptr + 2);
319  }
320 
321  static inline void Load4Val(const float *ptr, XMMReg2Double &low,
322  XMMReg2Double &high)
323  {
324  __m128 temp1 = _mm_loadu_ps(ptr);
325  __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
326  low.xmm = _mm_cvtps_pd(temp1);
327  high.xmm = _mm_cvtps_pd(temp2);
328  }
329 
330  inline void Zeroize()
331  {
332  xmm = _mm_setzero_pd();
333  }
334 
335  inline XMMReg2Double &operator=(const XMMReg2Double &other)
336  {
337  xmm = other.xmm;
338  return *this;
339  }
340 
341  inline XMMReg2Double &operator+=(const XMMReg2Double &other)
342  {
343  xmm = _mm_add_pd(xmm, other.xmm);
344  return *this;
345  }
346 
347  inline XMMReg2Double &operator*=(const XMMReg2Double &other)
348  {
349  xmm = _mm_mul_pd(xmm, other.xmm);
350  return *this;
351  }
352 
353  inline XMMReg2Double operator+(const XMMReg2Double &other) const
354  {
355  XMMReg2Double ret;
356  ret.xmm = _mm_add_pd(xmm, other.xmm);
357  return ret;
358  }
359 
360  inline XMMReg2Double operator-(const XMMReg2Double &other) const
361  {
362  XMMReg2Double ret;
363  ret.xmm = _mm_sub_pd(xmm, other.xmm);
364  return ret;
365  }
366 
367  inline XMMReg2Double operator*(const XMMReg2Double &other) const
368  {
369  XMMReg2Double ret;
370  ret.xmm = _mm_mul_pd(xmm, other.xmm);
371  return ret;
372  }
373 
374  inline XMMReg2Double operator/(const XMMReg2Double &other) const
375  {
376  XMMReg2Double ret;
377  ret.xmm = _mm_div_pd(xmm, other.xmm);
378  return ret;
379  }
380 
381  inline double GetHorizSum() const
382  {
383  __m128d xmm2;
384  xmm2 = _mm_shuffle_pd(
385  xmm, xmm,
386  _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
387  return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
388  }
389 
390  inline void Store2Val(double *ptr) const
391  {
392  _mm_storeu_pd(ptr, xmm);
393  }
394 
395  inline void Store2ValAligned(double *ptr) const
396  {
397  _mm_store_pd(ptr, xmm);
398  }
399 
400  inline void Store2Val(float *ptr) const
401  {
402  __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
403  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
404  }
405 
406  inline void Store2Val(unsigned char *ptr) const
407  {
408  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
409  xmm,
410  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
411  tmp = _mm_packs_epi32(tmp, tmp);
412  tmp = _mm_packus_epi16(tmp, tmp);
413  GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
414  }
415 
416  inline void Store2Val(unsigned short *ptr) const
417  {
418  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
419  xmm,
420  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
421  // X X X X 0 B 0 A --> X X X X A A B A
422  tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
423  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
424  }
425 
426  inline void StoreMask(unsigned char *ptr) const
427  {
428  _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
429  _mm_castpd_si128(xmm));
430  }
431 
432  inline operator double() const
433  {
434  return _mm_cvtsd_f64(xmm);
435  }
436 };
437 
438 #else
439 
440 #ifndef NO_WARN_USE_SSE2_EMULATION
441 #warning "Software emulation of SSE2!"
442 #endif
443 
444 class XMMReg2Double
445 {
446  public:
447  double low;
448  double high;
449 
450  XMMReg2Double()
451  {
452  }
453  XMMReg2Double(double val)
454  {
455  low = val;
456  high = 0.0;
457  }
458  XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
459  {
460  }
461 
462  static inline XMMReg2Double Zero()
463  {
464  XMMReg2Double reg;
465  reg.Zeroize();
466  return reg;
467  }
468 
469  static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
470  {
471  XMMReg2Double reg;
472  reg.nsLoad1ValHighAndLow(ptr);
473  return reg;
474  }
475 
476  static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
477  const XMMReg2Double &expr2)
478  {
479  XMMReg2Double reg;
480 
481  if (expr1.low == expr2.low)
482  memset(&(reg.low), 0xFF, sizeof(double));
483  else
484  reg.low = 0;
485 
486  if (expr1.high == expr2.high)
487  memset(&(reg.high), 0xFF, sizeof(double));
488  else
489  reg.high = 0;
490 
491  return reg;
492  }
493 
494  static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
495  const XMMReg2Double &expr2)
496  {
497  XMMReg2Double reg;
498 
499  if (expr1.low != expr2.low)
500  memset(&(reg.low), 0xFF, sizeof(double));
501  else
502  reg.low = 0;
503 
504  if (expr1.high != expr2.high)
505  memset(&(reg.high), 0xFF, sizeof(double));
506  else
507  reg.high = 0;
508 
509  return reg;
510  }
511 
512  static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
513  const XMMReg2Double &expr2)
514  {
515  XMMReg2Double reg;
516 
517  if (expr1.low > expr2.low)
518  memset(&(reg.low), 0xFF, sizeof(double));
519  else
520  reg.low = 0;
521 
522  if (expr1.high > expr2.high)
523  memset(&(reg.high), 0xFF, sizeof(double));
524  else
525  reg.high = 0;
526 
527  return reg;
528  }
529 
530  static inline XMMReg2Double And(const XMMReg2Double &expr1,
531  const XMMReg2Double &expr2)
532  {
533  XMMReg2Double reg;
534  int low1[2], high1[2];
535  int low2[2], high2[2];
536  memcpy(low1, &expr1.low, sizeof(double));
537  memcpy(high1, &expr1.high, sizeof(double));
538  memcpy(low2, &expr2.low, sizeof(double));
539  memcpy(high2, &expr2.high, sizeof(double));
540  low1[0] &= low2[0];
541  low1[1] &= low2[1];
542  high1[0] &= high2[0];
543  high1[1] &= high2[1];
544  memcpy(&reg.low, low1, sizeof(double));
545  memcpy(&reg.high, high1, sizeof(double));
546  return reg;
547  }
548 
549  static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
550  const XMMReg2Double &true_expr,
551  const XMMReg2Double &false_expr)
552  {
553  XMMReg2Double reg;
554  if (cond.low != 0)
555  reg.low = true_expr.low;
556  else
557  reg.low = false_expr.low;
558  if (cond.high != 0)
559  reg.high = true_expr.high;
560  else
561  reg.high = false_expr.high;
562  return reg;
563  }
564 
565  static inline XMMReg2Double Min(const XMMReg2Double &expr1,
566  const XMMReg2Double &expr2)
567  {
568  XMMReg2Double reg;
569  reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
570  reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
571  return reg;
572  }
573 
574  static inline XMMReg2Double Load2Val(const double *ptr)
575  {
576  XMMReg2Double reg;
577  reg.nsLoad2Val(ptr);
578  return reg;
579  }
580 
581  static inline XMMReg2Double Load2ValAligned(const double *ptr)
582  {
583  XMMReg2Double reg;
584  reg.nsLoad2ValAligned(ptr);
585  return reg;
586  }
587 
588  static inline XMMReg2Double Load2Val(const float *ptr)
589  {
590  XMMReg2Double reg;
591  reg.nsLoad2Val(ptr);
592  return reg;
593  }
594 
595  static inline XMMReg2Double Load2Val(const unsigned char *ptr)
596  {
597  XMMReg2Double reg;
598  reg.nsLoad2Val(ptr);
599  return reg;
600  }
601 
602  static inline XMMReg2Double Load2Val(const short *ptr)
603  {
604  XMMReg2Double reg;
605  reg.nsLoad2Val(ptr);
606  return reg;
607  }
608 
609  static inline XMMReg2Double Load2Val(const unsigned short *ptr)
610  {
611  XMMReg2Double reg;
612  reg.nsLoad2Val(ptr);
613  return reg;
614  }
615 
616  inline void nsLoad1ValHighAndLow(const double *ptr)
617  {
618  low = ptr[0];
619  high = ptr[0];
620  }
621 
622  inline void nsLoad2Val(const double *ptr)
623  {
624  low = ptr[0];
625  high = ptr[1];
626  }
627 
628  inline void nsLoad2ValAligned(const double *ptr)
629  {
630  low = ptr[0];
631  high = ptr[1];
632  }
633 
634  inline void nsLoad2Val(const float *ptr)
635  {
636  low = ptr[0];
637  high = ptr[1];
638  }
639 
640  inline void nsLoad2Val(const unsigned char *ptr)
641  {
642  low = ptr[0];
643  high = ptr[1];
644  }
645 
646  inline void nsLoad2Val(const short *ptr)
647  {
648  low = ptr[0];
649  high = ptr[1];
650  }
651 
652  inline void nsLoad2Val(const unsigned short *ptr)
653  {
654  low = ptr[0];
655  high = ptr[1];
656  }
657 
658  static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
659  XMMReg2Double &high)
660  {
661  low.low = ptr[0];
662  low.high = ptr[1];
663  high.low = ptr[2];
664  high.high = ptr[3];
665  }
666 
667  static inline void Load4Val(const short *ptr, XMMReg2Double &low,
668  XMMReg2Double &high)
669  {
670  low.nsLoad2Val(ptr);
671  high.nsLoad2Val(ptr + 2);
672  }
673 
674  static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
675  XMMReg2Double &high)
676  {
677  low.nsLoad2Val(ptr);
678  high.nsLoad2Val(ptr + 2);
679  }
680 
681  static inline void Load4Val(const double *ptr, XMMReg2Double &low,
682  XMMReg2Double &high)
683  {
684  low.nsLoad2Val(ptr);
685  high.nsLoad2Val(ptr + 2);
686  }
687 
688  static inline void Load4Val(const float *ptr, XMMReg2Double &low,
689  XMMReg2Double &high)
690  {
691  low.nsLoad2Val(ptr);
692  high.nsLoad2Val(ptr + 2);
693  }
694 
695  inline void Zeroize()
696  {
697  low = 0.0;
698  high = 0.0;
699  }
700 
701  inline XMMReg2Double &operator=(const XMMReg2Double &other)
702  {
703  low = other.low;
704  high = other.high;
705  return *this;
706  }
707 
708  inline XMMReg2Double &operator+=(const XMMReg2Double &other)
709  {
710  low += other.low;
711  high += other.high;
712  return *this;
713  }
714 
715  inline XMMReg2Double &operator*=(const XMMReg2Double &other)
716  {
717  low *= other.low;
718  high *= other.high;
719  return *this;
720  }
721 
722  inline XMMReg2Double operator+(const XMMReg2Double &other) const
723  {
724  XMMReg2Double ret;
725  ret.low = low + other.low;
726  ret.high = high + other.high;
727  return ret;
728  }
729 
730  inline XMMReg2Double operator-(const XMMReg2Double &other) const
731  {
732  XMMReg2Double ret;
733  ret.low = low - other.low;
734  ret.high = high - other.high;
735  return ret;
736  }
737 
738  inline XMMReg2Double operator*(const XMMReg2Double &other) const
739  {
740  XMMReg2Double ret;
741  ret.low = low * other.low;
742  ret.high = high * other.high;
743  return ret;
744  }
745 
746  inline XMMReg2Double operator/(const XMMReg2Double &other) const
747  {
748  XMMReg2Double ret;
749  ret.low = low / other.low;
750  ret.high = high / other.high;
751  return ret;
752  }
753 
754  inline double GetHorizSum() const
755  {
756  return low + high;
757  }
758 
759  inline void Store2Val(double *ptr) const
760  {
761  ptr[0] = low;
762  ptr[1] = high;
763  }
764 
765  inline void Store2ValAligned(double *ptr) const
766  {
767  ptr[0] = low;
768  ptr[1] = high;
769  }
770 
771  inline void Store2Val(float *ptr) const
772  {
773  ptr[0] = static_cast<float>(low);
774  ptr[1] = static_cast<float>(high);
775  }
776 
777  void Store2Val(unsigned char *ptr) const
778  {
779  ptr[0] = (unsigned char)(low + 0.5);
780  ptr[1] = (unsigned char)(high + 0.5);
781  }
782 
783  void Store2Val(unsigned short *ptr) const
784  {
785  ptr[0] = (GUInt16)(low + 0.5);
786  ptr[1] = (GUInt16)(high + 0.5);
787  }
788 
789  inline void StoreMask(unsigned char *ptr) const
790  {
791  memcpy(ptr, &low, 8);
792  memcpy(ptr + 8, &high, 8);
793  }
794 
795  inline operator double() const
796  {
797  return low;
798  }
799 };
800 
801 #endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && !defined(USE_SSE2_EMULATION) */
802 
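/* Usage sketch (illustrative; GDALSSESketchSumIgnoringNoData is a
   hypothetical helper, not a GDAL API): sum an array of doubles while
   ignoring a nodata value, two elements at a time, using the class above. */
static inline double GDALSSESketchSumIgnoringNoData(const double *padfVals,
                                                    int nCount, double dfNoData)
{
    const XMMReg2Double noData = XMMReg2Double::Load1ValHighAndLow(&dfNoData);
    const XMMReg2Double zero = XMMReg2Double::Zero();
    XMMReg2Double sum = XMMReg2Double::Zero();
    int i = 0;
    for (; i + 1 < nCount; i += 2)
    {
        const XMMReg2Double v = XMMReg2Double::Load2Val(padfVals + i);
        /* Equals() yields a per-lane mask; Ternary() keeps the value where
           the lane differs from nodata and substitutes zero elsewhere. */
        sum += XMMReg2Double::Ternary(XMMReg2Double::Equals(v, noData), zero, v);
    }
    double dfSum = sum.GetHorizSum();
    for (; i < nCount; ++i)
    {
        if (padfVals[i] != dfNoData)
            dfSum += padfVals[i];
    }
    return dfSum;
}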
803 #if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
804 
805 #include <immintrin.h>
806 
807 class XMMReg4Double
808 {
809  public:
810  __m256d ymm;
811 
812  XMMReg4Double() : ymm(_mm256_setzero_pd())
813  {
814  }
815  XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
816  {
817  }
818 
819  static inline XMMReg4Double Zero()
820  {
821  XMMReg4Double reg;
822  reg.Zeroize();
823  return reg;
824  }
825 
826  inline void Zeroize()
827  {
828  ymm = _mm256_setzero_pd();
829  }
830 
831  static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
832  {
833  XMMReg4Double reg;
834  reg.nsLoad1ValHighAndLow(ptr);
835  return reg;
836  }
837 
838  inline void nsLoad1ValHighAndLow(const double *ptr)
839  {
840  ymm = _mm256_set1_pd(*ptr);
841  }
842 
843  static inline XMMReg4Double Load4Val(const unsigned char *ptr)
844  {
845  XMMReg4Double reg;
846  reg.nsLoad4Val(ptr);
847  return reg;
848  }
849 
850  inline void nsLoad4Val(const unsigned char *ptr)
851  {
852  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
853  xmm_i = _mm_cvtepu8_epi32(xmm_i);
854  ymm = _mm256_cvtepi32_pd(xmm_i);
855  }
856 
857  static inline XMMReg4Double Load4Val(const short *ptr)
858  {
859  XMMReg4Double reg;
860  reg.nsLoad4Val(ptr);
861  return reg;
862  }
863 
864  inline void nsLoad4Val(const short *ptr)
865  {
866  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
867  xmm_i = _mm_cvtepi16_epi32(xmm_i);
868  ymm = _mm256_cvtepi32_pd(xmm_i);
869  }
870 
871  static inline XMMReg4Double Load4Val(const unsigned short *ptr)
872  {
873  XMMReg4Double reg;
874  reg.nsLoad4Val(ptr);
875  return reg;
876  }
877 
878  inline void nsLoad4Val(const unsigned short *ptr)
879  {
880  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
881  xmm_i = _mm_cvtepu16_epi32(xmm_i);
882  ymm = _mm256_cvtepi32_pd(
883  xmm_i); // ok to use signed conversion since we are in the ushort
884  // range, so cannot be interpreted as negative int32
885  }
886 
887  static inline XMMReg4Double Load4Val(const double *ptr)
888  {
889  XMMReg4Double reg;
890  reg.nsLoad4Val(ptr);
891  return reg;
892  }
893 
894  inline void nsLoad4Val(const double *ptr)
895  {
896  ymm = _mm256_loadu_pd(ptr);
897  }
898 
899  static inline XMMReg4Double Load4ValAligned(const double *ptr)
900  {
901  XMMReg4Double reg;
902  reg.nsLoad4ValAligned(ptr);
903  return reg;
904  }
905 
906  inline void nsLoad4ValAligned(const double *ptr)
907  {
908  ymm = _mm256_load_pd(ptr);
909  }
910 
911  static inline XMMReg4Double Load4Val(const float *ptr)
912  {
913  XMMReg4Double reg;
914  reg.nsLoad4Val(ptr);
915  return reg;
916  }
917 
918  inline void nsLoad4Val(const float *ptr)
919  {
920  ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
921  }
922 
923  static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
924  const XMMReg4Double &expr2)
925  {
926  XMMReg4Double reg;
927  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
928  return reg;
929  }
930 
931  static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
932  const XMMReg4Double &expr2)
933  {
934  XMMReg4Double reg;
935  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
936  return reg;
937  }
938 
939  static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
940  const XMMReg4Double &expr2)
941  {
942  XMMReg4Double reg;
943  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
944  return reg;
945  }
946 
947  static inline XMMReg4Double And(const XMMReg4Double &expr1,
948  const XMMReg4Double &expr2)
949  {
950  XMMReg4Double reg;
951  reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
952  return reg;
953  }
954 
955  static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
956  const XMMReg4Double &true_expr,
957  const XMMReg4Double &false_expr)
958  {
959  XMMReg4Double reg;
960  reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
961  _mm256_andnot_pd(cond.ymm, false_expr.ymm));
962  return reg;
963  }
964 
965  static inline XMMReg4Double Min(const XMMReg4Double &expr1,
966  const XMMReg4Double &expr2)
967  {
968  XMMReg4Double reg;
969  reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
970  return reg;
971  }
972 
973  inline XMMReg4Double &operator=(const XMMReg4Double &other)
974  {
975  ymm = other.ymm;
976  return *this;
977  }
978 
979  inline XMMReg4Double &operator+=(const XMMReg4Double &other)
980  {
981  ymm = _mm256_add_pd(ymm, other.ymm);
982  return *this;
983  }
984 
985  inline XMMReg4Double &operator*=(const XMMReg4Double &other)
986  {
987  ymm = _mm256_mul_pd(ymm, other.ymm);
988  return *this;
989  }
990 
991  inline XMMReg4Double operator+(const XMMReg4Double &other) const
992  {
993  XMMReg4Double ret;
994  ret.ymm = _mm256_add_pd(ymm, other.ymm);
995  return ret;
996  }
997 
998  inline XMMReg4Double operator-(const XMMReg4Double &other) const
999  {
1000  XMMReg4Double ret;
1001  ret.ymm = _mm256_sub_pd(ymm, other.ymm);
1002  return ret;
1003  }
1004 
1005  inline XMMReg4Double operator*(const XMMReg4Double &other) const
1006  {
1007  XMMReg4Double ret;
1008  ret.ymm = _mm256_mul_pd(ymm, other.ymm);
1009  return ret;
1010  }
1011 
1012  inline XMMReg4Double operator/(const XMMReg4Double &other) const
1013  {
1014  XMMReg4Double ret;
1015  ret.ymm = _mm256_div_pd(ymm, other.ymm);
1016  return ret;
1017  }
1018 
1019  void AddToLow(const XMMReg2Double &other)
1020  {
1021  __m256d ymm2 = _mm256_setzero_pd();
1022  ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
1023  ymm = _mm256_add_pd(ymm, ymm2);
1024  }
1025 
1026  inline double GetHorizSum() const
1027  {
1028  __m256d ymm_tmp1, ymm_tmp2;
1029  ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
1030  ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
1031  ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
1032  return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
1033  }
1034 
1035  inline void Store4Val(unsigned char *ptr) const
1036  {
1037  __m128i xmm_i =
1038  _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1039  // xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1040  // xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1041  xmm_i =
1042  _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
1043  (12 << 24))); // SSSE3
1044  GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
1045  }
1046 
1047  inline void Store4Val(unsigned short *ptr) const
1048  {
1049  __m128i xmm_i =
1050  _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1051  xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
1052  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
1053  }
1054 
1055  inline void Store4Val(float *ptr) const
1056  {
1057  _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
1058  }
1059 
1060  inline void Store4Val(double *ptr) const
1061  {
1062  _mm256_storeu_pd(ptr, ymm);
1063  }
1064 
1065  inline void StoreMask(unsigned char *ptr) const
1066  {
1067  _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
1068  _mm256_castpd_si256(ymm));
1069  }
1070 };
1071 
1072 #else
1073 
1074 class XMMReg4Double
1075 {
1076  public:
1077  XMMReg2Double low, high;
1078 
1079 #if defined(__GNUC__)
1080 #pragma GCC diagnostic push
1081 #pragma GCC diagnostic ignored "-Weffc++"
1082 #endif
1083  /* coverity[uninit_member] */
1084  XMMReg4Double() = default;
1085 #if defined(__GNUC__)
1086 #pragma GCC diagnostic pop
1087 #endif
1088 
1089  XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
1090  {
1091  }
1092 
1093  static inline XMMReg4Double Zero()
1094  {
1095  XMMReg4Double reg;
1096  reg.low.Zeroize();
1097  reg.high.Zeroize();
1098  return reg;
1099  }
1100 
1101  static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
1102  {
1103  XMMReg4Double reg;
1104  reg.low.nsLoad1ValHighAndLow(ptr);
1105  reg.high = reg.low;
1106  return reg;
1107  }
1108 
1109  static inline XMMReg4Double Load4Val(const unsigned char *ptr)
1110  {
1111  XMMReg4Double reg;
1112  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1113  return reg;
1114  }
1115 
1116  static inline XMMReg4Double Load4Val(const short *ptr)
1117  {
1118  XMMReg4Double reg;
1119  reg.low.nsLoad2Val(ptr);
1120  reg.high.nsLoad2Val(ptr + 2);
1121  return reg;
1122  }
1123 
1124  static inline XMMReg4Double Load4Val(const unsigned short *ptr)
1125  {
1126  XMMReg4Double reg;
1127  reg.low.nsLoad2Val(ptr);
1128  reg.high.nsLoad2Val(ptr + 2);
1129  return reg;
1130  }
1131 
1132  static inline XMMReg4Double Load4Val(const double *ptr)
1133  {
1134  XMMReg4Double reg;
1135  reg.low.nsLoad2Val(ptr);
1136  reg.high.nsLoad2Val(ptr + 2);
1137  return reg;
1138  }
1139 
1140  static inline XMMReg4Double Load4ValAligned(const double *ptr)
1141  {
1142  XMMReg4Double reg;
1143  reg.low.nsLoad2ValAligned(ptr);
1144  reg.high.nsLoad2ValAligned(ptr + 2);
1145  return reg;
1146  }
1147 
1148  static inline XMMReg4Double Load4Val(const float *ptr)
1149  {
1150  XMMReg4Double reg;
1151  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1152  return reg;
1153  }
1154 
1155  static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
1156  const XMMReg4Double &expr2)
1157  {
1158  XMMReg4Double reg;
1159  reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1160  reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1161  return reg;
1162  }
1163 
1164  static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
1165  const XMMReg4Double &expr2)
1166  {
1167  XMMReg4Double reg;
1168  reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1169  reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1170  return reg;
1171  }
1172 
1173  static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
1174  const XMMReg4Double &expr2)
1175  {
1176  XMMReg4Double reg;
1177  reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1178  reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1179  return reg;
1180  }
1181 
1182  static inline XMMReg4Double And(const XMMReg4Double &expr1,
1183  const XMMReg4Double &expr2)
1184  {
1185  XMMReg4Double reg;
1186  reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1187  reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1188  return reg;
1189  }
1190 
1191  static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
1192  const XMMReg4Double &true_expr,
1193  const XMMReg4Double &false_expr)
1194  {
1195  XMMReg4Double reg;
1196  reg.low =
1197  XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1198  reg.high =
1199  XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1200  return reg;
1201  }
1202 
1203  static inline XMMReg4Double Min(const XMMReg4Double &expr1,
1204  const XMMReg4Double &expr2)
1205  {
1206  XMMReg4Double reg;
1207  reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1208  reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1209  return reg;
1210  }
1211 
1212  inline XMMReg4Double &operator=(const XMMReg4Double &other)
1213  {
1214  low = other.low;
1215  high = other.high;
1216  return *this;
1217  }
1218 
1219  inline XMMReg4Double &operator+=(const XMMReg4Double &other)
1220  {
1221  low += other.low;
1222  high += other.high;
1223  return *this;
1224  }
1225 
1226  inline XMMReg4Double &operator*=(const XMMReg4Double &other)
1227  {
1228  low *= other.low;
1229  high *= other.high;
1230  return *this;
1231  }
1232 
1233  inline XMMReg4Double operator+(const XMMReg4Double &other) const
1234  {
1235  XMMReg4Double ret;
1236  ret.low = low + other.low;
1237  ret.high = high + other.high;
1238  return ret;
1239  }
1240 
1241  inline XMMReg4Double operator-(const XMMReg4Double &other) const
1242  {
1243  XMMReg4Double ret;
1244  ret.low = low - other.low;
1245  ret.high = high - other.high;
1246  return ret;
1247  }
1248 
1249  inline XMMReg4Double operator*(const XMMReg4Double &other) const
1250  {
1251  XMMReg4Double ret;
1252  ret.low = low * other.low;
1253  ret.high = high * other.high;
1254  return ret;
1255  }
1256 
1257  inline XMMReg4Double operator/(const XMMReg4Double &other) const
1258  {
1259  XMMReg4Double ret;
1260  ret.low = low / other.low;
1261  ret.high = high / other.high;
1262  return ret;
1263  }
1264 
1265  void AddToLow(const XMMReg2Double &other)
1266  {
1267  low += other;
1268  }
1269 
1270  inline double GetHorizSum() const
1271  {
1272  return (low + high).GetHorizSum();
1273  }
1274 
1275  inline void Store4Val(unsigned char *ptr) const
1276  {
1277 #ifdef USE_SSE2_EMULATION
1278  low.Store2Val(ptr);
1279  high.Store2Val(ptr + 2);
1280 #else
1281  __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
1282  low.xmm,
1283  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1284  __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
1285  high.xmm,
1286  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1287  auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
1288  _mm_castsi128_ps(tmpHigh),
1289  _MM_SHUFFLE(1, 0, 1, 0)));
1290  tmp = _mm_packs_epi32(tmp, tmp);
1291  tmp = _mm_packus_epi16(tmp, tmp);
1292  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
1293 #endif
1294  }
1295 
1296  inline void Store4Val(unsigned short *ptr) const
1297  {
1298 #if 1
1299  low.Store2Val(ptr);
1300  high.Store2Val(ptr + 2);
1301 #else
1302  __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
1303  __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
1304  xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
1305 #if __SSE4_1__
1306  xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
1307 #else
1308  xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
1309  xmm0 = _mm_packs_epi32(xmm0, xmm0);
1310  xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
1311 #endif
1312  GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
1313 #endif
1314  }
1315 
1316  inline void Store4Val(float *ptr) const
1317  {
1318  low.Store2Val(ptr);
1319  high.Store2Val(ptr + 2);
1320  }
1321 
1322  inline void Store4Val(double *ptr) const
1323  {
1324  low.Store2Val(ptr);
1325  high.Store2Val(ptr + 2);
1326  }
1327 
1328  inline void StoreMask(unsigned char *ptr) const
1329  {
1330  low.StoreMask(ptr);
1331  high.StoreMask(ptr + 16);
1332  }
1333 };
1334 
1335 #endif
1336 
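/* Usage sketch (illustrative; GDALSSESketchWeightedSum is a hypothetical
   helper, not a GDAL API): accumulate a weighted sum of float samples four
   at a time with XMMReg4Double, the kind of inner loop a resampling kernel
   would run. */
static inline double GDALSSESketchWeightedSum(const float *pafVals,
                                              const double *padfWeights,
                                              int nCount)
{
    XMMReg4Double sum = XMMReg4Double::Zero();
    int i = 0;
    for (; i + 3 < nCount; i += 4)
    {
        const XMMReg4Double v = XMMReg4Double::Load4Val(pafVals + i);
        const XMMReg4Double w = XMMReg4Double::Load4Val(padfWeights + i);
        sum += v * w; /* element-wise multiply, then accumulate */
    }
    double dfSum = sum.GetHorizSum();
    for (; i < nCount; ++i)
        dfSum += pafVals[i] * padfWeights[i];
    return dfSum;
}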
1337 #endif /* #ifndef DOXYGEN_SKIP */
1338 
1339 #endif /* GDALSSE_PRIV_H_INCLUDED */