00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #ifndef GDALSSE_PRIV_H_INCLUDED
00031 #define GDALSSE_PRIV_H_INCLUDED
00032
00033
00034
00035 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
00036
00037
00038 #include <emmintrin.h>
00039 #include <string.h>
00040
/**
 * Two double-precision values held in a single SSE2 register (__m128d).
 *
 * "Low" is lane 0 of the register and "high" is lane 1. The static
 * Load2Val()/Load4Val() helpers build registers from two (or four)
 * consecutive source elements, widening them to double; the nsLoad2Val()
 * members do the same in place on an existing object.
 */
class XMMReg2Double
{
  public:
    __m128d xmm;

    /** Leaves the register contents uninitialized. */
    XMMReg2Double() {}

    /** Puts val in the low lane; _mm_load_sd() zeroes the high lane. */
    XMMReg2Double(double val) { xmm = _mm_load_sd(&val); }

    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    /** Returns a register with both lanes set to 0.0. */
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    /** Loads ptr[0], ptr[1] (no alignment requirement). */
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Loads ptr[0], ptr[1] and widens them to double. */
    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Loads ptr[0], ptr[1] with _mm_load_pd(), i.e. ptr must be 16-byte aligned. */
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    /** Loads ptr[0], ptr[1] and widens them to double. */
    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Loads ptr[0], ptr[1] (sign-extended) and widens them to double. */
    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Loads ptr[0], ptr[1] (zero-extended) and widens them to double. */
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        xmm = _mm_load_pd(pval);
    }

    inline void nsLoad2Val(const float* pval)
    {
        // Each float is read individually so that only pval[0] and pval[1]
        // are touched, then both are packed into the two lowest lanes
        // before the float -> double conversion.
        __m128 temp1 = _mm_load_ss(pval);
        __m128 temp2 = _mm_load_ss(pval + 1);
        temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
        temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
        xmm = _mm_cvtps_pd(temp1);
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        // memcpy rather than a pointer cast: ptr carries no alignment
        // guarantee and must not be read through an unsigned short lvalue.
        unsigned short twoBytes;
        memcpy(&twoBytes, ptr, 2);
        __m128i xmm_i = _mm_cvtsi32_si128(twoBytes);
        // Interleaving with zero widens 8 -> 16 -> 32 bits (zero extension).
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        // Duplicate each 16-bit value into both halves of a 32-bit lane,
        // then shift arithmetically to obtain the sign-extended value.
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
        xmm_i = _mm_srai_epi32(xmm_i, 16);
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        // Same trick as the signed variant, but the logical shift
        // zero-extends instead of sign-extending.
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
        xmm_i = _mm_srli_epi32(xmm_i, 16);
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    /** Loads ptr[0..3]: ptr[0], ptr[1] go to low and ptr[2], ptr[3] to high. */
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        // memcpy rather than *(int*)ptr: avoids the aliasing violation and
        // any assumption about the alignment of ptr.
        int fourBytes;
        memcpy(&fourBytes, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(fourBytes);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        // Move 32-bit lanes 2 and 3 down to lanes 0 and 1 for the conversion.
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        // temp2 holds floats 2 and 3 in its two lowest lanes.
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    /** Sets both lanes to 0.0. */
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    // The binary operators are const-qualified on purpose: without it, an
    // expression on const operands would not select these lane-wise
    // overloads but fall back to operator double() and scalar arithmetic,
    // silently losing the high lane.

    /** Lane-wise sum. */
    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    /** Lane-wise difference. */
    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    /** Lane-wise product. */
    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    /** Replaces both lanes with (low + high). */
    inline void AddLowAndHigh()
    {
        __m128d xmm2;
        // xmm2 is xmm with its two lanes swapped.
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1));
        xmm = _mm_add_pd(xmm, xmm2);
    }

    /** Writes low to pval[0] and high to pval[1] (no alignment requirement). */
    inline void Store2Double(double* pval) const
    {
        _mm_storeu_pd(pval, xmm);
    }

    /** Same as Store2Double() but uses _mm_store_pd(): pval must be 16-byte aligned. */
    inline void Store2DoubleAligned(double* pval) const
    {
        _mm_store_pd(pval, xmm);
    }

    /** Returns the low lane. */
    inline operator double () const
    {
        double val;
        _mm_store_sd(&val, xmm);
        return val;
    }
};
00249
00250 #else
00251
00252 #warning "Software emulation of SSE2 !"
00253
/**
 * Portable stand-in for the SSE2-backed XMMReg2Double: the two lanes are
 * plain doubles named low and high, and every operation is applied to each
 * lane with ordinary scalar arithmetic.
 */
class XMMReg2Double
{
  public:
    double low;
    double high;

    /** Leaves both lanes uninitialized. */
    XMMReg2Double() {}

    /** Puts val in the low lane and 0.0 in the high lane. */
    XMMReg2Double(double val) { low = val; high = 0.0; }

    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    /** Returns a register with both lanes set to 0.0. */
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    /** Loads ptr[0] into low and ptr[1] into high. */
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /** Same as Load2Val(const double*); no alignment is required here. */
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    // Provided so that this class offers the same set of Load2Val()
    // overloads as the SSE2 implementation.
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const float* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    /** Loads ptr[0..3]: ptr[0], ptr[1] go to low and ptr[2], ptr[3] to high. */
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /** Sets both lanes to 0.0. */
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    // The binary operators are const-qualified on purpose: without it, an
    // expression on const operands would not select these lane-wise
    // overloads but fall back to operator double() and scalar arithmetic,
    // silently losing the high lane.

    /** Lane-wise sum. */
    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    /** Lane-wise difference. */
    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    /** Lane-wise product. */
    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    /** Replaces both lanes with (low + high). */
    inline void AddLowAndHigh()
    {
        double add = low + high;
        low = add;
        high = add;
    }

    /** Writes low to pval[0] and high to pval[1]. */
    inline void Store2Double(double* pval) const
    {
        pval[0] = low;
        pval[1] = high;
    }

    /** Same as Store2Double(); no alignment is required here. */
    inline void Store2DoubleAligned(double* pval) const
    {
        pval[0] = low;
        pval[1] = high;
    }

    /** Returns the low lane. */
    inline operator double () const
    {
        return low;
    }
};
00449
00450 #endif
00451
00452 class XMMReg4Double
00453 {
00454 public:
00455 XMMReg2Double low, high;
00456
00457 XMMReg4Double() {}
00458 XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
00459
00460 static inline XMMReg4Double Zero()
00461 {
00462 XMMReg4Double reg;
00463 reg.low.Zeroize();
00464 reg.high.Zeroize();
00465 return reg;
00466 }
00467
00468 static inline XMMReg4Double Load4Val(const unsigned char* ptr)
00469 {
00470 XMMReg4Double reg;
00471 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
00472 return reg;
00473 }
00474
00475 static inline XMMReg4Double Load4Val(const short* ptr)
00476 {
00477 XMMReg4Double reg;
00478 reg.low.nsLoad2Val(ptr);
00479 reg.high.nsLoad2Val(ptr+2);
00480 return reg;
00481 }
00482
00483 static inline XMMReg4Double Load4Val(const unsigned short* ptr)
00484 {
00485 XMMReg4Double reg;
00486 reg.low.nsLoad2Val(ptr);
00487 reg.high.nsLoad2Val(ptr+2);
00488 return reg;
00489 }
00490
00491 static inline XMMReg4Double Load4Val(const double* ptr)
00492 {
00493 XMMReg4Double reg;
00494 reg.low.nsLoad2Val(ptr);
00495 reg.high.nsLoad2Val(ptr+2);
00496 return reg;
00497 }
00498
00499 static inline XMMReg4Double Load4ValAligned(const double* ptr)
00500 {
00501 XMMReg4Double reg;
00502 reg.low.nsLoad2ValAligned(ptr);
00503 reg.high.nsLoad2ValAligned(ptr+2);
00504 return reg;
00505 }
00506
00507 static inline XMMReg4Double Load4Val(const float* ptr)
00508 {
00509 XMMReg4Double reg;
00510 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
00511 return reg;
00512 }
00513
00514 inline const XMMReg4Double& operator= (const XMMReg4Double& other)
00515 {
00516 low = other.low;
00517 high = other.high;
00518 return *this;
00519 }
00520
00521 inline const XMMReg4Double& operator+= (const XMMReg4Double& other)
00522 {
00523 low += other.low;
00524 high += other.high;
00525 return *this;
00526 }
00527
00528 inline XMMReg4Double operator+ (const XMMReg4Double& other)
00529 {
00530 XMMReg4Double ret;
00531 ret.low = low + other.low;
00532 ret.high = high + other.high;
00533 return ret;
00534 }
00535
00536 inline XMMReg4Double operator- (const XMMReg4Double& other)
00537 {
00538 XMMReg4Double ret;
00539 ret.low = low - other.low;
00540 ret.high = high - other.high;
00541 return ret;
00542 }
00543
00544 inline XMMReg4Double operator* (const XMMReg4Double& other)
00545 {
00546 XMMReg4Double ret;
00547 ret.low = low * other.low;
00548 ret.high = high * other.high;
00549 return ret;
00550 }
00551
00552 inline const XMMReg4Double& operator*= (const XMMReg4Double& other)
00553 {
00554 low *= other.low;
00555 high *= other.high;
00556 return *this;
00557 }
00558
00559 inline void AddLowAndHigh()
00560 {
00561 low = low + high;
00562 low.AddLowAndHigh();
00563 }
00564
00565 inline XMMReg2Double& GetLow()
00566 {
00567 return low;
00568 }
00569 };
00570
00571 #endif