00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 #ifndef _Random123_sse_dot_h__
00033 #define _Random123_sse_dot_h__
00034 
00035 #if R123_USE_SSE
00036 
00037 #if R123_USE_X86INTRIN_H
00038 #include <x86intrin.h>
00039 #endif
00040 #if R123_USE_IA32INTRIN_H
00041 #include <ia32intrin.h>
00042 #endif
00043 #if R123_USE_XMMINTRIN_H
00044 #include <xmmintrin.h>
00045 #endif
00046 #if R123_USE_EMMINTRIN_H
00047 #include <emmintrin.h>
00048 #endif
00049 #if R123_USE_SMMINTRIN_H
00050 #include <smmintrin.h>
00051 #endif
00052 #if R123_USE_WMMINTRIN_H
00053 #include <wmmintrin.h>
00054 #endif
00055 #if R123_USE_INTRIN_H
00056 #include <intrin.h>
00057 #endif
00058 #ifdef __cplusplus
00059 #include <iostream>
00060 #include <limits>
00061 #include <stdexcept>
00062 #endif
00063 
00064 #if R123_USE_ASM_GNU
00065 
00066 
00067 R123_STATIC_INLINE int haveAESNI(){
00068     unsigned int eax, ebx, ecx, edx;
00069     __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
00070                       "a" (1));
00071     return (ecx>>25) & 1;
00072 }
00073 #elif R123_USE_CPUID_MSVC
00074 R123_STATIC_INLINE int haveAESNI(){
00075     int CPUInfo[4];
00076     __cpuid(CPUInfo, 1);
00077     return (CPUInfo[2]>>25)&1;
00078 }
00079 #else 
00080 #warning "No R123_USE_CPUID_XXX method chosen.  haveAESNI will always return false"
00081 R123_STATIC_INLINE int haveAESNI(){
00082     return 0;
00083 }
00084 #endif 
00085 
00086 
00087 
00088 
00089 
00090 
00091 
00092 
00093 #if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
00094 
00095 
00096 
00097 
00098 R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
00099     union{
00100         uint64_t u64;
00101         uint32_t u32[2];
00102     } u1, u0;
00103     u1.u64 = v1;
00104     u0.u64 = v0;
00105     return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
00106 }
00107 #endif
00108 
00109 
00110 
00111 
00112 
00113 
00114 
00115 
00116 
00117 
00118 #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
00119 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00120     union{
00121         uint64_t u64[2];
00122         __m128i m;
00123     }u;
00124     _mm_store_si128(&u.m, si);
00125     return u.u64[0];
00126 }
00127 #elif defined(__llvm__) || defined(__ICC)
00128 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00129     return (uint64_t)_mm_cvtsi128_si64(si);
00130 }
00131 #else 
00132 
00133 
00134 
00135 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00136     return (uint64_t)_mm_cvtsi128_si64x(si);
00137 }
00138 #endif
00139 #if defined(__GNUC__) && __GNUC__ < 4
00140 
00141 R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
00142     return (__m128)si;
00143 }
00144 #endif
00145 
00146 #ifdef __cplusplus
00147 
00148 struct r123m128i{
00149     __m128i m;
00150 #if R123_USE_CXX11_UNRESTRICTED_UNIONS
00151     
00152     
00153     
00154     
00155     
00156     r123m128i() = default;
00157     r123m128i(__m128i _m): m(_m){}
00158 #endif
00159     r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
00160     r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
00161 #if R123_USE_CXX11_EXPLICIT_CONVERSIONS
00162     
00163     
00164     
00165     explicit operator bool() const {return _bool();}
00166 #else
00167     
00168     
00169     operator const void*() const{return _bool()?this:0;}
00170 #endif
00171     operator __m128i() const {return m;}
00172 
00173 private:
00174 #if R123_USE_SSE4_1
00175     bool _bool() const{ return !_mm_testz_si128(m,m); }
00176 #else
00177     bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
00178 #endif
00179 };
00180 
00181 R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
00182     __m128i& c = v.m;
00183     __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
00184     c = _mm_add_epi64(c, zeroone);
00185     
00186 #if R123_USE_SSE4_1
00187     __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
00188     if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
00189         __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
00190         c = _mm_add_epi64(c, onezero);
00191     }
00192 #else
00193     unsigned mask  = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
00194     
00195     
00196     if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
00197         __m128i onezero = _mm_set_epi64x(1,0);
00198         c = _mm_add_epi64(c, onezero);
00199     }
00200 #endif
00201     return v;
00202 }
00203 
00204 R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ 
00205     __m128i c = lhs.m;
00206     __m128i incr128 = _mm_set_epi64x(0, n);
00207     c = _mm_add_epi64(c, incr128);
00208     
00209 
00210     int64_t lo64 = _mm_extract_lo64(c);
00211     if((uint64_t)lo64 < n)
00212         c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
00213     lhs.m = c;
00214     return lhs; 
00215 }
00216 
00217 
00218 R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
00219     throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
00220 
00221 
00222 
00223 
00224 R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
00225     throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
00226 R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
00227     throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
00228 R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
00229     throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
00230 R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
00231     throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
00232 
00233 R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ 
00234     return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
00235 R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ 
00236     return !(lhs==rhs);}
00237 R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
00238     r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
00239 R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00240     return !(lhs==rhs);}
00241 R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
00242     union{
00243         uint64_t u64[2];
00244         __m128i m;
00245     }u;
00246     _mm_storeu_si128(&u.m, m.m);
00247     return os << u.u64[0] << " " << u.u64[1];
00248 }
00249 
00250 R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
00251     uint64_t u64[2];
00252     is >> u64[0] >> u64[1];
00253     m.m = _mm_set_epi64x(u64[1], u64[0]);
00254     return is;
00255 }
00256 
00257 template<typename T> inline T assemble_from_u32(uint32_t *p32); 
00258 
00259 template <>
00260 inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
00261     r123m128i ret;
00262     ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
00263     return ret;
00264 }
00265 
00266 #else
00267 
/* C counterpart of the C++ r123m128i wrapper: a plain struct holding a
 * single 128-bit SSE integer vector, with no helper operations. */
typedef struct
{
    __m128i m;   /* the wrapped vector */
}
r123m128i;
00271 
00272 #endif 
00273 
00274 #else 
00275 R123_STATIC_INLINE int haveAESNI(){
00276     return 0;
00277 }
00278 #endif 
00279 
00280 #endif