00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 #ifndef _philox_dot_h_
00033 #define _philox_dot_h_
00034 
00037 #include "features/compilerfeatures.h"
00038 #include "array.h"
00039 
00040 
00041 
00042 
00043 
00044 
00045 
00046 
00047 
00048 
00049 
00050 
00051 
00052 
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 
00066 
/*
 * _mulhilo_dword_tpl(W, Word, Dword): define mulhilo##W using a
 * double-width integer type.
 *
 *   W     - number of bits in Word; used as the shift count and pasted
 *           into the generated function name
 *   Word  - operand/result type (this file instantiates it with
 *           uint32_t and uint64_t)
 *   Dword - type in which the full product is computed (this file uses
 *           uint64_t and __uint128_t)
 *
 * The generated function multiplies a by b in Dword precision, stores the
 * product shifted right by W bits through hip, and returns the product
 * truncated to Word.
 */
#define _mulhilo_dword_tpl(W, Word, Dword)                              \
R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    Dword product = ((Dword)a)*((Dword)b);                              \
    *hip = (Word)(product>>W);                                          \
    return (Word)product;                                               \
}
00073 
00074 
00075 
00076 
00077 
00078 
00079 
/*
 * _mulhilo_asm_tpl(W, Word, INSN): define mulhilo##W with GNU-style inline
 * assembly.  INSN is the assembler mnemonic, passed as a string literal
 * so that it concatenates with the operand template.
 *
 * The generated function stores the value produced for the high word
 * through hip and returns the low word of ax*b.
 */
#ifdef __powerpc__
/*
 * PowerPC form: "INSN dst,src1,src2".  The instruction result is written
 * to dx and reported as the high word; the low word is an ordinary C
 * multiply of the two operands.
 */
#define _mulhilo_asm_tpl(W, Word, INSN)                         \
R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
    Word dx = 0;                                                \
    __asm__("\n\t"                                              \
        INSN " %0,%1,%2\n\t"                                    \
        : "=r"(dx)                                              \
        : "r"(b), "r"(ax)                                       \
        );                                                      \
    *hip = dx;                                                  \
    return ax*b;                                                \
}
#else
/*
 * One-operand multiply form: ax is tied to the "a" register as both input
 * ("0") and output, b is the single explicit operand (%2), and the high
 * word is read back from the "d" register.
 */
#define _mulhilo_asm_tpl(W, Word, INSN)                         \
R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
    Word dx;                                                    \
    __asm__("\n\t"                                              \
        INSN " %2\n\t"                                          \
        : "=a"(ax), "=d"(dx)                                    \
        : "r"(b), "0"(ax)                                       \
        );                                                      \
    *hip = dx;                                                  \
    return ax;                                                  \
}
#endif /* __powerpc__ */
00105 
00106 
00107 
00108 
00109 
00110 
/*
 * _mulhilo_msvc_intrin_tpl(W, Word, INTRIN): define mulhilo##W as a thin
 * wrapper around a three-argument intrinsic.
 *
 * INTRIN(a, b, hip) is called as-is and its return value is returned, so
 * it is expected to return the low word of a*b and to store the high word
 * through hip itself (this file instantiates it with _umul128).
 */
#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN)               \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){  \
    return INTRIN(a, b, hip);                                   \
}
00115 
00116 
00117 
/*
 * _mulhilo_cuda_intrin_tpl(W, Word, INTRIN): define mulhilo##W on top of a
 * two-argument "multiply high" intrinsic.
 *
 * INTRIN(a, b) supplies the value stored through hip; the returned low
 * word is the ordinary Word-width product a*b.  This file instantiates the
 * template with __umul64hi, mul_hi and R123_MULHILO64_MULHI_INTRIN.
 */
#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN)                       \
R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    *hip = INTRIN(a, b);                                                \
    return a*b;                                                         \
}
00123 
00124 
00125 
00126 
00127 
00128 
00129 
00130 
00131 
00132 
00133 
00134 
00135 
00136 
/*
 * _mulhilo_c99_tpl(W, Word): define mulhilo##W using only Word-width
 * arithmetic (no wider type, no intrinsics, no assembly).
 *
 * Each operand is split into two W/2-bit halves and the high word is
 * assembled from the partial products:
 *   - ahi*bhi contributes directly to the high word;
 *   - the upper halves of the cross products ahi*blo and alo*bhi are added
 *     to the high word;
 *   - the lower halves of the two cross products are summed separately in
 *     ahbl_albh, and whatever spills above W/2 bits is carried into hi;
 *   - finally, one more carry is added when the upper half of the
 *     truncated product lo is smaller than the low W/2 bits of ahbl_albh,
 *     i.e. when adding those bits to the upper half of alo*blo wrapped.
 *
 * The generated function stores the high word through hip and returns the
 * low word (the truncated product a*b).
 */
#define _mulhilo_c99_tpl(W, Word)                                       \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){          \
    const unsigned WHALF = W/2;                                         \
    const Word LOMASK = ((((Word)1)<<WHALF)-1);                         \
    Word lo = a*b;                                                      \
    Word ahi = a>>WHALF;                                                \
    Word alo = a& LOMASK;                                               \
    Word bhi = b>>WHALF;                                                \
    Word blo = b& LOMASK;                                               \
                                                                        \
    Word ahbl = ahi*blo;                                                \
    Word albh = alo*bhi;                                                \
                                                                        \
    Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK));                   \
    Word hi = ahi*bhi + (ahbl>>WHALF) +  (albh>>WHALF);                 \
    hi += ahbl_albh >> WHALF;                                           \
    hi += ((lo >> WHALF) < (ahbl_albh&LOMASK));                         \
    *hip = hi;                                                          \
    return lo;                                                          \
}
00158 
00159 
00160 
00161 
00162 
00163 
/*
 * _mulhilo_fail_tpl(W, Word): placeholder definition of mulhilo##W for
 * configurations where none of the real implementations was selected.
 *
 * The body consists solely of R123_STATIC_ASSERT(0, ...), so instantiating
 * this template is intended to stop the build with the message
 * "mulhilo<W> is not implemented on this machine".  There is deliberately
 * no return statement: the function is never meant to compile.
 */
#define _mulhilo_fail_tpl(W, Word)                                      \
R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){          \
    R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
}
00168 
00169 
00170 
00171 
00172 
00173 
00174 #if R123_USE_MULHILO32_ASM
00175 #ifdef __powerpc__
00176 _mulhilo_asm_tpl(32, uint32_t, "mulhwu")
00177 #else
00178 _mulhilo_asm_tpl(32, uint32_t, "mull")
00179 #endif 
00180 #else
00181 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
00182 #endif
00183 
00184 #if R123_USE_PHILOX_64BIT
00185 #if R123_USE_MULHILO64_ASM
00186 #ifdef __powerpc64__
00187 _mulhilo_asm_tpl(64, uint64_t, "mulhdu")
00188 #else
00189 _mulhilo_asm_tpl(64, uint64_t, "mulq")
00190 #endif 
00191 #elif R123_USE_MULHILO64_MSVC_INTRIN
00192 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
00193 #elif R123_USE_MULHILO64_CUDA_INTRIN
00194 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
00195 #elif R123_USE_MULHILO64_OPENCL_INTRIN
00196 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
00197 #elif R123_USE_MULHILO64_MULHI_INTRIN
00198 _mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
00199 #elif R123_USE_GNU_UINT128
00200 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
00201 #elif R123_USE_MULHILO64_C99
00202 _mulhilo_c99_tpl(64, uint64_t)
00203 #else
00204 _mulhilo_fail_tpl(64, uint64_t)
00205 #endif
00206 #endif
00207 
00208 
00209 
00210 
00211 
00212 
00213 
00214 
00215 
/*
 * Philox tuning constants.  Every definition is wrapped in #ifndef so a
 * value can be supplied before this point to override it.
 */

/* Multipliers passed to mulhilo64 by the 2x64 and 4x64 round functions. */
#ifndef PHILOX_M2x64_0
#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
#endif

#ifndef PHILOX_M4x64_0
#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
#endif

#ifndef PHILOX_M4x64_1
#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
#endif

/* Multipliers passed to mulhilo32 by the 2x32 and 4x32 round functions. */
#ifndef PHILOX_M2x32_0
#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
#endif

#ifndef PHILOX_M4x32_0
#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
#endif
#ifndef PHILOX_M4x32_1
#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
#endif

/* Increments added to the 64-bit key words by the bumpkey functions. */
#ifndef PHILOX_W64_0
#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)
#endif
#ifndef PHILOX_W64_1
#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)
#endif

/* Increments added to the 32-bit key words by the bumpkey functions. */
#ifndef PHILOX_W32_0
#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
#endif
#ifndef PHILOX_W32_1
#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
#endif

/* Round counts used to initialise the philoxNxW_rounds enumerators. */
#ifndef PHILOX2x32_DEFAULT_ROUNDS
#define PHILOX2x32_DEFAULT_ROUNDS 10
#endif

#ifndef PHILOX2x64_DEFAULT_ROUNDS
#define PHILOX2x64_DEFAULT_ROUNDS 10
#endif

#ifndef PHILOX4x32_DEFAULT_ROUNDS
#define PHILOX4x32_DEFAULT_ROUNDS 10
#endif

#ifndef PHILOX4x64_DEFAULT_ROUNDS
#define PHILOX4x64_DEFAULT_ROUNDS 10
#endif
00268 
00269 
00270 
/*
 * _philox2xWround_tpl(W, T): define _philox2x##W##round, one round of the
 * two-word Philox variant for word width W (T is the word type).
 *
 * The round multiplies ctr.v[0] by PHILOX_M2x##W##_0 with mulhilo##W and
 * returns { hi ^ key.v[0] ^ ctr.v[1], lo }.
 *
 * A prototype wrapped in R123_FORCE_INLINE is emitted first, followed by
 * the definition itself.
 */
#define _philox2xWround_tpl(W, T)                                       \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
    T hi;                                                               \
    T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi);                \
    struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}};           \
    return out;                                                         \
}
/*
 * _philox2xWbumpkey_tpl(W): define _philox2x##W##bumpkey, which returns a
 * copy of the one-word key with PHILOX_W##W##_0 added to key.v[0].
 */
#define _philox2xWbumpkey_tpl(W)                                        \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
    key.v[0] += PHILOX_W##W##_0;                                        \
    return key;                                                         \
}
00284 
/*
 * _philox4xWround_tpl(W, T): define _philox4x##W##round, one round of the
 * four-word Philox variant for word width W (T is the word type).
 *
 * Two multiplications are performed with mulhilo##W:
 *   (hi0, lo0) from PHILOX_M4x##W##_0 * ctr.v[0]
 *   (hi1, lo1) from PHILOX_M4x##W##_1 * ctr.v[2]
 * and the result is
 *   { hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0 }.
 *
 * A prototype wrapped in R123_FORCE_INLINE is emitted first, followed by
 * the definition itself.
 */
#define _philox4xWround_tpl(W, T)                                       \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
    T hi0;                                                              \
    T hi1;                                                              \
    T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0);              \
    T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1);              \
    struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1,           \
                              hi0^ctr.v[3]^key.v[1], lo0}};             \
    return out;                                                         \
}
00296 
/*
 * _philox4xWbumpkey_tpl(W): define _philox4x##W##bumpkey, which returns a
 * copy of the two-word key with PHILOX_W##W##_0 added to key.v[0] and
 * PHILOX_W##W##_1 added to key.v[1].
 */
#define _philox4xWbumpkey_tpl(W)                                        \
R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
    key.v[0] += PHILOX_W##W##_0;                                        \
    key.v[1] += PHILOX_W##W##_1;                                        \
    return key;                                                         \
}
00303 
/*
 * _philoxNxW_tpl(N, Nhalf, W, T): emit the C API for one Philox variant
 * with N counter words and Nhalf key words of width W.
 *
 * Generated names (with N and W substituted):
 *   philoxNxW_rounds   enumerator equal to PHILOXNxW_DEFAULT_ROUNDS
 *   philoxNxW_ctr_t    typedef of struct r123arrayNxW
 *   philoxNxW_key_t    typedef of struct r123array<Nhalf>xW
 *   philoxNxW_ukey_t   typedef of struct r123array<Nhalf>xW
 *   philoxNxWkeyinit   returns its argument unchanged
 *   philoxNxW_R        applies R rounds to ctr under key and returns ctr
 *
 * philoxNxW_R asserts R <= 16 via R123_ASSERT and is written as sixteen
 * consecutive "if (R > k)" steps rather than a loop.  The first round uses
 * the key as given; every later round first advances the key with
 * _philoxNxWbumpkey.  T is accepted for symmetry with the other templates
 * and is not referenced in the expansion.
 */
#define _philoxNxW_tpl(N, Nhalf, W, T)                         \
enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
typedef struct r123array##N##x##W philox##N##x##W##_ctr_t;                  \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t;              \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t;              \
R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
    R123_ASSERT(R<=16);                                                 \
    if(R>0){                                       ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
    return ctr;                                                         \
}
00332          
00333 _philox2xWbumpkey_tpl(32)
00334 _philox4xWbumpkey_tpl(32)
00335 _philox2xWround_tpl(32, uint32_t) 
00336 _philox4xWround_tpl(32, uint32_t)            
00338 _philoxNxW_tpl(2, 1, 32, uint32_t)    
00339 _philoxNxW_tpl(4, 2, 32, uint32_t)    
00340 #if R123_USE_PHILOX_64BIT
00341 
00342 _philox2xWbumpkey_tpl(64)
00343 _philox4xWbumpkey_tpl(64)
00344 _philox2xWround_tpl(64, uint64_t) 
00345 _philox4xWround_tpl(64, uint64_t) 
00347 _philoxNxW_tpl(2, 1, 64, uint64_t)    
00348 _philoxNxW_tpl(4, 2, 64, uint64_t)    
00349 #endif 
00350 
/*
 * Convenience wrappers: philoxNxW(c,k) calls philoxNxW_R with the round
 * count fixed to philoxNxW_rounds.  The 64-bit wrappers exist only when
 * R123_USE_PHILOX_64BIT is set.
 */
#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
#if R123_USE_PHILOX_64BIT
#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
#endif /* R123_USE_PHILOX_64BIT */
00357 
00358 #ifdef __cplusplus
00359 #include <stdexcept>
00360 
00363 #define _PhiloxNxW_base_tpl(CType, KType, N, W)                         \
00364 namespace r123{                                                          \
00365 template<unsigned int ROUNDS>                                             \
00366 struct Philox##N##x##W##_R{                                             \
00367     typedef CType ctr_type;                                         \
00368     typedef KType key_type;                                             \
00369     typedef KType ukey_type;                                         \
00370     static const unsigned int rounds=ROUNDS;                                 \
00371     inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
00372         R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
00373         return philox##N##x##W##_R(ROUNDS, ctr, key);                       \
00374     }                                                                   \
00375 };                                                                      \
00376 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
00377  } // namespace r123
00378 
00380 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) 
00381 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) 
00382 #if R123_USE_PHILOX_64BIT
00383 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) 
00384 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) 
00385 #endif
00386 
00387 
00388 
00389 
00484 #endif 
00485 
00486 #endif