/****************************************************/
/*                                                  */
/*                  intrinsic.hh:                   */
/*                                                  */
/*                                                  */
/*               Nicolas Scaringella                */
/*                                                  */
/****************************************************/



// now in the architecture files:
// inline void *aligned_calloc(size_t nmemb, size_t size) { return (void*)((unsigned)(calloc((nmemb*size)+15,sizeof(char)))+15 & 0xfffffff0); }


#ifdef __SSE2__

/****************************************************/
/*                                                  */
/*               SSE2 implementation                */
/*                                                  */
/****************************************************/


#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
//#include <sse2mmx.h>


struct vec_int
{
    __m128i vec;

    vec_int() {}

    vec_int(int a) { vec = _mm_set_epi32(a,a,a,a); }

    vec_int(int a, int b, int c, int d) { vec = _mm_set_epi32(d,c,b,a); }

    vec_int(__m128i m) { vec = m; }

    operator __m128i() const { return vec; }

    const int& operator[](int i) const { int* ip = (int*)&vec; return *(ip+i); }

    int& operator[](int i) { int* ip = (int*)&vec; return *(ip+i); }
};

struct vec_float
{
    __m128 vec;

    vec_float() {}

    vec_float(float a) { vec = _mm_set_ps1(a); }

    vec_float(float a, float b, float c, float d) { vec = _mm_set_ps(d,c,b,a); }

    vec_float(__m128 m) { vec = m; }

    //vec_float(vec_int vi) { vec = _mm_cvtepi32_ps(vi); }

    operator __m128() const { return vec; }

    const float& operator[](int i) const { float* fp = (float*)&vec; return *(fp+i); }

    float& operator[](int i) { float* fp = (float*)&vec; return *(fp+i); }
};

// Flush-to-zero mode: a zero result is returned on underflow.
// Not compatible with the IEEE 754 standard ( which
// delivers a denormalized result in case of underflow )
#define NO_DENORMALIZE _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (_MM_FLUSH_ZERO_ON))
#define DENORMALIZE _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (_MM_FLUSH_ZERO_OFF))
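
// Illustrative usage sketch ( not tied to a particular architecture file ):
// call NO_DENORMALIZE once before a DSP processing loop so that underflowing
// feedback paths flush to zero instead of producing costly denormals, e.g.
//
//   NO_DENORMALIZE;
//   // ... vectorized compute loop ...
//   DENORMALIZE;   // optionally restore the IEEE 754 behaviour afterwards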



// constants

// 0 0 0 0
#define VEC_INT_ZERO(a) _mm_xor_si128(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_INT_ONES(a) _mm_cmpeq_epi32(a,a)

// Example: 2^10 - 1 = 1023 -> VEC_INT_PW2_MINUS_1(a,10)
#define VEC_INT_PW2_MINUS_1(a,pw) _mm_srli_epi32(_mm_cmpeq_epi32(a,a),32-pw)

// 1 1 1 1: particular case
#define VEC_INT_ONE(a) _mm_srli_epi32(_mm_cmpeq_epi32(a,a),31)

// Example: 2^10 = 1024 -> VEC_INT_PW2(a,10)
#define VEC_INT_PW2(a,pw) _mm_slli_epi32(_mm_srli_epi32(_mm_cmpeq_epi32(a,a),31),pw)

// Example: -2^10 = -1024 -> VEC_INT_MINUS_PW2(a,10)
#define VEC_INT_MINUS_PW2(a,pw) _mm_slli_epi32(_mm_cmpeq_epi32(a,a),pw)

// -1 -1 -1 -1: particular case
#define VEC_INT_MINUS_ONE(a) _mm_cmpeq_epi32(a,a)

// 0.0 0.0 0.0 0.0
#define VEC_FLOAT_ZERO(a) _mm_xor_ps(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_FLOAT_ONES(a) _mm_cmpeq_ps(a,a)


// conversions between int vectors and float vectors
inline vec_int float2int( vec_float a) { return _mm_cvtps_epi32(a); }
inline int float2int( float a ) { return int(a); }

inline vec_float int2float( vec_int a) { return _mm_cvtepi32_ps(a); }
inline float int2float( int a ) { return float(a); }


// arithmetic
inline vec_float add_vec( vec_float a, vec_float b) { return _mm_add_ps(a,b); }
inline vec_float add_vec( vec_int a, vec_float b) { return _mm_add_ps(int2float(a),b); }
inline vec_float add_vec( vec_float a, vec_int b) { return _mm_add_ps(a,int2float(b)); }

inline vec_int add_vec( vec_int a, vec_int b) { return _mm_add_epi32(a,b); }


inline vec_float add_scal( vec_float a, vec_float b) { return _mm_add_ss(a,b); }
inline vec_int add_scal( vec_int a, vec_int b) { return _mm_add_epi32(a,b); } // _mm_add_pi32 in MMX
//inline scal_int add_scal( scal_int a, scal_int b) { return _mm_add_pi32(a,b); }


inline vec_float sub_vec( vec_float a, vec_float b) { return _mm_sub_ps(a,b); }
inline vec_int sub_vec( vec_int a, vec_int b) { return _mm_sub_epi32(a,b); }
inline vec_float sub_scal( vec_float a, vec_float b) { return _mm_sub_ss(a,b); }
inline vec_int sub_scal( vec_int a, vec_int b) { return _mm_sub_epi32(a,b); } // _mm_sub_pi32 in MMX
//inline scal_int sub_scal( scal_int a, scal_int b) { return _mm_sub_pi32(a,b); }


inline vec_float mul_vec( vec_float a, vec_float b) { return _mm_mul_ps(a,b); }
inline vec_float mul_vec( vec_int a, vec_float b) { return _mm_mul_ps(int2float(a),b); }
inline vec_float mul_vec( vec_float a, vec_int b) { return _mm_mul_ps(a,int2float(b)); }

inline vec_float mul_scal( vec_float a, vec_float b) { return _mm_mul_ss(a,b); }

// INTEGER MULTIPLICATION
// low 32 bits of a 32 * 32 bit multiplication: each double-word X and Y is broken down into two words, A & B and C & D:
// X = ( A << 16 ) + B
// Y = ( C << 16 ) + D
// then:
// X * Y = (( A << 16 ) + B ) * (( C << 16 ) + D )
// X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
// the partial result A*C does not appear in the low 32 bits of the result, so it does not need to be computed
// ( however, if it is different from zero, there is an overflow )
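// Worked example of the decomposition above: with X = 0x00012345 and Y = 0x00000100,
// A = 0x0001, B = 0x2345, C = 0x0000, D = 0x0100, so
// X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
//       = 0 + 0x01000000 + 0 + 0x00234500 = 0x01234500,
// which is indeed the low 32 bits of 0x12345 * 0x100.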

inline vec_int mul_vec( vec_int a, vec_int b) {

    vec_int temp0 = _mm_shufflehi_epi16( _mm_shufflelo_epi16( b, 0xB1), 0xB1);
    vec_int temp1 = _mm_and_si128( b, _mm_srli_epi32( _mm_cmpeq_epi32( b,b), 16));

    vec_int temp2 = _mm_madd_epi16( a, temp0);
    vec_int temp3 = _mm_madd_epi16( a, temp1);

    vec_int temp4 = _mm_slli_epi32( temp2, 16);

    return _mm_add_epi32( temp4, temp3);
}

inline vec_int mul_scal( vec_int a, vec_int b) {

    vec_int temp0 = _mm_shufflelo_epi16( b, 0xB1);
    vec_int temp1 = _mm_and_si128( b, _mm_cvtsi32_si128(0x00ff));

    vec_int temp2 = _mm_madd_epi16( a, temp0);
    vec_int temp3 = _mm_madd_epi16( a, temp1);

    vec_int temp4 = _mm_slli_epi32( temp2, 16);

    return _mm_add_epi32( temp4, temp3);
}


inline vec_float div_vec( vec_float a, vec_float b) { return _mm_mul_ps(a,_mm_rcp_ps(b)); /*_mm_div_ps(a,b);*/ }
inline vec_int div_vec( vec_int a, vec_int b) { return _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(a),_mm_rcp_ps(_mm_cvtepi32_ps(b)))); } // TO BE CHANGED !!!!
inline vec_float div_scal( vec_float a, vec_float b) { return _mm_mul_ss(a,_mm_rcp_ss(b)); /*_mm_div_ss(a,b);*/ }
inline vec_int div_scal( vec_int a, vec_int b) { return _mm_cvtps_epi32(_mm_mul_ss(_mm_cvtepi32_ps(a),_mm_rcp_ss(_mm_cvtepi32_ps(b)))); } // TO BE CHANGED !!!!!
//inline scal_int div_scal( scal_int a, scal_int b) { return _mm_cvtsi32_si64((_mm_cvtsi64_si32(a))/(_mm_cvtsi64_si32(b))); } // TO BE CHANGED !!!!!
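
// Note on the divisions above: _mm_rcp_ps / _mm_rcp_ss only compute an
// approximate reciprocal ( roughly 12 bits of precision ), so these versions
// trade accuracy for speed; the commented-out _mm_div_ps / _mm_div_ss calls
// are the exact but slower alternative.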


inline vec_int mod_vec( vec_int a, vec_int N) {

    vec_int temp   = _mm_sub_epi32(a,N);
    vec_int zero   = _mm_xor_si128(a,a);

    vec_int select = _mm_xor_si128( _mm_cmpgt_epi32(temp,zero), _mm_cmpeq_epi32(temp,zero)); // a - N >= 0

    return _mm_or_si128(_mm_and_si128(select,temp),_mm_andnot_si128(select,a)); // if( a - N >= 0 ) return a - N; else return a;
}

inline vec_int mod_scal( vec_int a, vec_int N) {

    vec_int temp   = _mm_sub_epi32(a,N);
    vec_int zero   = _mm_xor_si128(a,a);

    vec_int select = _mm_xor_si128( _mm_cmpgt_epi32(temp,zero), _mm_cmpeq_epi32(temp,zero)); // a - N >= 0

    return _mm_or_si128(_mm_and_si128(select,temp),_mm_andnot_si128(select,a)); // if( a - N >= 0 ) return a - N; else return a;
}
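
// Note: mod_vec / mod_scal perform a single conditional subtraction
// ( a - N when a >= N, a otherwise ); they behave as a true modulo only
// when 0 <= a < 2*N, which is presumably the intended use ( e.g. wrapping
// an index that has just been incremented past N ).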



// simulation of a*b + c
#define madd_vec(a,b,c) add_vec(mul_vec(a,b),c)
#define madd_scal(a,b,c) add_scal(mul_scal(a,b),c)

//inline vec_float madd_scal( vec_float a, vec_float b, vec_float c) { return _mm_add_ss(_mm_mul_ss(a,b),c); }
//inline vec_int madd_vec( vec_int a, vec_int b, vec_int c) { return add_vec(mul_vec(a,b),c); }
//inline vec_int madd_scal( vec_int a, vec_int b, vec_int c) { return add_scal(mul_scal(a,b),c); }


// simulation of - ( a*b - c )
//inline vec_float nmsub_vec( vec_float a, vec_float b, vec_float c) { }

// simulation of a*(1/b) + c
inline vec_float divadd_vec( vec_float a, vec_float b, vec_float c) { return _mm_add_ps(_mm_mul_ps(a,_mm_rcp_ps(b)),c); }
inline vec_float divadd_scal( vec_float a, vec_float b, vec_float c) { return _mm_add_ss(_mm_mul_ss(a,_mm_rcp_ss(b)),c); }
// simulation of - ( a*(1/b) - c )
//inline vec_float divsub_vec( vec_float a, vec_float b, vec_float c) { }



// shift ( and fill with 0's )
inline vec_int shift_left_vec( vec_int a, vec_int num) { return _mm_sll_epi32(a,num); }
inline vec_int shift_left_vec( vec_int a, int num) { return _mm_slli_epi32(a,num); }
inline vec_int shift_left_scal( vec_int a, vec_int num) { return _mm_sll_epi32(a,num); } // _mm_sll_pi32(a,num) in MMX
//inline scal_int shift_left_scal( scal_int a, scal_int num) { return _mm_sll_pi32(a,num); }
inline vec_int shift_left_scal( vec_int a, int num) { return _mm_slli_epi32(a,num); } // _mm_slli_pi32(a,num) in MMX
//inline scal_int shift_left_scal( scal_int a, int num) { return _mm_slli_pi32(a,num); }


// shift ( and fill with the sign bit )
inline vec_int shift_right_vec( vec_int a, vec_int num) { return _mm_sra_epi32(a,num); }
inline vec_int shift_right_vec( vec_int a, int num) { return _mm_srai_epi32(a,num); }
inline vec_int shift_right_scal( vec_int a, vec_int num) { return _mm_sra_epi32(a,num); } // _mm_sra_pi32(a,num) in MMX
//inline scal_int shift_right_scal( scal_int a, scal_int num) { return _mm_sra_pi32(a,num); }
inline vec_int shift_right_scal( vec_int a, int num) { return _mm_srai_epi32(a,num); } // _mm_srai_pi32(a,num) in MMX
//inline scal_int shift_right_scal( scal_int a, int num) { return _mm_srai_pi32(a,num); }


// shift ( and fill with 0's )
inline vec_int shift_right_vec_logical( vec_int a, vec_int num) { return _mm_srl_epi32(a,num); }
inline vec_int shift_right_vec_logical( vec_int a, int num) { return _mm_srli_epi32(a,num); }
inline vec_int shift_right_scal_logical( vec_int a, vec_int num) { return _mm_srl_epi32(a,num); } // _mm_srl_pi32(a,num) in MMX
//inline scal_int shift_right_scal_logical( scal_int a, scal_int num) { return _mm_srl_pi32(a,num); }
inline vec_int shift_right_scal_logical( vec_int a, int num) { return _mm_srli_epi32(a,num); } // _mm_srli_pi32(a,num) in MMX
//inline scal_int shift_right_scal_logical( scal_int a, int num) { return _mm_srli_pi32(a,num); }

// Logic
// YO additions; removed
//inline vec_float and_vec( vec_float a, vec_int b) { return _mm_and_ps(a,b); }
//inline vec_float and_vec( vec_int a, vec_float b) { return _mm_and_ps(a,b); }

inline vec_float and_vec( vec_float a, vec_float b) { return _mm_and_ps(a,b); }
inline vec_int and_vec( vec_int a, vec_int b) { return _mm_and_si128(a,b); }
inline vec_float and_scal( vec_float a, vec_float b) { return _mm_and_ps(a,b); }
inline vec_int and_scal( vec_int a, vec_int b) { return _mm_and_si128(a,b); } // _mm_and_si64(a,b) in MMX
//inline scal_int and_scal( scal_int a, scal_int b) { return _mm_and_si64(a,b); }


inline vec_float or_vec( vec_float a, vec_float b) { return _mm_or_ps(a,b); }
inline vec_int or_vec( vec_int a, vec_int b) { return _mm_or_si128(a,b); }
inline vec_float or_scal( vec_float a, vec_float b) { return _mm_or_ps(a,b); }
inline vec_int or_scal( vec_int a, vec_int b) { return _mm_or_si128(a,b); } // _mm_or_si64(a,b) in MMX
//inline scal_int or_scal( scal_int a, scal_int b) { return _mm_or_si64(a,b); }


inline vec_float xor_vec( vec_float a, vec_float b) { return _mm_xor_ps(a,b); }
inline vec_int xor_vec( vec_int a, vec_int b) { return _mm_xor_si128(a,b); }
inline vec_float xor_scal( vec_float a, vec_float b) { return _mm_xor_ps(a,b); }
inline vec_int xor_scal( vec_int a, vec_int b) { return _mm_xor_si128(a,b); } // _mm_xor_si64(a,b) in MMX
//inline scal_int xor_scal( scal_int a, scal_int b) { return _mm_xor_si64(a,b); }

//------------------------------------------------------------------------------------------------------------
// YO: replaced inline vec_float with inline vec_int in the comparison operations between vec_float,
// for better compatibility with vector compilation
//------------------------------------------------------------------------------------------------------------

// cast (without conversion)
inline vec_float cast2vec_float(vec_int x) { return _mm_castsi128_ps(x); }
inline vec_int cast2vec_int(vec_float x) { return _mm_castps_si128(x); }

// conversions
inline vec_float conv2vec_float(vec_int x) { return _mm_cvtepi32_ps(x); }
inline vec_int conv2vec_int(vec_float x) { return _mm_cvtps_epi32(x); }


// comparison
//inline vec_float int2float( vec_int a) { return _mm_cvtepi32_ps(a); }


inline vec_float gt_vec( vec_float a, vec_float b) { return _mm_cmpgt_ps(a,b); }
inline vec_float gt_vec( vec_int a, vec_float b) { return _mm_cmpgt_ps(_mm_cvtepi32_ps(a),b); }
inline vec_float gt_vec( vec_float a, vec_int b) { return _mm_cmpgt_ps(a,_mm_cvtepi32_ps(b)); }
inline vec_int gt_vec( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); }

inline vec_float gt_scal( vec_float a, vec_float b) { return _mm_cmpgt_ps(a,b); }
inline vec_float gt_scal( vec_int a, vec_float b) { return _mm_cmpgt_ps(_mm_cvtepi32_ps(a),b); }
inline vec_float gt_scal( vec_float a, vec_int b) { return _mm_cmpgt_ps(a,_mm_cvtepi32_ps(b)); }
inline vec_int gt_scal( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); }

// choose between two values: choose(c,u,v) = c?u:v
// the type of the result depends on the types of u and v, not on the type of c

inline vec_float choose(vec_float c, vec_float u, vec_float v) { return _mm_or_ps(_mm_and_ps(c,u), _mm_andnot_ps(c,v)); }
inline vec_float choose(vec_float c, vec_int u, vec_float v) { return _mm_or_ps(_mm_and_ps(c,_mm_cvtepi32_ps(u)), _mm_andnot_ps(c,v)); }
inline vec_float choose(vec_float c, vec_float u, vec_int v) { return _mm_or_ps(_mm_and_ps(c,u), _mm_andnot_ps(c,_mm_cvtepi32_ps(v))); }

inline vec_float choose(vec_int c, vec_float u, vec_float v) { return choose(cast2vec_float(c), u, v); }
inline vec_float choose(vec_int c, vec_int u, vec_float v) { return choose(cast2vec_float(c), u, v); }
inline vec_float choose(vec_int c, vec_float u, vec_int v) { return choose(cast2vec_float(c), u, v); }

inline vec_int choose(vec_int c, vec_int u, vec_int v) { return _mm_or_si128(_mm_and_si128(c,u), _mm_andnot_si128(c,v)); }
inline vec_int choose(vec_float c, vec_int u, vec_int v) { return choose(cast2vec_int(c), u, v); }
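
// Example: with a full comparison mask c = gt_vec(x,y) ( each lane is 0xffffffff or 0 ),
// choose(c,u,v) yields u in the lanes where x > y and v in the others.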

// choose between two values: choosezero(c,u) = c?u:0
inline vec_float choosezero(vec_float c, vec_float u) { return _mm_and_ps(c,u); }
inline vec_float choosezero(vec_int c, vec_float u) { return choosezero(cast2vec_float(c), u); }

inline vec_int choosezero(vec_int c, vec_int u) { return _mm_and_si128(c,u); }
inline vec_int choosezero(vec_float c, vec_int u) { return choosezero(cast2vec_int(c), u); }


//inline vec_int gt_vec( vec_float a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ps(a,b),31); }
//inline vec_int gt_vec( vec_float a, vec_float b) { vec_univ v; v.f4 = _mm_cmpgt_ps(a,b); return _mm_srli_epi32(v.i4,31); }
//inline vec_int gt_vec( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); }
//inline vec_int gt_scal( vec_float a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
//inline vec_int gt_scal( vec_float a, vec_float b) { vec_univ v; v.f4 = _mm_cmpgt_ss(a,b); return _mm_srli_epi32(v.i4,31); }
//inline vec_int gt_scal( vec_float a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
//inline vec_int gt_scal( vec_int a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
//inline vec_int gt_scal( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); } // _mm_cmpgt_pi32(a,b) in MMX

//inline scal_int gt_scal( scal_int a, scal_int b) { return (__m128i) _mm_cmpgt_pi32(a,b); }

#if 0

inline vec_int lt_vec( vec_float a, vec_float b) { return _mm_cmplt_ps(a,b); }
inline vec_int lt_vec( vec_int a, vec_int b) { return _mm_cmpgt_epi32(b,a); }
inline vec_int lt_scal( vec_float a, vec_float b) { return _mm_cmplt_ss(a,b); }
inline vec_int lt_scal( vec_int a, vec_int b) { return _mm_cmpgt_epi32(b,a); } // _mm_cmpgt_pi32(b,a) in MMX
//inline scal_int lt_scal( scal_int a, scal_int b) { return _mm_cmpgt_pi32(b,a); }



inline vec_int ge_vec( vec_float a, vec_float b) { return _mm_cmpge_ps(a,b); }
inline vec_int ge_vec( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(a,b), _mm_cmpeq_epi32(a,b)); }
inline vec_int ge_scal( vec_float a, vec_float b) { return _mm_cmpge_ss(a,b); }
inline vec_int ge_scal( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(a,b), _mm_cmpeq_epi32(a,b)); } // _mm_xor_si64,_mm_cmpgt_pi32,_mm_cmpeq_pi32 in MMX
//inline scal_int ge_scal( scal_int a, scal_int b) { return _mm_xor_si64( _mm_cmpgt_pi32(a,b),_mm_cmpeq_pi32(a,b)); }


inline vec_int le_vec( vec_float a, vec_float b) { return _mm_cmple_ps(a,b); }
inline vec_int le_vec( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(b,a), _mm_cmpeq_epi32(b,a)); }
inline vec_int le_scal( vec_float a, vec_float b) { return _mm_cmple_ss(a,b); }
inline vec_int le_scal( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(b,a), _mm_cmpeq_epi32(b,a)); } // _mm_xor_si64,_mm_cmpgt_pi32,_mm_cmpeq_pi32 in MMX
//inline scal_int le_scal( scal_int a, scal_int b) { return _mm_xor_si64( _mm_cmpgt_pi32(b,a),_mm_cmpeq_pi32(b,a)); }


inline vec_int eq_vec( vec_float a, vec_float b) { return _mm_cmpeq_ps(a,b); }
inline vec_int eq_vec( vec_int a, vec_int b) { return _mm_cmpeq_epi32(a,b); }
inline vec_int eq_scal( vec_float a, vec_float b) { return _mm_cmpeq_ss(a,b); }
inline vec_int eq_scal( vec_int a, vec_int b) { return _mm_cmpeq_epi32(a,b); } // _mm_cmpeq_pi32(a,b) in MMX
//inline scal_int eq_scal( scal_int a, scal_int b) { return _mm_cmpeq_pi32(a,b); }


inline vec_int neq_vec( vec_float a, vec_float b) { return _mm_cmpneq_ps(a,b); }
inline vec_int neq_vec( vec_int a, vec_int b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), _mm_cmpeq_epi32(a,a)); }
inline vec_int neq_scal( vec_float a, vec_float b) { return _mm_cmpneq_ss(a,b); }
inline vec_int neq_scal( vec_int a, vec_int b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), _mm_cmpeq_epi32(a,a)); } // _mm_andnot_si64,_mm_cmpeq_pi32 in MMX
//inline scal_int neq_scal( scal_int a, scal_int b) { return _mm_andnot_si64(_mm_cmpeq_pi32(a,b),SCAL_INT_ALL_ONE); }

#endif


// memory

#if 0
inline vec_float set_vec( double a) { float val = float(a); vec_float temp = _mm_load_ss(&val); return _mm_shuffle_ps(temp,temp,0x00); }
inline vec_float set_vec( float a) { float val = a; vec_float temp = _mm_load_ss(&val); return _mm_shuffle_ps(temp,temp,0x00); }
inline vec_int set_vec( long int a) { vec_int temp = _mm_cvtsi32_si128(int(a)); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
inline vec_int set_vec( int a) { vec_int temp = _mm_cvtsi32_si128(a); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
inline vec_int set_vec( short a) { vec_int temp = _mm_cvtsi32_si128(int(a)); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
//inline scal_int set_vec( long int a) { _mm_cvtsi32_si64(int(a)); }
//inline scal_int set_vec( int a) { _mm_cvtsi32_si64(a); }
//inline scal_int set_vec( short a) { _mm_cvtsi32_si64(int(a)); }
#endif

#if 0

inline vec_float set_vec( double a, double b, double c, double d) { __vec_float temp; temp.s[0]=float(a); temp.s[1]=float(b); temp.s[2]=float(c); temp.s[3]=float(d); return temp.v; }
inline vec_float set_vec( float a, float b, float c, float d) { __vec_float temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
inline vec_int set_vec( int a, int b, int c, int d) { __vec_int temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
inline vec_int set_vec( long int a, long int b, long int c, long int d) { __vec_int temp; temp.s[0]=int(a); temp.s[1]=int(b); temp.s[2]=int(c); temp.s[3]=int(d); return temp.v; }
inline vec_int set_vec( short a, short b, short c, short d) { __vec_int temp; temp.s[0]=short(a); temp.s[1]=short(b); temp.s[2]=short(c); temp.s[3]=short(d); return temp.v; }

#endif

inline vec_float set_vec( float a, float b, float c, float d) { return vec_float(a,b,c,d); }
inline vec_int set_vec( int a, int b, int c, int d) { return vec_int(a,b,c,d); }

inline vec_float set_vec( float a) { return vec_float(a); }
inline vec_int set_vec( int a) { return vec_int(a); }

inline vec_float load_a_vec( float* a) { return _mm_load_ps(a); }
inline vec_int load_a_vec( int* a) { return _mm_load_si128((__m128i*)a); }

inline vec_float load_u_vec( float* a) { return _mm_loadu_ps(a); }
inline vec_int load_u_vec( int* a) { return _mm_loadu_si128((__m128i*)a); }

// new store functions that write without polluting the cache
inline void store_stream( float* a, vec_float b) { return _mm_stream_ps(a,b); }
inline void store_stream( int* a, vec_int b) { return _mm_stream_si128((__m128i*)a,b); }

inline void store_a_vec( float* a, vec_float b) { return _mm_store_ps(a,b); }
inline void store_a_vec( int* a, vec_int b) { return _mm_store_si128((__m128i*)a,b); }

inline void store_u_vec( float* a, vec_float b) { return _mm_storeu_ps(a,b); }
inline void store_u_vec( int* a, vec_int b) { return _mm_storeu_si128((__m128i*)a,b); }


inline vec_float load_scal(float* a) { return _mm_load_ss(a); }
inline vec_int load_scal(int* a) { return _mm_cvtsi32_si128(*a); }
//inline scal_int load_scal(int* a) { return _mm_cvtsi32_si64(*a); }

inline void store_scal(float* a, vec_float content) { return _mm_store_ss(a,content); }
inline void store_scal(int* a, vec_int content) { *a = _mm_cvtsi128_si32(content); return; }
//inline void store_scal(int* a, scal_int content) { *a = _mm_cvtsi64_si32(content); return; }
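
// Illustrative usage of the load/compute/store helpers above, assuming 16-byte
// aligned buffers whose size is a multiple of 4 ( the unaligned load_u_vec /
// store_u_vec variants lift the alignment requirement ):
//
//   for (int i = 0; i < count; i += 4) {
//       vec_float x = load_a_vec(&input[i]);       // load 4 samples
//       vec_float y = mul_vec(x, set_vec(0.5f));   // scale them
//       store_a_vec(&output[i], y);                // write 4 samples back
//   }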



inline vec_float REC0(vec_float a) { return _mm_unpacklo_ps(a,a); }
inline vec_float REC1(vec_float a) { return _mm_unpacklo_ps(a,a); }
inline vec_float REC2(vec_float a) { return _mm_shuffle_ps(a,a,0x90); }
inline vec_float REC3(vec_float a) { return _mm_shuffle_ps(a,a,0x1B); }

inline vec_int REC0(vec_int a) { return _mm_unpacklo_epi32(a,a); }
inline vec_int REC1(vec_int a) { return _mm_unpacklo_epi32(a,a); }
inline vec_int REC2(vec_int a) { return _mm_shuffle_epi32(a,0x90); }
inline vec_int REC3(vec_int a) { return _mm_shuffle_epi32(a,0x1B); }




// scalar to vector: takes 4 vectors whose lowest element holds a scalar value and rebuilds a full vector from these 4 scalars
//inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) { return _mm_shuffle_ps(_mm_shuffle_ps(a0,a1,0x00),_mm_shuffle_ps(a2,a3,0x00),0x88); }
inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) { return _mm_unpacklo_ps(_mm_unpacklo_ps(a0,a2),_mm_unpacklo_ps(a1,a3)); }
inline vec_int SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3) { return _mm_unpacklo_epi32(_mm_unpacklo_epi32(a0,a2),_mm_unpacklo_epi32(a1,a3)); }
//inline vec_int SCAL2VEC(scal_int a0,scal_int a1,scal_int a2,scal_int a3) { return _mm_unpacklo_epi32( _mm_movpi64_epi64(_mm_unpacklo_pi32(a0,a2)), _mm_movpi64_epi64(_mm_unpacklo_pi32(a1,a3)) ); } // or _mm_set_epi64( _mm_unpacklo_pi32(a0,a1), _mm_unpacklo_pi32(a2,a3))

inline vec_float SCAL2VEC(double a0, double a1, double a2, double a3) { return _mm_set_ps(float(a3),float(a2),float(a1),float(a0)); }
inline vec_float SCAL2VEC(float a0, float a1, float a2, float a3) { return _mm_set_ps(a3,a2,a1,a0); }
inline vec_int SCAL2VEC(long a0, long a1, long a2, long a3) { return _mm_set_epi32(int(a3),int(a2),int(a1),int(a0)); }
inline vec_int SCAL2VEC(int a0, int a1, int a2, int a3) { return _mm_set_epi32(a3,a2,a1,a0); }
inline vec_int SCAL2VEC(short a0, short a1, short a2, short a3) { return _mm_set_epi32(int(a3),int(a2),int(a1),int(a0)); }

// vector to scalar: build a scalar vector from one element of the initial vector
inline vec_float VEC2SCALVEC0(vec_float a) { return a; }                        // return x,x,x,a0 // _mm_shuffle_ps(a,a,0x00) would return a0,a0,a0,a0
inline vec_float VEC2SCALVEC1(vec_float a) { return _mm_shuffle_ps(a,a,0x55); } // return a1,a1,a1,a1
inline vec_float VEC2SCALVEC2(vec_float a) { return _mm_shuffle_ps(a,a,0xAA); } // return a2,a2,a2,a2
inline vec_float VEC2SCALVEC3(vec_float a) { return _mm_shuffle_ps(a,a,0xFF); } // return a3,a3,a3,a3

inline vec_int VEC2SCALVEC0(vec_int a) { return a; }                        // return x,x,x,a0 // _mm_shuffle_epi32(a,0x00) would return a0,a0,a0,a0
inline vec_int VEC2SCALVEC1(vec_int a) { return _mm_shuffle_epi32(a,0x55); } // return a1,a1,a1,a1
inline vec_int VEC2SCALVEC2(vec_int a) { return _mm_shuffle_epi32(a,0xAA); } // return a2,a2,a2,a2
inline vec_int VEC2SCALVEC3(vec_int a) { return _mm_shuffle_epi32(a,0xFF); } // return a3,a3,a3,a3

//inline scal_int VEC2SCALVEC0(vec_int a) { return _mm_movepi64_pi64(a); } // WARNING !!!! :
//inline scal_int VEC2SCALVEC1(vec_int a) { __m64 temp = _mm_movepi64_pi64(a); return _mm_unpackhi_pi32(temp,temp); } // VEC2SCALVEC0 and 1 could be merged into a single, more efficient instruction
//inline scal_int VEC2SCALVEC2(vec_int a) { return _mm_movepi64_pi64(_mm_shuffle_epi32(a,0xAA)); }
//inline scal_int VEC2SCALVEC3(vec_int a) { return _mm_movepi64_pi64(_mm_shuffle_epi32(a,0xFF)); }


// vector to scalar: build a single scalar from a vector
inline float VEC2SCAL0(vec_float a) { float temp; _mm_store_ss(&temp,a); return temp; }
inline float VEC2SCAL1(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0x55)); return temp; }
inline float VEC2SCAL2(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0xAA)); return temp; }
inline float VEC2SCAL3(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0xFF)); return temp; }

inline int VEC2SCAL0(vec_int a) { return _mm_cvtsi128_si32(a); }
inline int VEC2SCAL1(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0x55)); }
inline int VEC2SCAL2(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0xAA)); }
inline int VEC2SCAL3(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0xFF)); }
//inline int VEC2SCAL0(scal_int a) { return _mm_cvtsi64_si32(a); }



// select: if( select == 0 ) then a ; else b ;
inline vec_float select_vec( vec_float select, vec_float a, vec_float b) { return _mm_or_ps( _mm_andnot_ps(select,a), _mm_and_ps(select,b)); }
inline vec_float select_scal( vec_float select, vec_float a, vec_float b) { return _mm_or_ps( _mm_andnot_ps(select,a), _mm_and_ps(select,b)); }
inline vec_float select_vec( vec_int select, vec_float a, vec_float b) { __m128 temp = _mm_cvtepi32_ps(select); return _mm_or_ps( _mm_andnot_ps(temp,a), _mm_and_ps(temp,b)); }
inline vec_float select_scal( vec_int select, vec_float a, vec_float b) { __m128 temp = _mm_cvtepi32_ps(select); return _mm_or_ps( _mm_andnot_ps(temp,a), _mm_and_ps(temp,b)); }
inline vec_int select_vec( vec_int select, vec_int a, vec_int b) { return _mm_or_si128( _mm_andnot_si128(select,a), _mm_and_si128(select,b)); }
inline vec_int select_scal( vec_int select, vec_int a, vec_int b) { return _mm_or_si128( _mm_andnot_si128(select,a), _mm_and_si128(select,b)); } // or MMX

// vectorial version of the "mem" Faust keyword
// return a[2] a[1] a[0] b[3]
inline vec_float mem1_vec( vec_float a, vec_float b) { return _mm_shuffle_ps(_mm_shuffle_ps(b,a,0x4E),a,0x99); }
inline vec_int mem1_vec( vec_int a, vec_int b) {
    return _mm_unpacklo_epi32( _mm_shuffle_epi32( _mm_unpacklo_epi32( _mm_shuffle_epi32( b, 0xFF), a), 0xEE), _mm_shuffle_epi32( a, 0x88) );
}

// return a[1] a[0] b[3] b[2]
inline vec_float mem2_vec( vec_float a, vec_float b) { return _mm_shuffle_ps(b,a,0x4E); }
inline vec_int mem2_vec( vec_int a, vec_int b) {
    return _mm_shuffle_epi32( _mm_unpackhi_epi32( b, _mm_shuffle_epi32( a, 0x44)), 0xD8 );
}

// return a[0] b[3] b[2] b[1]
inline vec_float mem3_vec( vec_float a, vec_float b) { return _mm_shuffle_ps(b,_mm_shuffle_ps(b,a,0x4E),0x99); }
inline vec_int mem3_vec( vec_int a, vec_int b) {
    return _mm_unpacklo_epi32( _mm_shuffle_epi32( b, 0x99), _mm_shuffle_epi32( _mm_unpackhi_epi32( b, _mm_shuffle_epi32( a, 0x00)), 0xEE) );
}




// conversion
inline vec_float bool2float( vec_float a ) { return _mm_and_ps(a,set_vec(1.0f)); }
inline vec_float bool2float( vec_int a ) { return _mm_cvtepi32_ps(_mm_and_si128(a,_mm_sub_epi32(_mm_xor_si128(a,a),_mm_cmpeq_epi32(a,a)))); }

inline vec_int bool2int( vec_int a) { return _mm_and_si128(a,_mm_sub_epi32(_mm_xor_si128(a,a),_mm_cmpeq_epi32(a,a))); }
inline vec_int bool2int( vec_float a ) { return _mm_cvtps_epi32(_mm_and_ps(a,set_vec(1.0f))); }

inline vec_int boolfloat2boolint( vec_float a ) { vec_int temp; asm volatile("" : "=xmm" (temp) : "0" (a)); return temp; }
inline vec_float boolint2boolfloat( vec_int a ) { vec_float temp; asm volatile("" : "=xmm" (temp) : "0" (a)); return temp; }
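
// The empty asm statements above only reinterpret the bit pattern of an XMM
// register between its float and int views ( no value conversion takes place );
// with a compiler that supports them, cast2vec_int / cast2vec_float defined
// earlier achieve the same thing through _mm_castps_si128 / _mm_castsi128_ps.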





#elif defined(__ALTIVEC__)


/****************************************************/
/*                                                  */
/*              ALTIVEC implementation              */
/*                                                  */
/****************************************************/


//#define vec_float vector float
//#define vec_int vector signed int
//#define vec_bool vector bool int

struct vec_int
{
    vector signed int vec;
    vec_int() {}
    vec_int(vector signed int m) { vec = m; }
    // operator __m128i() const { return vec; }
};

struct vec_float
{
    // union { __m128 vec; __m128i i4; };
    vector float vec;
    vec_float() {}
    vec_float(vector float m) { vec = m; }
    //vec_float(vec_int a) { vec = _mm_cvtepi32_ps(a); }

    //operator __m128() const { return vec; }
};

typedef union {
    float s[4];
    vec_float v;
} __vec_float;


typedef union {
    int s[4];
    vec_int v;
} __vec_int;


// Non-Java mode: a zero result is returned on underflow.
// Not compatible with the Java-IEEE-C9X standard ( which
// delivers a denormalized result in case of underflow )
#define NO_DENORMALIZE vec_mtvscr(vec_or(vec_mfvscr(),(vector unsigned short)(0x8000)))
#define DENORMALIZE vec_mtvscr(vec_or(vec_mfvscr(),(vector unsigned short)(0x0000)))



// constants

// 0 0 0 0
#define VEC_INT_ZERO(a) vec_xor(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_INT_ONES(a) (vector signed int)vec_cmpeq(a,a)

// 1 1 1 1
#define VEC_INT_ONE(a) vec_splat_s32(1)

// -1 -1 -1 -1
#define VEC_INT_MINUS_ONE(a) vec_splat_s32(-1)

// a must belong to [-16,15]
// no efficient equivalent with SSE2
#define VEC_INT_MINUS_16_TO_15(a) vec_splat_s32(a)

// This is not exactly equivalent to the SSE2 version:
// the power must belong to [17,31]
// ( that is, 2^17 - 1 = 131071 is the minimum
// and 2^31 - 1 = 2147483647 is the maximum;
// if you need 2^32 - 1, use VEC_INT_ONES )
// Example: 2^19 - 1 = 524287 -> VEC_INT_PW2_MINUS_1(a,19)
#define VEC_INT_PW2_MINUS_1(a,pw) vec_sr((vector signed int)vec_cmpeq(a,a), vec_splat_u32(32-pw))


// This is not exactly equivalent to the SSE2 version:
// the power must belong to [4,18]
// ( that is, 2^18 = 262144 is the maximum
// and 2^4 = 16 is the minimum;
// if you need 2^0 = 1, use VEC_INT_ONE,
// if you need 2^1 = 2, use VEC_INT_MINUS_16_TO_15(2)
// if you need 2^2 = 4, use VEC_INT_MINUS_16_TO_15(4)
// if you need 2^3 = 8, use VEC_INT_MINUS_16_TO_15(8) )
// Example: 2^10 = 1024 -> VEC_INT_PW2(a,10)
#define VEC_INT_PW2(a,pw) vec_sl(vec_splat_s32(8), vec_splat_u32(pw-3))


//vec_sr(a,(vector unsigned int)num);

// 0.0 0.0 0.0 0.0
#define VEC_FLOAT_ZERO(a) vec_xor(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_FLOAT_ONES(a) (vector float)vec_cmpeq(a,a)



// arithmetic
inline vec_float add_vec( vec_float a, vec_float b) { return vec_add(a,b); }
inline vec_float add_scal( vec_float a, vec_float b) { return vec_add(a,b); }
inline vec_int add_vec( vec_int a, vec_int b) { return vec_add(a,b); }
inline vec_int add_scal( vec_int a, vec_int b) { return vec_add(a,b); }


inline vec_float sub_vec( vec_float a, vec_float b) { return vec_sub(a,b); }
inline vec_float sub_scal( vec_float a, vec_float b) { return vec_sub(a,b); }
inline vec_int sub_vec( vec_int a, vec_int b) { return vec_sub(a,b); }
inline vec_int sub_scal( vec_int a, vec_int b) { return vec_sub(a,b); }


inline vec_float mul_vec( vec_float a, vec_float b) { return vec_madd(a,b,(vec_float)(vec_splat_s32(int(0x00000000)))); }
inline vec_float mul_scal( vec_float a, vec_float b) { return vec_madd(a,b,(vec_float)(vec_splat_s32(int(0x00000000)))); }


// low 32 bits of a 32 * 32 bit multiplication: each double-word X and Y is broken down into two words, A & B and C & D:
// X = ( A << 16 ) + B
// Y = ( C << 16 ) + D
// then:
// X * Y = (( A << 16 ) + B ) * (( C << 16 ) + D )
// X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
// the partial result A*C does not appear in the low 32 bits of the result, so it does not need to be computed ( however, if it is
// different from zero, there is an overflow )
// In this implementation A*D + B*C is computed in a single "vec_msum"

inline vec_int mul_vec( vec_int a, vec_int b) {
    const vector unsigned int VEC_SIXTEEN_UINT32 = vec_splat_u32(-16);
    return (vector signed int)vec_add( vec_sl( vec_msum( (vector unsigned short)a, (vector unsigned short)(vec_rl( b, VEC_SIXTEEN_UINT32 ) ), vec_splat_u32(0) ), VEC_SIXTEEN_UINT32 ), vec_mulo( (vector unsigned short)a, (vector unsigned short)b ) );
}

inline vec_int mul_scal( vec_int a, vec_int b) {
    const vector unsigned int VEC_SIXTEEN_UINT32 = vec_splat_u32(-16);
    return (vector signed int)vec_add( vec_sl( vec_msum( (vector unsigned short)a, (vector unsigned short)(vec_rl( b, VEC_SIXTEEN_UINT32 ) ), vec_splat_u32(0) ), VEC_SIXTEEN_UINT32 ), vec_mulo( (vector unsigned short)a, (vector unsigned short)b ) );
}


//inline vec_int mul_vec( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),(vec_float)(b),(vec_float)(vec_splat_s32(int(0x00000000))))); }
//inline vec_int mul_scal( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),(vec_float)(b),(vec_float)(vec_splat_s32(int(0x00000000))))); }



inline vec_float div_vec( vec_float a, vec_float b) { return vec_madd(a,vec_re(b),(vec_float)(vec_splat_s32(int(0x00000000)))); }
inline vec_float div_scal( vec_float a, vec_float b) { return vec_madd(a,vec_re(b),(vec_float)(vec_splat_s32(int(0x00000000)))); }
inline vec_int div_vec( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),vec_re((vec_float)(b)),(vec_float)(vec_splat_s32(int(0x00000000))))); }
inline vec_int div_scal( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),vec_re((vec_float)(b)),(vec_float)(vec_splat_s32(int(0x00000000))))); }

inline vec_int mod_vec( vec_int a, vec_int N) {

    vec_int temp = vec_sub(a,N);
    vec_int zero = vec_splat_s32(int(0x00000000));

    vector bool int select = (vector bool int)(vec_xor(vec_cmpgt(temp,zero),vec_cmpeq(temp,zero))); // a - N >= 0

    return vec_sel(a,temp,select); // if( a - N >= 0 ) return a - N; else return a;
}

inline vec_int mod_scal( vec_int a, vec_int N) {

    vec_int temp = vec_sub(a,N);
    vec_int zero = vec_splat_s32(int(0x00000000));

    vector bool int select = (vector bool int)(vec_xor(vec_cmpgt(temp,zero),vec_cmpeq(temp,zero))); // a - N >= 0

    return vec_sel(a,temp,select); // if( a - N >= 0 ) return a - N; else return a;
}




// return a*b + c
inline vec_float madd_vec( vec_float a, vec_float b, vec_float c) { return vec_madd(a,b,c); }
inline vec_float madd_scal( vec_float a, vec_float b, vec_float c) { return vec_madd(a,b,c); }

// return - ( a*b - c )
inline vec_float nmsub_vec( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,b,c); }
inline vec_float nmsub_scal( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,b,c); }


// return a*(1/b) + c
inline vec_float divadd_vec( vec_float a, vec_float b, vec_float c) { return vec_madd(a,vec_re(b),c); }
inline vec_float divadd_scal( vec_float a, vec_float b, vec_float c) { return vec_madd(a,vec_re(b),c); }


// return - ( a*(1/b) - c )
inline vec_float divsub_vec( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,vec_re(b),c); }
inline vec_float divsub_scal( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,vec_re(b),c); }



// logic
inline vec_float and_vec( vec_float a, vec_float b) { return vec_and(a,b); }
inline vec_float and_scal( vec_float a, vec_float b) { return vec_and(a,b); }
inline vec_int and_vec( vec_int a, vec_int b) { return vec_and(a,b); }
inline vec_int and_scal( vec_int a, vec_int b) { return vec_and(a,b); }

inline vec_float or_vec( vec_float a, vec_float b) { return vec_or(a,b); }
inline vec_float or_scal( vec_float a, vec_float b) { return vec_or(a,b); }
inline vec_int or_vec( vec_int a, vec_int b) { return vec_or(a,b); }
inline vec_int or_scal( vec_int a, vec_int b) { return vec_or(a,b); }

inline vec_float xor_vec( vec_float a, vec_float b) { return vec_xor(a,b); }
inline vec_float xor_scal( vec_float a, vec_float b) { return vec_xor(a,b); }
inline vec_int xor_vec( vec_int a, vec_int b) { return vec_xor(a,b); }
inline vec_int xor_scal( vec_int a, vec_int b) { return vec_xor(a,b); }



// shift left
inline vec_int shift_left_vec( vec_int a, vec_int num) { return vec_sl(a,(vector unsigned int)num); }
inline vec_int shift_left_scal( vec_int a, vec_int num) { return vec_sl(a,(vector unsigned int)num); }

// shift ( and fill with the sign bit )
inline vec_int shift_right_vec( vec_int a, vec_int num) { return vec_sra(a,(vector unsigned int)num); }
inline vec_int shift_right_scal( vec_int a, vec_int num) { return vec_sra(a,(vector unsigned int)num); }

// shift ( and fill with 0's )
//inline vec_int shift_right_vec_logical( vec_int a, int num) { return vec_sr(a, set_vec(num) ); }
//inline vec_int shift_right_scal_logical( vec_int a, int num) { return vec_sr(a, set_vec(num) ); }
inline vec_int shift_right_vec_logical( vec_int a, vec_int num) { return vec_sr(a,(vector unsigned int)num); }
inline vec_int shift_right_scal_logical( vec_int a, vec_int num) { return vec_sr(a,(vector unsigned int)num); }



// comparison
inline vec_float gt_vec( vec_float a, vec_float b) { return (vector float)vec_cmpgt(a,b); }
inline vec_float gt_scal( vec_float a, vec_float b) { return (vector float)vec_cmpgt(a,b); }
inline vec_int gt_vec( vec_int a, vec_int b) { return (vector signed int)vec_cmpgt(a,b); }
inline vec_int gt_scal( vec_int a, vec_int b) { return (vector signed int)vec_cmpgt(a,b); }


inline vec_float lt_vec( vec_float a, vec_float b) { return (vector float)vec_cmplt(a,b); }
inline vec_float lt_scal( vec_float a, vec_float b) { return (vector float)vec_cmplt(a,b); }
inline vec_int lt_vec( vec_int a, vec_int b) { return (vector signed int)vec_cmplt(a,b); }
inline vec_int lt_scal( vec_int a, vec_int b) { return (vector signed int)vec_cmplt(a,b); }


inline vec_float ge_vec( vec_float a, vec_float b) { return (vector float)vec_cmpge(a,b); }
inline vec_float ge_scal( vec_float a, vec_float b) { return (vector float)vec_cmpge(a,b); }
inline vec_int ge_vec( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpgt(a,b),vec_cmpeq(a,b)); }
inline vec_int ge_scal( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpgt(a,b),vec_cmpeq(a,b)); }


inline vec_float le_vec( vec_float a, vec_float b) { return (vector float)vec_cmple(a,b); }
inline vec_float le_scal( vec_float a, vec_float b) { return (vector float)vec_cmple(a,b); }
inline vec_int le_vec( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmplt(a,b),vec_cmpeq(a,b)); }
inline vec_int le_scal( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmplt(a,b),vec_cmpeq(a,b)); }


inline vec_float eq_vec( vec_float a, vec_float b) { return (vector float)vec_cmpeq(a,b); }
inline vec_float eq_scal( vec_float a, vec_float b) { return (vector float)vec_cmpeq(a,b); }
inline vec_int eq_vec( vec_int a, vec_int b) { return (vector signed int)vec_cmpeq(a,b); }
inline vec_int eq_scal( vec_int a, vec_int b) { return (vector signed int)vec_cmpeq(a,b); }


inline vec_float neq_vec( vec_float a, vec_float b) { return (vector float)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
inline vec_float neq_scal( vec_float a, vec_float b) { return (vector float)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
inline vec_int neq_vec( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
inline vec_int neq_scal( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }


// memory
inline vec_float set_vec( vec_float a) { return a; }
inline vec_float set_vec( __vec_float a) { return a.v; }
inline vec_int set_vec( vec_int a) { return a; }
inline vec_int set_vec( __vec_int a) { return a.v; }



inline vec_float set_vec( double a) {
    float af;
    af = (float)a;
    vector float temp; temp = vec_lde(0,&af);
    temp = vec_perm(temp,temp,vec_lvsl(0,&af));
    return vec_splat(temp,0);
    //__vec_float temp; float af = float(a); temp.s[0]=af; temp.s[1]=af; temp.s[2]=af; temp.s[3]=af; return temp.v;
}
inline vec_float set_vec( float a) {
    float af;
    af = a;
    vector float temp; temp = vec_lde(0,&af);
    temp = vec_perm(temp,temp,vec_lvsl(0,&af));
    return vec_splat(temp,0);
    //__vec_float temp; temp.s[0]=a; temp.s[1]=a; temp.s[2]=a; temp.s[3]=a; return temp.v;
}
inline vec_int set_vec( long int a) {
    int ai;
    ai = (int)a;
    vector signed int temp; temp = vec_lde(0,&ai);
    temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
    return vec_splat(temp,0);
    //__vec_int temp; int al = int(a); temp.s[0]=al; temp.s[1]=al; temp.s[2]=al; temp.s[3]=al; return temp.v;
}
inline vec_int set_vec( int a) {
    int ai;
    ai = a;
    vector signed int temp; temp = vec_lde(0,&ai);
    temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
    return vec_splat(temp,0);
    //__vec_int temp; temp.s[0]=a; temp.s[1]=a; temp.s[2]=a; temp.s[3]=a; return temp.v;
}
inline vec_int set_vec( short a) {
    int ai;
    ai = (int)a;
    vector signed int temp; temp = vec_lde(0,&ai);
    temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
    return vec_splat(temp,0);
    //__vec_int temp; int as = int(a); temp.s[0]=as; temp.s[1]=as; temp.s[2]=as; temp.s[3]=as; return temp.v;
}

inline vec_float set_vec( double a, double b, double c, double d) { __vec_float temp; temp.s[0]=float(a); temp.s[1]=float(b); temp.s[2]=float(c); temp.s[3]=float(d); return temp.v; }
inline vec_float set_vec( float a, float b, float c, float d) { __vec_float temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
inline vec_int set_vec( int a, int b, int c, int d) { __vec_int temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
inline vec_int set_vec( long int a, long int b, long int c, long int d) { __vec_int temp; temp.s[0]=int(a); temp.s[1]=int(b); temp.s[2]=int(c); temp.s[3]=int(d); return temp.v; }
inline vec_int set_vec( short a, short b, short c, short d) { __vec_int temp; temp.s[0]=short(a); temp.s[1]=short(b); temp.s[2]=short(c); temp.s[3]=short(d); return temp.v; }


inline vec_float load_a_vec( float* a) { return vec_ld(0,a); }
//inline vec_float load_u_vec( float* a) { }
inline vec_int load_a_vec( int* a) { return vec_ld(0,a); }
//inline vec_int load_u_vec( int* a) { }

inline void store_a_vec( float* a, vec_float b) { return vec_st(b,0,a); }
//inline void store_u_vec( float* a, vec_float b) { }
inline void store_a_vec( int* a, vec_int b) { return vec_st(b,0,a); }
//inline void store_u_vec( int* a, vec_int b) { }


inline vec_float load_scal(float* a) { vector float temp; temp = vec_lde(0,a); return vec_perm(temp,temp,vec_lvsl(0,a)); }
inline vec_int load_scal(int* a) { vector signed int temp; temp = vec_lde(0,a); return vec_perm(temp,temp,vec_lvsl(0,a)); }
inline void store_scal(float* a, vec_float content) { vec_float temp = vec_splat(content,0); return vec_ste(temp,0,a); }
inline void store_scal(int* a, vec_int content) { vec_int temp = vec_splat(content,0); return vec_ste(temp,0,a); }



inline vec_float REC0(vec_float a) { return vec_mergeh(a,a); }
inline vec_float REC1(vec_float a) { return vec_mergeh(a,a); }
inline vec_float REC2(vec_float a) { return vec_sld(vec_splat(a,0),a,12); }
inline vec_float REC3(vec_float a) {
    vector float temp1 = vec_mergel(a,a);
    vector float temp2 = vec_mergeh(a,a);
    return vec_mergel(vec_mergel(temp1,temp2),vec_mergeh(temp1,temp2));
}

inline vec_int REC0(vec_int a) { return vec_mergeh(a,a); }
inline vec_int REC1(vec_int a) { return vec_mergeh(a,a); }
inline vec_int REC2(vec_int a) { return vec_sld(vec_splat(a,0),a,12); }
inline vec_int REC3(vec_int a) {
    vector signed int temp1 = vec_mergel(a,a);
    vector signed int temp2 = vec_mergeh(a,a);
    return vec_mergel(vec_mergel(temp1,temp2),vec_mergeh(temp1,temp2));
}

// scalar to vector: takes 4 vectors whose lowest element holds a scalar value and rebuilds a full vector from these 4 scalars
inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) {
    return vec_perm(vec_mergeh(a0,a1),vec_mergeh(a2,a3),vec_lvsl(8,(float*)(0)));
    //return vec_perm(vec_perm(a0,a1,VEC_PERM0),vec_perm(a2,a3,VEC_PERM0),VEC_PERM1);
}
inline vec_int SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3) {
    return vec_perm(vec_mergeh(a0,a1),vec_mergeh(a2,a3),vec_lvsl(8,(int*)(0)));
    //return vec_perm(vec_perm(a0,a1,VEC_PERM0),vec_perm(a2,a3,VEC_PERM0),VEC_PERM1);
}

inline vec_float SCAL2VEC(float a0, float a1, float a2, float a3) { __vec_float temp; temp.s[0]=a0; temp.s[1]=a1; temp.s[2]=a2; temp.s[3]=a3; return temp.v; }
inline vec_int SCAL2VEC(int a0, int a1, int a2, int a3) { __vec_int temp; temp.s[0]=a0; temp.s[1]=a1; temp.s[2]=a2; temp.s[3]=a3; return temp.v; }

// vector to scalar: build a scalar vector from one element of the initial vector
inline vec_float VEC2SCALVEC0(vec_float a) { return a; }              // return x,x,x,a0 // vec_splat(a,0) would return a0,a0,a0,a0
inline vec_float VEC2SCALVEC1(vec_float a) { return vec_splat(a,1); } // return a1,a1,a1,a1
inline vec_float VEC2SCALVEC2(vec_float a) { return vec_splat(a,2); } // return a2,a2,a2,a2
inline vec_float VEC2SCALVEC3(vec_float a) { return vec_splat(a,3); } // return a3,a3,a3,a3

inline vec_int VEC2SCALVEC0(vec_int a) { return a; }              // return x,x,x,a0 // vec_splat(a,0) would return a0,a0,a0,a0
inline vec_int VEC2SCALVEC1(vec_int a) { return vec_splat(a,1); } // return a1,a1,a1,a1
inline vec_int VEC2SCALVEC2(vec_int a) { return vec_splat(a,2); } // return a2,a2,a2,a2
inline vec_int VEC2SCALVEC3(vec_int a) { return vec_splat(a,3); } // return a3,a3,a3,a3


// vector to scalar: build a single scalar from a vector
inline float VEC2SCAL0(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,0),0,&temp); return temp; }
inline float VEC2SCAL1(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,1),0,&temp); return temp; }
inline float VEC2SCAL2(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,2),0,&temp); return temp; }
inline float VEC2SCAL3(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,3),0,&temp); return temp; }

inline int VEC2SCAL0(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,0),0,&temp); return temp; }
inline int VEC2SCAL1(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,1),0,&temp); return temp; }
inline int VEC2SCAL2(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,2),0,&temp); return temp; }
inline int VEC2SCAL3(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,3),0,&temp); return temp; }



// select: if( select == 0 ) then a ; else b ;
inline vec_float select_vec( vec_float select, vec_float a, vec_float b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_float select_scal( vec_float select, vec_float a, vec_float b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_float select_vec( vec_int select, vec_float a, vec_float b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_float select_scal( vec_int select, vec_float a, vec_float b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_int select_vec( vec_int select, vec_int a, vec_int b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_int select_scal( vec_int select, vec_int a, vec_int b) { return vec_sel(a,b,(vector bool int)select ); }



// vectorial version of the "mem" Faust keyword
// result = { a[2] a[1] a[0] b[3] }
inline vec_float mem1_vec( vec_float a, vec_float b) { return vec_sld(b,a,12); }
inline vec_int mem1_vec( vec_int a, vec_int b) { return vec_sld(b,a,12); }

// result = { a[1] a[0] b[3] b[2] }
inline vec_float mem2_vec( vec_float a, vec_float b) { return vec_sld(b,a,8); }
inline vec_int mem2_vec( vec_int a, vec_int b) { return vec_sld(b,a,8); }

// result = { a[0] b[3] b[2] b[1] }
inline vec_float mem3_vec( vec_float a, vec_float b) { return vec_sld(b,a,4); }
inline vec_int mem3_vec( vec_int a, vec_int b) { return vec_sld(b,a,4); }




// conversion
inline vec_float bool2float( vec_float a ) { return vec_and(a,(vec_float)(vec_splat_s32(int(0x00000001)))); }
inline vec_float bool2float( vec_int a ) { return (vec_float)(vec_and(a,vec_splat_s32(int(0x00000001)))); }

inline vec_int bool2int( vec_int a) { return (vec_int)vec_and(a,vec_splat_s32(int(0x00000001))); }
inline vec_int bool2int( vec_float a ) { return (vec_int)vec_round(vec_and(a,(vec_float)(vec_splat_s32(int(0x00000001))))); }

inline vec_int boolfloat2boolint( vec_float a ) { return (vector signed int)a; }
inline vec_float boolint2boolfloat( vec_int a ) { return (vector float)a; }

inline vec_int float2int( vec_float a) { return (vec_int)vec_round(a); }
inline int float2int( float a ) { return int(a); }

inline vec_float int2float( vec_int a) { return vec_ctf(a,0); }
inline float int2float( int a ) { return float(a); }


#endif