Merge branch 'newtree'
[Faustine.git] / interpretor / preprocessor / faust-0.9.47mr3 / architecture / intrinsic.hh
1 /****************************************************/
2 /* */
3 /* intrinsic.hh: */
4 /* */
5 /* */
6 /* Nicolas Scaringella */
7 /* */
8 /****************************************************/
9
10
11
12 // dans fichiers architectures desormais :
13 // inline void *aligned_calloc(size_t nmemb, size_t size) { return (void*)((unsigned)(calloc((nmemb*size)+15,sizeof(char)))+15 & 0xfffffff0); }
14
15
16
17 #ifdef __SSE2__
18
19 /****************************************************/
20 /* */
21 /* SSE2 implementation */
22 /* */
23 /****************************************************/
24
25
26 #include <mmintrin.h>
27 #include <xmmintrin.h>
28 #include <emmintrin.h>
29 //#include <sse2mmx.h>
30
31
32
// 4-lane 32-bit integer vector wrapping an SSE2 __m128i register.
struct vec_int
{
	__m128i vec;

	// Uninitialized vector.
	vec_int() {}

	// Broadcast a into all four lanes.
	vec_int(int a) { vec = _mm_set_epi32(a,a,a,a); }

	// Build from four lanes; a is lane 0 (lowest), d is lane 3.
	vec_int(int a, int b, int c, int d) { vec = _mm_set_epi32(d,c,b,a); }

	vec_int(__m128i m) { vec = m; }

	// Implicit conversion so a vec_int can be passed directly to _mm_* intrinsics.
	operator __m128i() const { return vec; }

	// Per-lane access, i in [0,3].
	// NOTE(review): casting &vec to int* is classic SSE type punning; compilers
	// accept it for vector types, but it is not strict-aliasing clean.
	const int& operator[](int i)const { int* ip = (int*)&vec; return *(ip+i); }

	int& operator[](int i) { int* ip = (int*)&vec; return *(ip+i); }

};
52
// 4-lane single-precision float vector wrapping an SSE __m128 register.
struct vec_float
{
	__m128 vec;

	// Uninitialized vector.
	vec_float() {}

	// Broadcast a into all four lanes.
	vec_float(float a) { vec = _mm_set_ps1(a); }

	// Build from four lanes; a is lane 0 (lowest), d is lane 3.
	vec_float(float a, float b, float c, float d) { vec = _mm_set_ps(d,c,b,a); }

	vec_float(__m128 m) { vec = m; }

	//vec_float(vec_int vi) { vec = _mm_cvtepi32_ps(vi); }

	// Implicit conversion so a vec_float can be passed directly to _mm_* intrinsics.
	operator __m128() const { return vec; }

	// Per-lane access, i in [0,3] (same type-punning caveat as vec_int).
	const float& operator[](int i)const { float* fp = (float*)&vec; return *(fp+i); }

	float& operator[](int i) { float* fp = (float*)&vec; return *(fp+i); }

};
74
75
// Flush-to-zero mode: on underflow a zero result is returned instead of a
// denormal. Not compatible with IEEE 754 (which delivers denormalized
// results on underflow), but avoids the large performance penalty denormals
// cause in audio DSP feedback loops.
#define NO_DENORMALIZE _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (_MM_FLUSH_ZERO_ON))
#define DENORMALIZE _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (_MM_FLUSH_ZERO_OFF))
81
82
83
84
// Constant generators. Each takes a dummy vector argument `a` (its value is
// irrelevant) and builds the constant from register-only idioms
// (x ^ x == 0, cmpeq(x,x) == all ones), avoiding memory loads.

// 0 0 0 0
#define VEC_INT_ZERO(a) _mm_xor_si128(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_INT_ONES(a) _mm_cmpeq_epi32(a,a)

// All-ones shifted right: 2^pw - 1 in every lane.
// Example: 2^10 - 1 = 1023 -> VEC_INT_PW2_MINUS_1(a,10)
#define VEC_INT_PW2_MINUS_1(a,pw) _mm_srli_epi32(_mm_cmpeq_epi32(a,a),32-pw)

// 1 1 1 1: particular case (all ones >> 31)
#define VEC_INT_ONE(a) _mm_srli_epi32(_mm_cmpeq_epi32(a,a),31)

// 2^pw in every lane. Example: 2^10 = 1024 -> VEC_INT_PW2(a,10)
#define VEC_INT_PW2(a,pw) _mm_slli_epi32(_mm_srli_epi32(_mm_cmpeq_epi32(a,a),31),pw)

// -(2^pw) in every lane. Example: -2^10 = -1024 -> VEC_INT_MINUS_PW2(a,10)
#define VEC_INT_MINUS_PW2(a,pw) _mm_slli_epi32(_mm_cmpeq_epi32(a,a),pw)

// -1 -1 -1 -1: particular case (same bit pattern as VEC_INT_ONES)
#define VEC_INT_MINUS_ONE(a) _mm_cmpeq_epi32(a,a)

// 0.0 0.0 0.0 0.0
#define VEC_FLOAT_ZERO(a) _mm_xor_ps(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_FLOAT_ONES(a) _mm_cmpeq_ps(a,a)
113
114
115
// Conversions between int and float vectors (numeric conversion, not a
// bitwise cast; _mm_cvtps_epi32 rounds per the current MXCSR rounding mode).
inline vec_int float2int( vec_float a) { return _mm_cvtps_epi32(a); }
inline int float2int( float a ) { return int(a); }

inline vec_float int2float( vec_int a) { return _mm_cvtepi32_ps(a); }
inline float int2float( int a ) { return float(a); }




// arithmetic
// Mixed int/float overloads convert the integer operand to float first.
inline vec_float add_vec( vec_float a, vec_float b) { return _mm_add_ps(a,b); }
inline vec_float add_vec( vec_int a, vec_float b) { return _mm_add_ps(int2float(a),b); }
inline vec_float add_vec( vec_float a, vec_int b) { return _mm_add_ps(a,int2float(b)); }

inline vec_int add_vec( vec_int a, vec_int b) { return _mm_add_epi32(a,b); }

// "_scal" variants: only lane 0 of the result is meaningful (float versions
// use the _ss forms; the integer versions still compute all lanes).
inline vec_float add_scal( vec_float a, vec_float b) { return _mm_add_ss(a,b); }
inline vec_int add_scal( vec_int a, vec_int b) { return _mm_add_epi32(a,b); } // _mm_add_pi32 in MMX
//inline scal_int add_scal( scal_int a, scal_int b) { return _mm_add_pi32(a,b); }


inline vec_float sub_vec( vec_float a, vec_float b) { return _mm_sub_ps(a,b); }
inline vec_int sub_vec( vec_int a, vec_int b) { return _mm_sub_epi32(a,b); }
inline vec_float sub_scal( vec_float a, vec_float b) { return _mm_sub_ss(a,b); }
inline vec_int sub_scal( vec_int a, vec_int b) { return _mm_sub_epi32(a,b); } // _mm_sub_pi32 in MMX
//inline scal_int sub_scal( scal_int a, scal_int b) { return _mm_sub_pi32(a,b); }


inline vec_float mul_vec( vec_float a, vec_float b) { return _mm_mul_ps(a,b); }
inline vec_float mul_vec( vec_int a, vec_float b) { return _mm_mul_ps(int2float(a),b); }
inline vec_float mul_vec( vec_float a, vec_int b) { return _mm_mul_ps(a,int2float(b)); }

inline vec_float mul_scal( vec_float a, vec_float b) { return _mm_mul_ss(a,b); }
151
152 // INTEGER MULTIPLICATION
153 // low 32 bits of a 32 * 32 bit multiplication: each double-word X and Y is broken down into two words, A & B and C & D:
154 // X = ( A << 16 ) + B
155 // Y = ( C << 16 ) + D
156 // then:
157 // X * Y = (( A << 16 ) + B ) * (( C << 16 ) + D )
158 // X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
159 // the partial result A*C does not appear in the low 32 bits result so does not need to be computed
160 // ( however, if it's different from zero, then there is an overflow )
161
162 inline vec_int mul_vec( vec_int a, vec_int b) {
163
164 vec_int temp0 = _mm_shufflehi_epi16( _mm_shufflelo_epi16( b, 0xB1), 0xB1);
165 vec_int temp1 = _mm_and_si128( b, _mm_srli_epi32( _mm_cmpeq_epi32( b,b), 16));
166
167 vec_int temp2 = _mm_madd_epi16( a, temp0);
168 vec_int temp3 = _mm_madd_epi16( a, temp1);
169
170 vec_int temp4 = _mm_slli_epi32( temp2, 16);
171
172 return _mm_add_epi32( temp4, temp3);
173 }
174
175 inline vec_int mul_scal( vec_int a, vec_int b) {
176
177 vec_int temp0 = _mm_shufflelo_epi16( b, 0xB1);
178 vec_int temp1 = _mm_and_si128( b, _mm_cvtsi32_si128(0x00ff));
179
180 vec_int temp2 = _mm_madd_epi16( a, temp0);
181 vec_int temp3 = _mm_madd_epi16( a, temp1);
182
183 vec_int temp4 = _mm_slli_epi32( temp2, 16);
184
185 return _mm_add_epi32( temp4, temp3);
186 }
187
188
// Division.
// NOTE: implemented as a * rcp(b) using _mm_rcp_ps, an APPROXIMATE reciprocal
// (~12 bits of precision); the exact-but-slower _mm_div_ps form is kept in
// the trailing comment.
inline vec_float div_vec( vec_float a, vec_float b) { return _mm_mul_ps(a,_mm_rcp_ps(b)); /*_mm_div_ps(a,b);*/ }
// Integer division via the approximate float reciprocal — inexact. // TO BE CHANGED !!!!
inline vec_int div_vec( vec_int a, vec_int b) { return _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(a),_mm_rcp_ps(_mm_cvtepi32_ps(b)))); } // TO BE CHANGED !!!!
inline vec_float div_scal( vec_float a, vec_float b) { return _mm_mul_ss(a,_mm_rcp_ss(b)); /*_mm_div_ss(a,b);*/ }
inline vec_int div_scal( vec_int a, vec_int b) { return _mm_cvtps_epi32(_mm_mul_ss(_mm_cvtepi32_ps(a),_mm_rcp_ss(_mm_cvtepi32_ps(b)))); } // TO BE CHANGED !!!!!
//inline scal_int div_scal( scal_int a, scal_int b) { return _mm_cvtsi32_si64((_mm_cvtsi64_si32(a))/(_mm_cvtsi64_si32(b))); } // TO BE CHANGED !!!!!


// "Modulo" by a single conditional subtraction: returns a-N when a-N >= 0,
// else a. NOTE(review): this is NOT a general modulo — it is only correct
// when a already lies in [0, 2N) (one wrap-around step), as in a phase/ring
// buffer index update. Verify callers respect this precondition.
inline vec_int mod_vec( vec_int a, vec_int N) {

	vec_int temp = _mm_sub_epi32(a,N);
	vec_int zero = _mm_xor_si128(a,a);

	// (temp > 0) XOR (temp == 0) == (temp >= 0); the two masks are disjoint.
	vec_int select = _mm_xor_si128( _mm_cmpgt_epi32(temp,zero), _mm_cmpeq_epi32(temp,zero)); // a - N >= 0

	return _mm_or_si128(_mm_and_si128(select,temp),_mm_andnot_si128(select,a)); // if( a - N >=0 ) return a - N; else return a;
}

// Scalar variant; identical computation (all lanes), lane 0 is the result.
inline vec_int mod_scal( vec_int a, vec_int N) {

	vec_int temp = _mm_sub_epi32(a,N);
	vec_int zero = _mm_xor_si128(a,a);

	vec_int select = _mm_xor_si128( _mm_cmpgt_epi32(temp,zero), _mm_cmpeq_epi32(temp,zero)); // a - N >= 0

	return _mm_or_si128(_mm_and_si128(select,temp),_mm_andnot_si128(select,a)); // if( a - N >=0 ) return a - N; else return a;
}



// simulation of a*b + c (no native fused multiply-add in SSE2)
#define madd_vec(a,b,c) add_vec(mul_vec(a,b),c)
#define madd_scal(a,b,c) add_scal(mul_scal(a,b),c)

//inline vec_float madd_scal( vec_float a, vec_float b, vec_float c) { return _mm_add_ss(_mm_mul_ss(a,b),c); }
//inline vec_int madd_vec( vec_int a, vec_int b, vec_int c) { return add_vec(mul_vec(a,b),c); }
//inline vec_int madd_scal( vec_int a, vec_int b, vec_int c) { return add_scal(mul_scal(a,b),c); }


// simulation of - ( a*b - c )
//inline vec_float nmsub_vec( vec_float a, vec_float b, vec_float c) { }

// simulation of a*(1/b) + c (approximate reciprocal, see div_vec)
inline vec_float divadd_vec( vec_float a, vec_float b, vec_float c) { return _mm_add_ps(_mm_mul_ps(a,_mm_rcp_ps(b)),c); }
inline vec_float divadd_scal( vec_float a, vec_float b, vec_float c) { return _mm_add_ss(_mm_mul_ss(a,_mm_rcp_ss(b)),c); }
// simulation of - ( a*(1/b) - c )
//inline vec_float divsub_vec( vec_float a, vec_float b, vec_float c) { }
235
236
237
238
// Left shift ( fills with 0's ). The vec_int `num` overloads use the low
// 64 bits of num as a single shift count for all lanes (SSE2 semantics).
inline vec_int shift_left_vec( vec_int a, vec_int num) { return _mm_sll_epi32(a,num); }
inline vec_int shift_left_vec( vec_int a, int num) { return _mm_slli_epi32(a,num); }
inline vec_int shift_left_scal( vec_int a, vec_int num) { return _mm_sll_epi32(a,num); } // _mm_sll_pi32(a,num) in MMX
//inline scal_int shift_left_scal( scal_int a, scal_int num) { return _mm_sll_pi32(a,num); }
inline vec_int shift_left_scal( vec_int a, int num) { return _mm_slli_epi32(a,num); } // _mm_slli_pi32(a,num) in MMX
//inline scal_int shift_left_scal( scal_int a, int num) { return _mm_slli_pi32(a,num); }


// Arithmetic right shift ( fills with the sign bit ).
inline vec_int shift_right_vec( vec_int a, vec_int num) { return _mm_sra_epi32(a,num); }
inline vec_int shift_right_vec( vec_int a, int num) { return _mm_srai_epi32(a,num); }
inline vec_int shift_right_scal( vec_int a, vec_int num) { return _mm_sra_epi32(a,num); } // _mm_sra_pi32(a,num) in MMX
//inline scal_int shift_right_scal( scal_int a, scal_int num) { return _mm_sra_pi32(a,num); }
inline vec_int shift_right_scal( vec_int a, int num) { return _mm_srai_epi32(a,num); } // _mm_srai_pi32(a,num) in MMX
//inline scal_int shift_right_scal( scal_int a, int num) { return _mm_srai_pi32(a,num); }


// Logical right shift ( fills with 0's ).
inline vec_int shift_right_vec_logical( vec_int a, vec_int num) { return _mm_srl_epi32(a,num); }
inline vec_int shift_right_vec_logical( vec_int a, int num) { return _mm_srli_epi32(a,num); }
inline vec_int shift_right_scal_logical( vec_int a, vec_int num) { return _mm_srl_epi32(a,num); } // _mm_srl_pi32(a,num) in MMX
//inline scal_int shift_right_scal_logical( scal_int a, scal_int num) { return _mm_srl_pi32(a,num); }
inline vec_int shift_right_scal_logical( vec_int a, int num) { return _mm_srli_epi32(a,num); } // _mm_srli_pi32(a,num) in MMX
//inline scal_int shift_right_scal_logical( scal_int a, int num) { return _mm_srli_pi32(a,num); }


// Bitwise logic (the float forms operate on the raw bit patterns — useful
// for mask manipulation, not numeric values).
// YO additions; removed
//inline vec_float and_vec( vec_float a, vec_int b) { return _mm_and_ps(a,b); }
//inline vec_float and_vec( vec_int a, vec_float b) { return _mm_and_ps(a,b); }

inline vec_float and_vec( vec_float a, vec_float b) { return _mm_and_ps(a,b); }
inline vec_int and_vec( vec_int a, vec_int b) { return _mm_and_si128(a,b); }
inline vec_float and_scal( vec_float a, vec_float b) { return _mm_and_ps(a,b); }
inline vec_int and_scal( vec_int a, vec_int b) { return _mm_and_si128(a,b); } // _mm_and_si64(a,b) in MMX
//inline scal_int and_scal( scal_int a, scal_int b) { return _mm_and_si64(a,b); }


inline vec_float or_vec( vec_float a, vec_float b) { return _mm_or_ps(a,b); }
inline vec_int or_vec( vec_int a, vec_int b) { return _mm_or_si128(a,b); }
inline vec_float or_scal( vec_float a, vec_float b) { return _mm_or_ps(a,b); }
inline vec_int or_scal( vec_int a, vec_int b) { return _mm_or_si128(a,b); } // _mm_or_si64(a,b) in MMX
//inline scal_int or_scal( scal_int a, scal_int b) { return _mm_or_si64(a,b); }


inline vec_float xor_vec( vec_float a, vec_float b) { return _mm_xor_ps(a,b); }
inline vec_int xor_vec( vec_int a, vec_int b) { return _mm_xor_si128(a,b); }
inline vec_float xor_scal( vec_float a, vec_float b) { return _mm_xor_ps(a,b); }
inline vec_int xor_scal( vec_int a, vec_int b) { return _mm_xor_si128(a,b); } // _mm_xor_si64(a,b) in MMX
//inline scal_int xor_scal( scal_int a, scal_int b) { return _mm_xor_si64(a,b); }

//------------------------------------------------------------------------------------------------------------
// YO : replaced inline vec_float with inline vec_int in the comparison
// operations between vec_float, for better compatibility with vector
// compilation.
//------------------------------------------------------------------------------------------------------------

// cast (bitwise reinterpretation, without numeric conversion)
inline vec_float cast2vec_float(vec_int x) { return _mm_castsi128_ps(x); }
inline vec_int cast2vec_int(vec_float x) { return _mm_castps_si128(x); }

// conversions (numeric, with rounding per MXCSR)
inline vec_float conv2vec_float(vec_int x) { return _mm_cvtepi32_ps(x); }
inline vec_int conv2vec_int(vec_float x) { return _mm_cvtps_epi32(x); }
303
304
// Comparison. Results are per-lane bit masks: all ones (0xffffffff) where the
// comparison holds, all zeros elsewhere — suitable as `c` for choose() below.
//inline vec_float int2float( vec_int a) { return _mm_cvtepi32_ps(a); }



inline vec_float gt_vec( vec_float a, vec_float b) { return _mm_cmpgt_ps(a,b); }
inline vec_float gt_vec( vec_int a, vec_float b) { return _mm_cmpgt_ps(_mm_cvtepi32_ps(a),b); }
inline vec_float gt_vec( vec_float a, vec_int b) { return _mm_cmpgt_ps(a,_mm_cvtepi32_ps(b)); }
inline vec_int gt_vec( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); }

inline vec_float gt_scal( vec_float a, vec_float b) { return _mm_cmpgt_ps(a,b); }
inline vec_float gt_scal( vec_int a, vec_float b) { return _mm_cmpgt_ps(_mm_cvtepi32_ps(a),b); }
inline vec_float gt_scal( vec_float a, vec_int b) { return _mm_cmpgt_ps(a,_mm_cvtepi32_ps(b)); }
inline vec_int gt_scal( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); }

// choose between two values choose(c,u,v) = c?u:v, implemented as a bitwise
// blend — c must be a full per-lane mask (all ones / all zeros).
// The type of the result depends on the types of u and v, not the type of c.

inline vec_float choose(vec_float c, vec_float u, vec_float v) { return _mm_or_ps(_mm_and_ps(c,u), _mm_andnot_ps(c,v)); }
inline vec_float choose(vec_float c, vec_int u, vec_float v) { return _mm_or_ps(_mm_and_ps(c,_mm_cvtepi32_ps(u)), _mm_andnot_ps(c,v)); }
inline vec_float choose(vec_float c, vec_float u, vec_int v) { return _mm_or_ps(_mm_and_ps(c,u), _mm_andnot_ps(c,_mm_cvtepi32_ps(v))); }

// Integer conditions are reinterpreted (cast, not converted) to float masks.
inline vec_float choose(vec_int c, vec_float u, vec_float v) { return choose(cast2vec_float(c), u, v); }
inline vec_float choose(vec_int c, vec_int u, vec_float v) { return choose(cast2vec_float(c), u, v); }
inline vec_float choose(vec_int c, vec_float u, vec_int v) { return choose(cast2vec_float(c), u, v); }

inline vec_int choose(vec_int c, vec_int u, vec_int v) { return _mm_or_si128(_mm_and_si128(c,u), _mm_andnot_si128(c,v)); }
inline vec_int choose(vec_float c, vec_int u, vec_int v) { return choose(cast2vec_int(c), u, v); }

// choose between two values choosezero(c,u) = c?u:0
inline vec_float choosezero(vec_float c, vec_float u) { return _mm_and_ps(c,u); }
inline vec_float choosezero(vec_int c, vec_float u) { return choosezero(cast2vec_float(c), u); }

inline vec_int choosezero(vec_int c, vec_int u) { return _mm_and_si128(c,u); }
inline vec_int choosezero(vec_float c, vec_int u) { return choosezero(cast2vec_int(c), u); }
340
341
342 //inline vec_int gt_vec( vec_float a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ps(a,b),31); }
343 //inline vec_int gt_vec( vec_float a, vec_float b) { vec_univ v; v.f4 = _mm_cmpgt_ps(a,b); return _mm_srli_epi32(v.i4,31); }
344 //inline vec_int gt_vec( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); }
345 //inline vec_int gt_scal( vec_float a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
346 //inline vec_int gt_scal( vec_float a, vec_float b) { vec_univ v; v.f4 = _mm_cmpgt_ss(a,b); return _mm_srli_epi32(v.i4,31); }
347 //inline vec_int gt_scal( vec_float a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
348 //inline vec_int gt_scal( vec_int a, vec_float b) { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
349 //inline vec_int gt_scal( vec_int a, vec_int b) { return _mm_cmpgt_epi32(a,b); } // _mm_cmpgt_pi32(a,b) en MMX
350
351 //inline scal_int gt_scal( scal_int a, scal_int b) { return (__m128i) _mm_cmpgt_pi32(a,b); }
352
353 #if 0
354
355 inline vec_int lt_vec( vec_float a, vec_float b) { return _mm_cmplt_ps(a,b); }
356 inline vec_int lt_vec( vec_int a, vec_int b) { return _mm_cmpgt_epi32(b,a); }
357 inline vec_int lt_scal( vec_float a, vec_float b) { return _mm_cmplt_ss(a,b); }
358 inline vec_int lt_scal( vec_int a, vec_int b) { return _mm_cmpgt_epi32(b,a); } // _mm_cmpgt_pi32(b,a) en MMX
359 //inline scal_int lt_scal( scal_int a, scal_int b) { return _mm_cmpgt_pi32(b,a); }
360
361
362
363 inline vec_int ge_vec( vec_float a, vec_float b) { return _mm_cmpge_ps(a,b); }
364 inline vec_int ge_vec( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(a,b), _mm_cmpeq_epi32(a,b)); }
365 inline vec_int ge_scal( vec_float a, vec_float b) { return _mm_cmpge_ss(a,b); }
366 inline vec_int ge_scal( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(a,b), _mm_cmpeq_epi32(a,b)); } // _mm_xor_si64,_mm_cmpgt_pi32,_mm_cmpeq_pi32 MMX
367 //inline scal_int ge_scal( scal_int a, scal_int b) { return _mm_xor_si64( _mm_cmpgt_pi32(a,b),_mm_cmpeq_pi32(a,b)); }
368
369
370 inline vec_int le_vec( vec_float a, vec_float b) { return _mm_cmple_ps(a,b); }
371 inline vec_int le_vec( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(b,a), _mm_cmpeq_epi32(b,a)); }
372 inline vec_int le_scal( vec_float a, vec_float b) { return _mm_cmple_ss(a,b); }
373 inline vec_int le_scal( vec_int a, vec_int b) { return _mm_xor_si128( _mm_cmpgt_epi32(b,a), _mm_cmpeq_epi32(b,a)); } // _mm_xor_si64,_mm_cmpgt_pi32,_mm_cmpeq_pi32 MMX
374 //inline scal_int le_scal( scal_int a, scal_int b) { return _mm_xor_si64( _mm_cmpgt_pi32(b,a),_mm_cmpeq_pi32(b,a)); }
375
376
377 inline vec_int eq_vec( vec_float a, vec_float b) { return _mm_cmpeq_ps(a,b); }
378 inline vec_int eq_vec( vec_int a, vec_int b) { return _mm_cmpeq_epi32(a,b); }
379 inline vec_int eq_scal( vec_float a, vec_float b) { return _mm_cmpeq_ss(a,b); }
380 inline vec_int eq_scal( vec_int a, vec_int b) { return _mm_cmpeq_epi32(a,b); } // _mm_cmpeq_pi32(a,b) en MMX
381 //inline scal_int eq_scal( scal_int a, scal_int b) { return _mm_cmpeq_pi32(a,b); }
382
383
384 inline vec_int neq_vec( vec_float a, vec_float b) { return _mm_cmpneq_ps(a,b); }
385 inline vec_int neq_vec( vec_int a, vec_int b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), _mm_cmpeq_epi32(a,a)); }
386 inline vec_int neq_scal( vec_float a, vec_float b) { return _mm_cmpneq_ss(a,b); }
387 inline vec_int neq_scal( vec_int a, vec_int b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), _mm_cmpeq_epi32(a,a)); } // _mm_andnot_si64,_mm_cmpeq_pi32 MMX
388 //inline scal_int neq_scal( scal_int a, scal_int b) { return _mm_andnot_si64(_mm_cmpeq_pi32(a,b),SCAL_INT_ALL_ONE); }
389
390 #endif
391
392
393 // memory
394
395 #if 0
396 inline vec_float set_vec( double a) { float val = float(a); vec_float temp = _mm_load_ss(&val); return _mm_shuffle_ps(temp,temp,0x00); }
397 inline vec_float set_vec( float a) { float val = a; vec_float temp = _mm_load_ss(&val); return _mm_shuffle_ps(temp,temp,0x00); }
398 inline vec_int set_vec( long int a) { vec_int temp = _mm_cvtsi32_si128(int(a)); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
399 inline vec_int set_vec( int a) { vec_int temp = _mm_cvtsi32_si128(a); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp);}
400 inline vec_int set_vec( short a) { vec_int temp = _mm_cvtsi32_si128(int(a)); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
401 //inline scal_int set_vec( long int a) { _mm_cvtsi32_si64(int(a)); }
402 //inline scal_int set_vec( int a) { _mm_cvtsi32_si64(a); }
403 //inline scal_int set_vec( short a) { _mm_cvtsi32_si64(int(a)); }
404 #endif
405
406 #if 0
407
408 inline vec_float set_vec( double a, double b, double c, double d) { __vec_float temp; temp.s[0]=float(a); temp.s[1]=float(b); temp.s[2]=float(c); temp.s[3]=float(d); return temp.v; }
409 inline vec_float set_vec( float a, float b, float c, float d) { __vec_float temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
410 inline vec_int set_vec( int a, int b, int c, int d) { __vec_int temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
411 inline vec_int set_vec( long int a, long int b, long int c, long int d) { __vec_int temp; temp.s[0]=int(a); temp.s[1]=int(b); temp.s[2]=int(c); temp.s[3]=int(d); return temp.v; }
412 inline vec_int set_vec( short a, short b, short c, short d) { __vec_int temp; temp.s[0]=short(a); temp.s[1]=short(b); temp.s[2]=short(c); temp.s[3]=short(d); return temp.v; }
413
414 #endif
415
// Construction from scalars (delegates to the vec_float/vec_int constructors;
// a is lane 0, d is lane 3).
inline vec_float set_vec( float a, float b, float c, float d) { return vec_float(a,b,c,d); }
inline vec_int set_vec( int a, int b, int c, int d) { return vec_int(a,b,c,d); }

// Broadcast a scalar to all four lanes.
inline vec_float set_vec( float a) { return vec_float(a); }
inline vec_int set_vec( int a) { return vec_int(a); }

// Aligned loads: `a` must be 16-byte aligned.
inline vec_float load_a_vec( float* a) { return _mm_load_ps(a); }
inline vec_int load_a_vec( int* a) { return _mm_load_si128((__m128i*)a); }

// Unaligned loads.
inline vec_float load_u_vec( float* a) { return _mm_loadu_ps(a); }
inline vec_int load_u_vec( int* a) { return _mm_loadu_si128((__m128i*)a); }

// Non-temporal stores: write without polluting the cache ( `a` must be
// 16-byte aligned ).
inline void store_stream( float* a, vec_float b) { return _mm_stream_ps(a,b); }
inline void store_stream( int* a, vec_int b) { return _mm_stream_si128((__m128i*)a,b); }

// Aligned stores: `a` must be 16-byte aligned.
inline void store_a_vec( float* a, vec_float b) { return _mm_store_ps(a,b); }
inline void store_a_vec( int* a, vec_int b) { return _mm_store_si128((__m128i*)a,b); }

// Unaligned stores.
inline void store_u_vec( float* a, vec_float b) { return _mm_storeu_ps(a,b); }
inline void store_u_vec( int* a, vec_int b) { return _mm_storeu_si128((__m128i*)a,b); }

// Scalar load into lane 0 (other lanes zeroed).
inline vec_float load_scal(float* a) { return _mm_load_ss(a); }
inline vec_int load_scal(int* a) { return _mm_cvtsi32_si128(*a); }
//inline scal_int load_scal(int* a) { return _mm_cvtsi32_si64(*a); }

// Scalar store of lane 0.
inline void store_scal(float* a, vec_float content) { return _mm_store_ss(a,content); }
inline void store_scal(int* a, vec_int content) { *a = _mm_cvtsi128_si32(content); return; }
//inline void store_scal(int* a, scal_int content) { *a = _mm_cvtsi64_si32(content); return; }


// Recursion helpers: lane permutations used by the Faust compiler for
// one-sample feedback within a vector.
// NOTE(review): REC1 is byte-identical to REC0 — presumably intentional, but
// verify against the compiler's vectorisation scheme.
inline vec_float REC0(vec_float a) { return _mm_unpacklo_ps(a,a); }
inline vec_float REC1(vec_float a) { return _mm_unpacklo_ps(a,a); }
inline vec_float REC2(vec_float a) { return _mm_shuffle_ps(a,a,0x90); }
inline vec_float REC3(vec_float a) { return _mm_shuffle_ps(a,a,0x1B); }

inline vec_int REC0(vec_int a) { return _mm_unpacklo_epi32(a,a); }
inline vec_int REC1(vec_int a) { return _mm_unpacklo_epi32(a,a); }
inline vec_int REC2(vec_int a) { return _mm_shuffle_epi32(a,0x90); }
inline vec_int REC3(vec_int a) { return _mm_shuffle_epi32(a,0x1B); }
458
459
460
461
462
// scalar to vector: takes 4 vectors whose lowest element stands for a scalar
// value and rebuilds one vector from these 4 scalars (a0 -> lane 0, ... a3 -> lane 3).
//inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) { return _mm_shuffle_ps(_mm_shuffle_ps(a0,a1,0x00),_mm_shuffle_ps(a2,a3,0x00),0x88); }
inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) { return _mm_unpacklo_ps(_mm_unpacklo_ps(a0,a2),_mm_unpacklo_ps(a1,a3)); }
inline vec_int SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3) { return _mm_unpacklo_epi32(_mm_unpacklo_epi32(a0,a2),_mm_unpacklo_epi32(a1,a3)); }
//inline vec_int SCAL2VEC(scal_int a0,scal_int a1,scal_int a2,scal_int a3) { return _mm_unpacklo_epi32( _mm_movpi64_epi64(_mm_unpacklo_pi32(a0,a2)), _mm_movpi64_epi64(_mm_unpacklo_pi32(a1,a3)) ); } // or _mm_set_epi64( _mm_unpacklo_pi32(a0,a1), _mm_unpacklo_pi32(a2,a3))

// Overloads building a vector from 4 native scalars (narrowing double->float
// and long/short->int).
inline vec_float SCAL2VEC(double a0, double a1, double a2, double a3) { return _mm_set_ps(float(a3),float(a2),float(a1),float(a0)); }
inline vec_float SCAL2VEC(float a0, float a1, float a2, float a3) { return _mm_set_ps(a3,a2,a1,a0); }
inline vec_int SCAL2VEC(long a0, long a1, long a2, long a3) { return _mm_set_epi32(int(a3),int(a2),int(a1),int(a0)); }
inline vec_int SCAL2VEC(int a0, int a1, int a2, int a3) { return _mm_set_epi32(a3,a2,a1,a0); }
inline vec_int SCAL2VEC(short a0, short a1, short a2, short a3) { return _mm_set_epi32(int(a3),int(a2),int(a1),int(a0)); }


// vector to scalar: build a scalar vector from one element of the initial vector
inline vec_float VEC2SCALVEC0(vec_float a) { return a; } // return x,x,x,a0 // _mm_shuffle_ps(a,a,0x00) would return a0,a0,a0,a0
inline vec_float VEC2SCALVEC1(vec_float a) { return _mm_shuffle_ps(a,a,0x55); } // return a1,a1,a1,a1
inline vec_float VEC2SCALVEC2(vec_float a) { return _mm_shuffle_ps(a,a,0xAA); } // return a2,a2,a2,a2
inline vec_float VEC2SCALVEC3(vec_float a) { return _mm_shuffle_ps(a,a,0xFF); } // return a3,a3,a3,a3

inline vec_int VEC2SCALVEC0(vec_int a) { return a; } // return x,x,x,a0 // _mm_shuffle_epi32(a,0x00) would return a0,a0,a0,a0
inline vec_int VEC2SCALVEC1(vec_int a) { return _mm_shuffle_epi32(a,0x55); } // return a1,a1,a1,a1
inline vec_int VEC2SCALVEC2(vec_int a) { return _mm_shuffle_epi32(a,0xAA); } // return a2,a2,a2,a2
inline vec_int VEC2SCALVEC3(vec_int a) { return _mm_shuffle_epi32(a,0xFF); } // return a3,a3,a3,a3

//inline scal_int VEC2SCALVEC0(vec_int a) { return _mm_movepi64_pi64(a); } // WARNING !!!! :
//inline scal_int VEC2SCALVEC1(vec_int a) { __m64 temp = _mm_movepi64_pi64(a); return _mm_unpackhi_pi32(temp,temp); } // VEC2SCALVEC0 and 1 could be merged into one more efficient instruction
//inline scal_int VEC2SCALVEC2(vec_int a) { return _mm_movepi64_pi64(_mm_shuffle_epi32(a,0xAA)); }
//inline scal_int VEC2SCALVEC3(vec_int a) { return _mm_movepi64_pi64(_mm_shuffle_epi32(a,0xFF)); }



// vector to scalar: extract one lane as a native scalar.
inline float VEC2SCAL0(vec_float a) { float temp; _mm_store_ss(&temp,a); return temp; }
inline float VEC2SCAL1(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0x55)); return temp; }
inline float VEC2SCAL2(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0xAA)); return temp; }
inline float VEC2SCAL3(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0xFF)); return temp; }

inline int VEC2SCAL0(vec_int a) { return _mm_cvtsi128_si32(a); }
inline int VEC2SCAL1(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0x55)); }
inline int VEC2SCAL2(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0xAA)); }
inline int VEC2SCAL3(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0xFF)); }
//inline int VEC2SCAL0(scal_int a) { return _mm_cvtsi64_si32(a); }
505
506
507
508 // select: if( select == 0 ) then a ; else b ;
509 inline vec_float select_vec( vec_float select, vec_float a, vec_float b) { return _mm_or_ps( _mm_andnot_ps(select,a), _mm_and_ps(select,b));}
510 inline vec_float select_scal( vec_float select, vec_float a, vec_float b){ return _mm_or_ps( _mm_andnot_ps(select,a), _mm_and_ps(select,b));}
511 inline vec_float select_vec( vec_int select, vec_float a, vec_float b) { __m128 temp = _mm_cvtepi32_ps(select); return _mm_or_ps( _mm_andnot_ps(temp,a), _mm_and_ps(temp,b));}
512 inline vec_float select_scal( vec_int select, vec_float a, vec_float b){ __m128 temp = _mm_cvtepi32_ps(select); return _mm_or_ps( _mm_andnot_ps(temp,a), _mm_and_ps(temp,b));}
513 inline vec_int select_vec( vec_int select, vec_int a, vec_int b) { return _mm_or_si128( _mm_andnot_si128(select,a), _mm_and_si128(select,b)); }
514 inline vec_int select_scal( vec_int select, vec_int a, vec_int b) { return _mm_or_si128( _mm_andnot_si128(select,a), _mm_and_si128(select,b)); } // ou MMX
515
516
517
// vectorial version of the "mem" Faust key-word: one/two/three-sample delay
// across vector boundaries, where b is the previous vector and a the current.
// return a[2] a[1] a[0] b[3]
inline vec_float mem1_vec( vec_float a, vec_float b) { return _mm_shuffle_ps(_mm_shuffle_ps(b,a,0x4E),a,0x99); }
inline vec_int mem1_vec( vec_int a, vec_int b) {
	return _mm_unpacklo_epi32( _mm_shuffle_epi32( _mm_unpacklo_epi32( _mm_shuffle_epi32( b, 0xFF), a), 0xEE), _mm_shuffle_epi32( a, 0x88) );
}

// return a[1] a[0] b[3] b[2]
inline vec_float mem2_vec( vec_float a, vec_float b) { return _mm_shuffle_ps(b,a,0x4E); }
inline vec_int mem2_vec( vec_int a, vec_int b) { return _mm_shuffle_epi32( _mm_unpackhi_epi32( b, _mm_shuffle_epi32( a, 0x44)), 0xD8 );

}

// return a[0] b[3] b[2] b[1]
inline vec_float mem3_vec( vec_float a, vec_float b) { return _mm_shuffle_ps(b,_mm_shuffle_ps(b,a,0x4E),0x99); }
inline vec_int mem3_vec( vec_int a, vec_int b) {
	return _mm_unpacklo_epi32( _mm_shuffle_epi32( b, 0x99), _mm_shuffle_epi32( _mm_unpackhi_epi32( b, _mm_shuffle_epi32( a, 0x00)), 0xEE) );
}




// Boolean-mask conversions.
// bool2float: mask (all ones / all zeros) -> numeric 1.0f / 0.0f per lane.
inline vec_float bool2float( vec_float a ) { return _mm_and_ps(a,set_vec(1.0f)); }
// Integer mask: 0 - cmpeq(a,a) builds the constant 1, masked by a, then converted.
inline vec_float bool2float( vec_int a ) { return _mm_cvtepi32_ps(_mm_and_si128(a,_mm_sub_epi32(_mm_xor_si128(a,a),_mm_cmpeq_epi32(a,a)))); }

// bool2int: mask -> numeric 1 / 0 per lane.
inline vec_int bool2int( vec_int a) { return _mm_and_si128(a,_mm_sub_epi32(_mm_xor_si128(a,a),_mm_cmpeq_epi32(a,a))); }
inline vec_int bool2int( vec_float a ) { return _mm_cvtps_epi32(_mm_and_ps(a,set_vec(1.0f))); }
547
548 inline vec_int boolfloat2boolint( vec_float a ) { vec_int temp; asm volatile("" : "=xmm" (temp) : "0" (a)); return temp; }
549 inline vec_float boolint2boolfloat( vec_int a ) { vec_float temp; asm volatile("" : "=xmm" (temp) : "0" (a)); return temp; }
550
551
552
553
554
555
556
557 #elif defined(__ALTIVEC__)
558
559
560 /****************************************************/
561 /* */
562 /* ALTIVEC implementation */
563 /* */
564 /****************************************************/
565
566
567 //#define vec_float vector float
568 //#define vec_int vector signed int
569 //#define vec_bool vector bool int
570
// AltiVec wrapper around a 4-lane signed-int vector register.
// NOTE(review): unlike the SSE2 versions, these structs define no conversion
// operator back to the native vector type (it is commented out), yet the
// functions below pass them straight to vec_add/vec_sub — verify that this
// AltiVec path still compiles; it appears less maintained than the SSE2 one.
struct vec_int
{
	vector signed int vec;
	vec_int() {}
	vec_int(vector signed int m) { vec = m; }
	// operator __m128i() const { return vec; }

};

// AltiVec wrapper around a 4-lane float vector register.
struct vec_float
{
	// union { __m128 vec; __m128i i4; };
	vector float vec;
	vec_float() {}
	vec_float(vector float m) { vec = m; }
	//vec_float(vec_int a) { vec = _mm_cvtepi32_ps(a); }

	//operator __m128() const { return vec; }
};

// Unions for element-wise access to the vector registers.
typedef union{
	float s[4];
	vec_float v;
} __vec_float;


typedef union{
	int s[4];
	vec_int v;
} __vec_int;
601
602
// Non-Java mode: on underflow a zero result is returned (sets the NJ bit in
// the VSCR). Not compatible with the Java/IEEE/C9X standard ( which delivers
// denormalized results in case of underflow ).
#define NO_DENORMALIZE vec_mtvscr(vec_or(vec_mfvscr(),(vector unsigned short)(0x8000)))
#define DENORMALIZE vec_mtvscr(vec_or(vec_mfvscr(),(vector unsigned short)(0x0000)))
608
609
610
611
// Constant generators (AltiVec). As in the SSE2 section, `a` is a dummy
// argument used only to synthesize constants in registers.

// 0 0 0 0
#define VEC_INT_ZERO(a) vec_xor(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_INT_ONES(a) (vector signed int)vec_cmpeq(a,a)

// 1 1 1 1
#define VEC_INT_ONE(a) vec_splat_s32(1)

// -1 -1 -1 -1
#define VEC_INT_MINUS_ONE(a) vec_splat_s32(-1)

// a must belong to [-16,15] (vec_splat_s32 immediate range)
// no efficient equivalent with SSE2
#define VEC_INT_MINUS_16_TO_15(a) vec_splat_s32(a)

// This is not exactly equivalent to the SSE2 version:
// the power must belong to [17,31]
// ( that is 2^17 - 1 = 32767 is the minimum
// and 2^31 - 1 = 2147483647 is the maximum;
// if you need 2^32 - 1, use VEC_INT_ONES )
// Example: 2^19 - 1 = 524287 -> VEC_INT_PW2_MINUS_1(a,19)
#define VEC_INT_PW2_MINUS_1(a,pw) vec_sr((vector signed int)vec_cmpeq(a,a), vec_splat_u32(32-pw))


// This is not exactly equivalent to the SSE2 version:
// the power must belong to [4,18]
// ( that is 2^18 = 262144 is the maximum
// and 2^4 = 16 is the minimum;
// if you need 2^0 = 1, use VEC_INT_ONE,
// if you need 2^1 = 2, use VEC_INT_MINUS_16_TO_15(2)
// if you need 2^2 = 4, use VEC_INT_MINUS_16_TO_15(4)
// if you need 2^3 = 8, use VEC_INT_MINUS_16_TO_15(8) )
// Example: 2^10 = 1024 -> VEC_INT_PW2(a,10)
#define VEC_INT_PW2(a,pw) vec_sl(vec_splat_s32(8), vec_splat_u32(pw-3))


//vec_sr(a,(vector unsigned int)num);

// 0.0 0.0 0.0 0.0
#define VEC_FLOAT_ZERO(a) vec_xor(a,a)

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_FLOAT_ONES(a) (vector float)vec_cmpeq(a,a)
658
659
660
661
662
663
664
665
666
667
// arithmetic
// ( the _vec / _scal flavours are identical here: a "scalar" value is simply a
//   vector whose element 0 carries the scalar, so the full-width op works for both )
inline vec_float add_vec( vec_float a, vec_float b) { return vec_add(a,b); }
inline vec_float add_scal(vec_float a, vec_float b) { return vec_add(a,b); }
inline vec_int add_vec( vec_int a, vec_int b) { return vec_add(a,b); }
inline vec_int add_scal(vec_int a, vec_int b) { return vec_add(a,b); }


inline vec_float sub_vec( vec_float a, vec_float b) { return vec_sub(a,b); }
inline vec_float sub_scal( vec_float a, vec_float b) { return vec_sub(a,b); }
inline vec_int sub_vec( vec_int a, vec_int b) { return vec_sub(a,b); }
inline vec_int sub_scal( vec_int a, vec_int b) { return vec_sub(a,b); }


// AltiVec has no plain float multiply: multiply = madd with a zero addend
// ( the zero vector is built by reinterpreting an integer splat of 0 )
inline vec_float mul_vec( vec_float a, vec_float b) { return vec_madd(a,b,(vec_float)(vec_splat_s32(int(0x00000000)))); }
inline vec_float mul_scal( vec_float a, vec_float b) { return vec_madd(a,b,(vec_float)(vec_splat_s32(int(0x00000000)))); }
683
684
685 // low 32 bits of a 32 * 32 bit multiplication: each double-word X and Y is broken down into two words, A & B and C & D:
686 // X = ( A << 16 ) + B
687 // Y = ( C << 16 ) + D
688 // then:
689 // X * Y = (( A << 16 ) + B ) * (( C << 16 ) + D )
690 // X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
691 // the partial result A*C does not appear in the low 32 bits result so does not need to be computed ( however, if it's different
692 // from zero, then there is an overflow )
693 // In this implementation A*D + B*C is computed in a single "vec_msum"
694
// 32x32 -> low-32-bit multiply, per the derivation above: (A*D + B*C) << 16 + B*D.
// The shift/rotate count 16 is encoded as vec_splat_u32(-16): the splat immediate
// must lie in [-16,15], and the shift intrinsics use each element modulo 32,
// so -16 ( 0xFFFFFFF0 ) acts as a count of 16.
inline vec_int mul_vec( vec_int a, vec_int b) {
	const vector unsigned int VEC_SIXTEEN_UINT32 = vec_splat_u32(-16);
	return (vector signed int)vec_add( vec_sl( vec_msum( (vector unsigned short)a, (vector unsigned short)(vec_rl( b, VEC_SIXTEEN_UINT32 ) ), vec_splat_u32(0) ), VEC_SIXTEEN_UINT32 ), vec_mulo( (vector unsigned short)a, (vector unsigned short)b ) );
}

// identical to mul_vec; kept as a separate overload for the generated code's
// vec/scal calling convention
inline vec_int mul_scal( vec_int a, vec_int b) {
	const vector unsigned int VEC_SIXTEEN_UINT32 = vec_splat_u32(-16);
	return (vector signed int)vec_add( vec_sl( vec_msum( (vector unsigned short)a, (vector unsigned short)(vec_rl( b, VEC_SIXTEEN_UINT32 ) ), vec_splat_u32(0) ), VEC_SIXTEEN_UINT32 ), vec_mulo( (vector unsigned short)a, (vector unsigned short)b ) );
}
704
705
706 //inline vec_int mul_vec( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),(vec_float)(b),(vec_float)(vec_splat_s32(int(0x00000000))))); }
707 //inline vec_int mul_scal( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),(vec_float)(b),(vec_float)(vec_splat_s32(int(0x00000000))))); }
708
709
710
// Division implemented as a * reciprocal-estimate(b): vec_re gives only a
// ~12-bit approximation, so these are NOT IEEE-exact divides ( no
// Newton-Raphson refinement step is applied ).
// NOTE(review): the integer versions cast the wrapper structs with C-style
// (vec_float)/(vec_int) casts and round the float result back; verify these
// casts and vec_round's result type on an actual AltiVec toolchain.
inline vec_float div_vec( vec_float a, vec_float b) { return vec_madd(a,vec_re(b),(vec_float)(vec_splat_s32(int(0x00000000)))); }
inline vec_float div_scal( vec_float a, vec_float b) { return vec_madd(a,vec_re(b),(vec_float)(vec_splat_s32(int(0x00000000)))); }
inline vec_int div_vec( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),vec_re((vec_float)(b)),(vec_float)(vec_splat_s32(int(0x00000000))))); }
inline vec_int div_scal( vec_int a, vec_int b) { return (vec_int)vec_round(vec_madd((vec_float)(a),vec_re((vec_float)(b)),(vec_float)(vec_splat_s32(int(0x00000000))))); }
715
716
// "Modulo" as a single conditional subtract: if( a - N >= 0 ) a - N else a.
// NOTE(review): this is a wrap-around step, not a general modulo — it is only
// correct when 0 <= a < 2N ( as produced by an incrementing phase counter );
// confirm that all call sites satisfy this.
inline vec_int mod_vec( vec_int a, vec_int N) {

	vec_int temp = vec_sub(a,N);
	vec_int zero = vec_splat_s32(int(0x00000000));

	// (temp > 0) XOR (temp == 0) == (temp >= 0): the two masks are disjoint
	vector bool int select = (vector bool int )(vec_xor(vec_cmpgt(temp,zero),vec_cmpeq(temp,zero))); // a - N >= 0

	return vec_sel(a,temp,select); // if( a - N >=0 ) return a - N; else return a;
}

// identical to mod_vec; separate overload for the vec/scal calling convention
inline vec_int mod_scal( vec_int a, vec_int N) {

	vec_int temp = vec_sub(a,N);
	vec_int zero = vec_splat_s32(int(0x00000000));

	vector bool int select = (vector bool int )(vec_xor(vec_cmpgt(temp,zero),vec_cmpeq(temp,zero))); // a - N >= 0

	return vec_sel(a,temp,select); // if( a - N >=0 ) return a - N; else return a;
}
736
737
738
739
// return a*b + c
inline vec_float madd_vec( vec_float a, vec_float b, vec_float c) { return vec_madd(a,b,c); }
inline vec_float madd_scal( vec_float a, vec_float b, vec_float c) { return vec_madd(a,b,c); }

// return - ( a*b - c )
inline vec_float nmsub_vec( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,b,c); }
inline vec_float nmsub_scal( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,b,c); }


// return a*(1/b) + c
// ( vec_re is a reciprocal ESTIMATE, ~12 bits — not an exact divide )
inline vec_float divadd_vec( vec_float a, vec_float b, vec_float c) { return vec_madd(a,vec_re(b),c); }
inline vec_float divadd_scal( vec_float a, vec_float b, vec_float c) { return vec_madd(a,vec_re(b),c); }


// return - ( a*(1/b) - c )
inline vec_float divsub_vec( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,vec_re(b),c); }
inline vec_float divsub_scal( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,vec_re(b),c); }
757
758
759
760
// bitwise logic ( operates on raw bits, so it works equally on comparison
// masks of either element type )
inline vec_float and_vec( vec_float a, vec_float b) { return vec_and(a,b); }
inline vec_float and_scal( vec_float a, vec_float b) { return vec_and(a,b); }
inline vec_int and_vec( vec_int a, vec_int b) { return vec_and(a,b); }
inline vec_int and_scal( vec_int a, vec_int b) { return vec_and(a,b); }

inline vec_float or_vec( vec_float a, vec_float b) { return vec_or(a,b); }
inline vec_float or_scal( vec_float a, vec_float b) { return vec_or(a,b); }
inline vec_int or_vec( vec_int a, vec_int b) { return vec_or(a,b); }
inline vec_int or_scal( vec_int a, vec_int b) { return vec_or(a,b); }

inline vec_float xor_vec( vec_float a, vec_float b) { return vec_xor(a,b); }
inline vec_float xor_scal( vec_float a, vec_float b) { return vec_xor(a,b); }
inline vec_int xor_vec( vec_int a, vec_int b) { return vec_xor(a,b); }
inline vec_int xor_scal( vec_int a, vec_int b) { return vec_xor(a,b); }
776
777
778
// shift left ( per-element counts; AltiVec uses each count modulo 32 )
inline vec_int shift_left_vec( vec_int a, vec_int num) { return vec_sl(a,(vector unsigned int)num); }
inline vec_int shift_left_scal( vec_int a, vec_int num) { return vec_sl(a,(vector unsigned int)num); }

// shift right ( and fill with the sign bit — arithmetic shift )
inline vec_int shift_right_vec( vec_int a, vec_int num) { return vec_sra(a,(vector unsigned int)num); }
inline vec_int shift_right_scal( vec_int a, vec_int num) { return vec_sra(a,(vector unsigned int)num); }

// shift right ( and fill with 0's — logical shift )
//inline vec_int shift_right_vec_logical( vec_int a, int num) { return vec_sr(a, set_vec(num) ); }
//inline vec_int shift_right_scal_logical( vec_int a, int num) { return vec_sr(a, set_vec(num) ); ); }
inline vec_int shift_right_vec_logical( vec_int a, vec_int num) { return vec_sr(a,(vector unsigned int)num); }
inline vec_int shift_right_scal_logical( vec_int a, vec_int num) { return vec_sr(a,(vector unsigned int)num); }
792
793
794
795
// comparison
// All predicates return a per-element mask ( all-ones where true, all-zero
// where false ) reinterpreted in the operand's element type, so the result
// can feed the bitwise logic / select helpers above.
inline vec_float gt_vec( vec_float a, vec_float b) { return (vector float)vec_cmpgt(a,b); }
inline vec_float gt_scal( vec_float a, vec_float b) { return (vector float)vec_cmpgt(a,b); }
inline vec_int gt_vec( vec_int a, vec_int b) { return (vector signed int)vec_cmpgt(a,b); }
inline vec_int gt_scal( vec_int a, vec_int b) { return (vector signed int)vec_cmpgt(a,b); }


inline vec_float lt_vec( vec_float a, vec_float b) { return (vector float)vec_cmplt(a,b); }
inline vec_float lt_scal( vec_float a, vec_float b) { return (vector float)vec_cmplt(a,b); }
inline vec_int lt_vec( vec_int a, vec_int b) { return (vector signed int)vec_cmplt(a,b); }
inline vec_int lt_scal( vec_int a, vec_int b) { return (vector signed int)vec_cmplt(a,b); }


// integer >= / <= have no direct AltiVec intrinsic: built as (gt XOR eq) /
// (lt XOR eq) — the two masks are disjoint, so XOR acts as OR here
inline vec_float ge_vec( vec_float a, vec_float b) { return (vector float)vec_cmpge(a,b); }
inline vec_float ge_scal( vec_float a, vec_float b) { return (vector float)vec_cmpge(a,b); }
inline vec_int ge_vec( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpgt(a,b),vec_cmpeq(a,b)); }
inline vec_int ge_scal( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpgt(a,b),vec_cmpeq(a,b)); }


inline vec_float le_vec( vec_float a, vec_float b) { return (vector float)vec_cmple(a,b); }
inline vec_float le_scal( vec_float a, vec_float b) { return (vector float)vec_cmple(a,b); }
inline vec_int le_vec( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmplt(a,b),vec_cmpeq(a,b)); }
inline vec_int le_scal( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmplt(a,b),vec_cmpeq(a,b)); }


inline vec_float eq_vec( vec_float a, vec_float b) { return (vector float)vec_cmpeq(a,b); }
inline vec_float eq_scal( vec_float a, vec_float b) { return (vector float)vec_cmpeq(a,b); }
inline vec_int eq_vec( vec_int a, vec_int b) { return (vector signed int)vec_cmpeq(a,b); }
inline vec_int eq_scal( vec_int a, vec_int b) { return (vector signed int)vec_cmpeq(a,b); }


// not-equal: invert the eq mask by XOR against cmpeq(a,a) used as an all-ones
// generator. NOTE(review): for floats, cmpeq(a,a) is FALSE in lanes holding
// NaN, so neq misbehaves on NaN input — confirm NaNs cannot reach here.
inline vec_float neq_vec( vec_float a, vec_float b) { return (vector float)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
inline vec_float neq_scal( vec_float a, vec_float b) { return (vector float)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
inline vec_int neq_vec( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
inline vec_int neq_scal( vec_int a, vec_int b) { return (vector signed int)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
831
832
833
// memory
// identity / unwrap overloads so the generated code can call set_vec uniformly
// whatever the argument's current representation is
inline vec_float set_vec( vec_float a) { return a; }
inline vec_float set_vec( __vec_float a) { return a.v; }
inline vec_int set_vec( vec_int a) { return a; }
inline vec_int set_vec( __vec_int a) { return a.v; }
839
840
841
// Broadcast one scalar into all four lanes.
// Pattern: vec_lde loads the scalar somewhere into a vector register, the
// vec_perm with vec_lvsl(0,&x) moves it to element 0, then vec_splat copies
// element 0 everywhere. ( The commented union-based fallbacks do the same
// through memory. )
inline vec_float set_vec( double a) {
	float af;
	af = (float)a;		// narrowed to float: the vector holds floats
	vector float temp; temp = vec_lde(0,&af);
	temp = vec_perm(temp,temp,vec_lvsl(0,&af));
	return vec_splat(temp,0);
	//__vec_float temp; float af = float(a); temp.s[0]=af; temp.s[1]=af; temp.s[2]=af; temp.s[3]=af; return temp.v;
}
inline vec_float set_vec( float a) {
	float af;
	af = a;
	vector float temp; temp = vec_lde(0,&af);
	temp = vec_perm(temp,temp,vec_lvsl(0,&af));
	return vec_splat(temp,0);
	//__vec_float temp; temp.s[0]=a; temp.s[1]=a; temp.s[2]=a; temp.s[3]=a; return temp.v;
}
inline vec_int set_vec( long int a) {
	int ai;
	ai = (int)a;		// truncated to 32 bits: the vector holds ints
	vector signed int temp; temp = vec_lde(0,&ai);
	temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
	return vec_splat(temp,0);
	//__vec_int temp; int al = int(a); temp.s[0]=al; temp.s[1]=al; temp.s[2]=al; temp.s[3]=al; return temp.v;
}
inline vec_int set_vec( int a) {
	int ai;
	ai = a;
	vector signed int temp; temp = vec_lde(0,&ai);
	temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
	return vec_splat(temp,0);
	//__vec_int temp; temp.s[0]=a; temp.s[1]=a; temp.s[2]=a; temp.s[3]=a; return temp.v;
}
inline vec_int set_vec( short a) {
	int ai;
	ai = (int)a;		// widened to the 32-bit element type
	vector signed int temp; temp = vec_lde(0,&ai);
	temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
	return vec_splat(temp,0);
	//__vec_int temp; int as = int(a); temp.s[0]=as; temp.s[1]=as; temp.s[2]=as; temp.s[3]=as; return temp.v;
}
882
883 inline vec_float set_vec( double a, double b, double c, double d) { __vec_float temp; temp.s[0]=float(a); temp.s[1]=float(b); temp.s[2]=float(c); temp.s[3]=float(d); return temp.v; }
884 inline vec_float set_vec( float a, float b, float c, float d) { __vec_float temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
885 inline vec_int set_vec( int a, int b, int c, int d) { __vec_int temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
886 inline vec_int set_vec( long int a, long int b, long int c, long int d) { __vec_int temp; temp.s[0]=int(a); temp.s[1]=int(b); temp.s[2]=int(c); temp.s[3]=int(d); return temp.v; }
887 inline vec_int set_vec( short a, short b, short c, short d) { __vec_int temp; temp.s[0]=short(a); temp.s[1]=short(b); temp.s[2]=short(c); temp.s[3]=short(d); return temp.v; }
888
889
// Aligned ( 16-byte ) whole-vector load/store; no unaligned flavour is provided
// ( the commented load_u/store_u stubs were never implemented ).
inline vec_float load_a_vec( float* a) { return vec_ld(0,a); }
//inline vec_float load_u_vec( float* a) { }
inline vec_int load_a_vec( int* a) { return vec_ld(0,a); }
//inline vec_int load_u_vec( int* a) { }

inline void store_a_vec( float* a, vec_float b) { return vec_st(b,0,a); }
//inline void store_u_vec( float* a, vec_float b) { }
inline void store_a_vec( int* a, vec_int b) { return vec_st(b,0,a); }
//inline void store_u_vec( int* a, vec_int b) { }


// Load a single scalar and align it into element 0 via the lvsl permute;
// store_scal splats element 0 first so vec_ste writes the scalar back
// whatever the address's alignment within the quadword.
inline vec_float load_scal(float* a) { vector float temp; temp = vec_lde(0,a); return vec_perm(temp,temp,vec_lvsl(0,a)); }
inline vec_int load_scal(int* a) { vector signed int temp; temp = vec_lde(0,a); return vec_perm(temp,temp,vec_lvsl(0,a)); }
inline void store_scal(float* a, vec_float content) { vec_float temp = vec_splat(content,0); return vec_ste(temp,0,a); }
inline void store_scal(int* a, vec_int content) { vec_int temp = vec_splat(content,0); return vec_ste(temp,0,a); }
905
906
907
908
// Lane shuffles used by the code generator for recursive ( delayed ) signals.
// NOTE(review): the exact lane layouts depend on AltiVec's big-endian element
// order; kept exactly as written — verify against the SSE2 REC* counterparts
// before changing anything here.
inline vec_float REC0(vec_float a) { return vec_mergeh(a,a); }
inline vec_float REC1(vec_float a) { return vec_mergeh(a,a); }
inline vec_float REC2(vec_float a) { return vec_sld(vec_splat(a,0),a,12); }
inline vec_float REC3(vec_float a) {
	vector float temp1 = vec_mergel(a,a);
	vector float temp2 = vec_mergeh(a,a);
	return vec_mergel(vec_mergel(temp1,temp2),vec_mergeh(temp1,temp2));
}

// integer versions: same shuffles on the int element type
inline vec_int REC0(vec_int a) { return vec_mergeh(a,a); }
inline vec_int REC1(vec_int a) { return vec_mergeh(a,a); }
inline vec_int REC2(vec_int a) { return vec_sld(vec_splat(a,0),a,12); }
inline vec_int REC3(vec_int a) {
	vector signed int temp1 = vec_mergel(a,a);
	vector signed int temp2 = vec_mergeh(a,a);
	return vec_mergel(vec_mergel(temp1,temp2),vec_mergeh(temp1,temp2));
}
926
927
// scalar to vector: takes 4 vectors whose element 0 each holds a scalar value
// and rebuilds one full vector from those 4 scalars
// ( mergeh interleaves the element-0 pairs; the vec_lvsl(8,0) permute pattern
//   then gathers the four values into one register )
inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) {
	return vec_perm(vec_mergeh(a0,a1),vec_mergeh(a2,a3),vec_lvsl(8,(float*)(0)));
	//return vec_perm(vec_perm(a0,a1,VEC_PERM0),vec_perm(a2,a3,VEC_PERM0),VEC_PERM1);
}
inline vec_int SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3) {
	return vec_perm(vec_mergeh(a0,a1),vec_mergeh(a2,a3),vec_lvsl(8,(int*)(0)));
	//return vec_perm(vec_perm(a0,a1,VEC_PERM0),vec_perm(a2,a3,VEC_PERM0),VEC_PERM1);
}

// plain-scalar overloads: go through the overlay union in memory
inline vec_float SCAL2VEC(float a0, float a1, float a2, float a3) { __vec_float temp; temp.s[0]=a0; temp.s[1]=a1; temp.s[2]=a2; temp.s[3]=a3; return temp.v; }
inline vec_int SCAL2VEC(int a0, int a1, int a2, int a3) { __vec_int temp; temp.s[0]=a0; temp.s[1]=a1; temp.s[2]=a2; temp.s[3]=a3; return temp.v; }
940
941
// vector to scalar: build a "scalar vector" from one element of the initial vector
// ( VEC2SCALVEC0 deliberately returns the input unchanged — only element 0 is
//   meaningful downstream, so no splat is needed )
inline vec_float VEC2SCALVEC0(vec_float a) { return a; } // return x,x,x,a0 // vec_splat(a,0) would return a0,a0,a0,a0
inline vec_float VEC2SCALVEC1(vec_float a) { return vec_splat(a,1); } // return a1,a1,a1,a1
inline vec_float VEC2SCALVEC2(vec_float a) { return vec_splat(a,2); } // return a2,a2,a2,a2
inline vec_float VEC2SCALVEC3(vec_float a) { return vec_splat(a,3); } // return a3,a3,a3,a3

inline vec_int VEC2SCALVEC0(vec_int a) { return a; } // return x,x,x,a0 // vec_splat(a,0) would return a0,a0,a0,a0
inline vec_int VEC2SCALVEC1(vec_int a) { return vec_splat(a,1); } // return a1,a1,a1,a1
inline vec_int VEC2SCALVEC2(vec_int a) { return vec_splat(a,2); } // return a2,a2,a2,a2
inline vec_int VEC2SCALVEC3(vec_int a) { return vec_splat(a,3); } // return a3,a3,a3,a3


// vector to scalar: extract one element into an ordinary C scalar
// ( splat the wanted element everywhere, then vec_ste one copy to an
//   aligned stack slot )
inline float VEC2SCAL0(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,0),0,&temp); return temp; }
inline float VEC2SCAL1(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,1),0,&temp); return temp; }
inline float VEC2SCAL2(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,2),0,&temp); return temp; }
inline float VEC2SCAL3(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,3),0,&temp); return temp; }

inline int VEC2SCAL0(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,0),0,&temp); return temp; }
inline int VEC2SCAL1(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,1),0,&temp); return temp; }
inline int VEC2SCAL2(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,2),0,&temp); return temp; }
inline int VEC2SCAL3(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,3),0,&temp); return temp; }
964
965
966
// select: per element, if( select == 0 ) then a ; else b ;
// ( vec_sel(a,b,mask) takes bits from b where the mask bits are 1 — "select"
//   is expected to be a comparison mask, all-ones or all-zero per lane )
inline vec_float select_vec( vec_float select, vec_float a, vec_float b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_float select_scal( vec_float select, vec_float a, vec_float b) {return vec_sel(a,b,(vector bool int)select ); }
inline vec_float select_vec( vec_int select, vec_float a, vec_float b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_float select_scal( vec_int select, vec_float a, vec_float b) {return vec_sel(a,b,(vector bool int)select ); }
inline vec_int select_vec( vec_int select, vec_int a, vec_int b) { return vec_sel(a,b,(vector bool int)select ); }
inline vec_int select_scal( vec_int select, vec_int a, vec_int b) { return vec_sel(a,b,(vector bool int)select ); }
974
975
976
977
978
// vectorial version of the "mem" Faust key-word: shift the sample stream by
// 1..3 positions, pulling the missing samples from the previous vector b
// ( vec_sld concatenates b:a and extracts 16 bytes at the given byte offset )
// result = { a[2] a[1] a[0] b[3] }
inline vec_float mem1_vec( vec_float a, vec_float b) { return vec_sld(b,a,12); }
inline vec_int mem1_vec( vec_int a, vec_int b) { return vec_sld(b,a,12); }

// result = { a[1] a[0] b[3] b[2] }
inline vec_float mem2_vec( vec_float a, vec_float b) { return vec_sld(b,a,8); }
inline vec_int mem2_vec( vec_int a, vec_int b) { return vec_sld(b,a,8); }

// result = { a[0] b[3] b[2] b[1] }
inline vec_float mem3_vec( vec_float a, vec_float b) { return vec_sld(b,a,4); }
inline vec_int mem3_vec( vec_int a, vec_int b) { return vec_sld(b,a,4); }
991
992
993
994
995
// conversion
// bool2* mask the all-ones/all-zero comparison masks down to 0 / 1 per lane.
// NOTE(review): bool2float ANDs with the integer splat of 1 REINTERPRETED as
// float, so a "true" lane holds the bit pattern 0x00000001 ( a denormal ),
// not 1.0f; likewise bool2int(vec_float) rounds that denormal. This mirrors
// the SSE2 section, but verify the downstream consumers expect bit patterns,
// not numeric 1.0f.
inline vec_float bool2float( vec_float a ) { return vec_and(a,(vec_float)(vec_splat_s32(int(0x00000001)))); }
inline vec_float bool2float( vec_int a ) { return (vec_float)(vec_and(a,vec_splat_s32(int(0x00000001)))); }

inline vec_int bool2int( vec_int a) { return (vec_int)vec_and(a,vec_splat_s32(int(0x00000001))); }
inline vec_int bool2int( vec_float a ) { return (vec_int)vec_round(vec_and(a,(vec_float)(vec_splat_s32(int(0x00000001))))); }

// pure reinterpretation of boolean masks between element types ( no value change )
inline vec_int boolfloat2boolint( vec_float a ) { return (vector signed int)a; }
inline vec_float boolint2boolfloat( vec_int a ) { return (vector float)a; }

// numeric conversions
// NOTE(review): vec_round returns vector float; float->int normally uses
// vec_cts — confirm the (vec_int) cast of a float vector compiles/behaves as
// intended on the target toolchain.
inline vec_int float2int( vec_float a) { return (vec_int)vec_round(a); }
inline int float2int( float a ) { return int(a); }

inline vec_float int2float( vec_int a) { return vec_ctf(a,0); } // signed int -> float, no fractional scaling
inline float int2float( int a ) { return float(a); }
1011
1012
1013 #endif