1/* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
2   Copyright (C) 2007-2020 Free Software Foundation, Inc.
3
4   This file is free software; you can redistribute it and/or modify it under
5   the terms of the GNU General Public License as published by the Free
6   Software Foundation; either version 3 of the License, or (at your option)
7   any later version.
8
9   This file is distributed in the hope that it will be useful, but WITHOUT
10   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12   for more details.
13
14   Under Section 7 of GPL version 3, you are granted additional
15   permissions described in the GCC Runtime Library Exception, version
16   3.1, as published by the Free Software Foundation.
17
18   You should have received a copy of the GNU General Public License and
19   a copy of the GCC Runtime Library Exception along with this program;
20   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
21   <http://www.gnu.org/licenses/>.  */
22
23#ifndef _SI2VMX_H_
24#define _SI2VMX_H_	1
25
26#ifndef __SPU__
27
28#include <stdlib.h>
29#include <vec_types.h>
30
31
/* Default halt action for the spu_hcmpeq and spu_hcmpgt intrinsics.
 * Users can override the action by defining it prior to including this
 * header file.  The default terminates the process.
 */
#ifndef SPU_HALT_ACTION
#define SPU_HALT_ACTION		abort()
#endif

/* Default stop action for the spu_stop intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_STOP_ACTION
#define SPU_STOP_ACTION		abort()
#endif


/* Default action taken when an intrinsic has no VMX mapping.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_UNSUPPORTED_ACTION
#define SPU_UNSUPPORTED_ACTION	abort()
#endif
56
57
58/* Casting intrinsics - from scalar to quadword
59 */
60
61static __inline qword si_from_uchar(unsigned char c) {
62  union {
63    qword q;
64    unsigned char c[16];
65  } x;
66  x.c[3] = c;
67  return (x.q);
68}
69
70static __inline qword si_from_char(signed char c) {
71  union {
72    qword q;
73    signed char c[16];
74  } x;
75  x.c[3] = c;
76  return (x.q);
77}
78
79static __inline qword si_from_ushort(unsigned short s) {
80  union {
81    qword q;
82    unsigned short s[8];
83  } x;
84  x.s[1] = s;
85  return (x.q);
86}
87
88static __inline qword si_from_short(short s) {
89  union {
90    qword q;
91    short s[8];
92  } x;
93  x.s[1] = s;
94  return (x.q);
95}
96
97
98static __inline qword si_from_uint(unsigned int i) {
99  union {
100    qword q;
101    unsigned int i[4];
102  } x;
103  x.i[0] = i;
104  return (x.q);
105}
106
107static __inline qword si_from_int(int i) {
108  union {
109    qword q;
110    int i[4];
111  } x;
112  x.i[0] = i;
113  return (x.q);
114}
115
116static __inline qword si_from_ullong(unsigned long long l) {
117  union {
118    qword q;
119    unsigned long long l[2];
120  } x;
121  x.l[0] = l;
122  return (x.q);
123}
124
125static __inline qword si_from_llong(long long l) {
126  union {
127    qword q;
128    long long l[2];
129  } x;
130  x.l[0] = l;
131  return (x.q);
132}
133
134static __inline qword si_from_float(float f) {
135  union {
136    qword q;
137    float f[4];
138  } x;
139  x.f[0] = f;
140  return (x.q);
141}
142
143static __inline qword si_from_double(double d) {
144  union {
145    qword q;
146    double d[2];
147  } x;
148  x.d[0] = d;
149  return (x.q);
150}
151
152static __inline qword si_from_ptr(void *ptr) {
153  union {
154    qword q;
155    void *p;
156  } x;
157  x.p = ptr;
158  return (x.q);
159}
160
161
162/* Casting intrinsics - from quadword to scalar
163 */
164static __inline unsigned char si_to_uchar(qword q) {
165  union {
166    qword q;
167    unsigned char c[16];
168  } x;
169  x.q = q;
170  return (x.c[3]);
171}
172
173static __inline signed char si_to_char(qword q) {
174  union {
175    qword q;
176    signed char c[16];
177  } x;
178  x.q = q;
179  return (x.c[3]);
180}
181
182static __inline unsigned short si_to_ushort(qword q) {
183  union {
184    qword q;
185    unsigned short s[8];
186  } x;
187  x.q = q;
188  return (x.s[1]);
189}
190
191static __inline short si_to_short(qword q) {
192  union {
193    qword q;
194    short s[8];
195  } x;
196  x.q = q;
197  return (x.s[1]);
198}
199
200static __inline unsigned int si_to_uint(qword q) {
201  union {
202    qword q;
203    unsigned int i[4];
204  } x;
205  x.q = q;
206  return (x.i[0]);
207}
208
209static __inline int si_to_int(qword q) {
210  union {
211    qword q;
212    int i[4];
213  } x;
214  x.q = q;
215  return (x.i[0]);
216}
217
218static __inline unsigned long long si_to_ullong(qword q) {
219  union {
220    qword q;
221    unsigned long long l[2];
222  } x;
223  x.q = q;
224  return (x.l[0]);
225}
226
227static __inline long long si_to_llong(qword q) {
228  union {
229    qword q;
230    long long l[2];
231  } x;
232  x.q = q;
233  return (x.l[0]);
234}
235
236static __inline float si_to_float(qword q) {
237  union {
238    qword q;
239    float f[4];
240  } x;
241  x.q = q;
242  return (x.f[0]);
243}
244
245static __inline double si_to_double(qword q) {
246  union {
247    qword q;
248    double d[2];
249  } x;
250  x.q = q;
251  return (x.d[0]);
252}
253
254static __inline void * si_to_ptr(qword q) {
255  union {
256    qword q;
257    void *p;
258  } x;
259  x.q = q;
260  return (x.p);
261}
262
263
264/* Absolute difference
265 */
266static __inline qword si_absdb(qword a, qword b)
267{
268  vec_uchar16 ac, bc, dc;
269
270  ac = (vec_uchar16)(a);
271  bc = (vec_uchar16)(b);
272  dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
273
274  return ((qword)(dc));
275}
276
/* Add intrinsics.
 * si_a:  modular 32-bit word-wise add.
 * si_ah: modular 16-bit halfword-wise add.
 */
#define si_a(_a, _b)		((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_ah(_a, _b)		((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
282
283static __inline qword si_ai(qword a, int b)
284{
285  return ((qword)(vec_add((vec_int4)(a),
286			  vec_splat((vec_int4)(si_from_int(b)), 0))));
287}
288
289
290static __inline qword si_ahi(qword a, short b)
291{
292  return ((qword)(vec_add((vec_short8)(a),
293			  vec_splat((vec_short8)(si_from_short(b)), 1))));
294}
295
296
/* Single-precision float add.  */
#define si_fa(_a, _b)	((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
298
299
300static __inline qword si_dfa(qword a, qword b)
301{
302  union {
303    vec_double2 v;
304    double d[2];
305  } ad, bd, dd;
306
307  ad.v = (vec_double2)(a);
308  bd.v = (vec_double2)(b);
309  dd.d[0] = ad.d[0] + bd.d[0];
310  dd.d[1] = ad.d[1] + bd.d[1];
311
312  return ((qword)(dd.v));
313}
314
/* Add word extended: a + b + carry-in, where the carry-in is the
 * least-significant bit of each word of _c.
 */
#define si_addx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), 	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
319
320
/* Bit-wise AND of two quadwords.
 */
#define si_and(_a, _b)		((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
324
325
326static __inline qword si_andbi(qword a, signed char b)
327{
328  return ((qword)(vec_and((vec_char16)(a),
329			  vec_splat((vec_char16)(si_from_char(b)), 3))));
330}
331
332static __inline qword si_andhi(qword a, signed short b)
333{
334  return ((qword)(vec_and((vec_short8)(a),
335			  vec_splat((vec_short8)(si_from_short(b)), 1))));
336}
337
338
339static __inline qword si_andi(qword a, signed int b)
340{
341  return ((qword)(vec_and((vec_int4)(a),
342			  vec_splat((vec_int4)(si_from_int(b)), 0))));
343}
344
345
/* Bit-wise AND with complement: a & ~b.
 */
#define si_andc(_a, _b)		((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))
349
350
/* Per-byte average of two vectors (vec_avg rounds the result).
 */
#define si_avgb(_a, _b)		((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))
354
355
/* Branch indirect and set link on external data.
 * These have no VMX equivalent and expand to nothing.
 */
#define si_bisled(_func)	/* not mappable */
#define si_bisledd(_func)	/* not mappable */
#define si_bislede(_func)	/* not mappable */
361
362
/* Borrow generate.
 * si_bg:  per-word borrow bit of a - b.
 * si_bgx: borrow generate extended; the borrow-in is the
 *         least-significant bit of each word of _c.
 */
#define si_bg(_a, _b)		((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))

#define si_bgx(_a, _b, _c)	((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),		\
							vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), 	\
								(vec_uint4)(_c))), vec_splat_u32(1))))
370
371/* Compare absolute equal
372 */
373static __inline qword si_fcmeq(qword a, qword b)
374{
375  vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
376
377  return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
378				  vec_andc((vec_float4)(b), msb))));
379}
380
/* Double-precision "compare absolute equal": each doubleword of the
   result is all ones when |a| == |b| and a is not a NaN, else zero.
   VMX has no 64-bit compares, so each doubleword result is assembled
   from word-wise operations on its two 32-bit halves.  */
static __inline qword si_dfcmeq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 result;

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift amount for vec_slo: 4 bytes (32 bits), expressed in bits.  */
  x.i[3] = 4 << 3;

  /*  Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /*  A)  Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /*
      B)  Check if a is NaN, store in high word

      B1) If the high word is greater than max_exp (indicates a NaN)
      B2) If the low word is greater than 0
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /*  B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /*  result = A and not B  */
  result = vec_andc(biteq, anan);

  /*  Promote high words to 64 bits and return  */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
431
432
433/* Compare absolute greater than
434 */
435static __inline qword si_fcmgt(qword a, qword b)
436{
437  vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
438
439  return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
440				  vec_andc((vec_float4)(b), msb))));
441}
442
/* Double-precision "compare absolute greater than": each doubleword of
   the result is all ones when |a| > |b| and neither input is a NaN.
   Built from word-wise VMX compares; the low-word compare is only used
   when the high words are equal.  */
static __inline qword si_dfcmgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift amount for vec_slo: 4 bytes (32 bits), expressed in bits.  */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // A) Check if the exponents are different
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  //  If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
490
491
492/* Compare equal
493 */
494static __inline qword si_ceqb(qword a, qword b)
495{
496  return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
497}
498
499static __inline qword si_ceqh(qword a, qword b)
500{
501  return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
502}
503
504static __inline qword si_ceq(qword a, qword b)
505{
506  return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
507}
508
509static __inline qword si_fceq(qword a, qword b)
510{
511  return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
512}
513
514static __inline qword si_ceqbi(qword a, signed char b)
515{
516  return ((qword)(vec_cmpeq((vec_char16)(a),
517			    vec_splat((vec_char16)(si_from_char(b)), 3))));
518}
519
520static __inline qword si_ceqhi(qword a, signed short b)
521{
522  return ((qword)(vec_cmpeq((vec_short8)(a),
523			  vec_splat((vec_short8)(si_from_short(b)), 1))));
524}
525
526static __inline qword si_ceqi(qword a, signed int b)
527{
528  return ((qword)(vec_cmpeq((vec_int4)(a),
529			  vec_splat((vec_int4)(si_from_int(b)), 0))));
530}
531
/* Double-precision compare equal: each doubleword of the result is all
   ones when a == b (including +0 == -0) and a is not a NaN.  Assembled
   from word-wise VMX operations on the 32-bit halves.  */
static __inline qword si_dfceq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 iszero;
  vec_uint4 result;

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift amount for vec_slo: 4 bytes (32 bits), expressed in bits.  */
  x.i[3] = 4 << 3;

  /*  A)  Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /*  Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /*
      B)  Check if a is NaN, store in high word

      B1) If the high word is greater than max_exp (indicates a NaN)
      B2) If the low word is greater than 0
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /*  B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /*  C)  Check for 0 = -0 special case */
  iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
  iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));

  /*  result = (A or C) and not B  */
  result = vec_or(biteq,iszero);
  result = vec_andc(result, anan);

  /*  Promote high words to 64 bits and return  */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
588
589
590/* Compare greater than
591 */
592static __inline qword si_cgtb(qword a, qword b)
593{
594  return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
595}
596
597static __inline qword si_cgth(qword a, qword b)
598{
599  return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
600}
601
602static __inline qword si_cgt(qword a, qword b)
603{
604  return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
605}
606
607static __inline qword si_clgtb(qword a, qword b)
608{
609  return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
610}
611
612static __inline qword si_clgth(qword a, qword b)
613{
614  return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
615}
616
617static __inline qword si_clgt(qword a, qword b)
618{
619  return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
620}
621
622static __inline qword si_fcgt(qword a, qword b)
623{
624  return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
625}
626
/* Double-precision compare greater than (signed), built from word-wise
   VMX operations.  Negative inputs are remapped (two's complement of
   the magnitude) so that plain integer word compares give the correct
   ordering; a low-word tie-break handles equal high words, and NaN
   inputs force a false result.  */
static __inline qword si_dfcgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift amount for vec_slo: 4 bytes (32 bits), expressed in bits.  */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // sign of a
  vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);

  // sign of b
  vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);

  // negative a
  vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
  vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
  abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);

  // negative b
  vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
  bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);

  // A) Check if the exponents are different
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  //  If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
700
701static __inline qword si_cgtbi(qword a, signed char b)
702{
703  return ((qword)(vec_cmpgt((vec_char16)(a),
704			    vec_splat((vec_char16)(si_from_char(b)), 3))));
705}
706
707static __inline qword si_cgthi(qword a, signed short b)
708{
709  return ((qword)(vec_cmpgt((vec_short8)(a),
710			    vec_splat((vec_short8)(si_from_short(b)), 1))));
711}
712
713static __inline qword si_cgti(qword a, signed int b)
714{
715  return ((qword)(vec_cmpgt((vec_int4)(a),
716			    vec_splat((vec_int4)(si_from_int(b)), 0))));
717}
718
719static __inline qword si_clgtbi(qword a, unsigned char b)
720{
721  return ((qword)(vec_cmpgt((vec_uchar16)(a),
722			    vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
723}
724
725static __inline qword si_clgthi(qword a, unsigned short b)
726{
727  return ((qword)(vec_cmpgt((vec_ushort8)(a),
728			    vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
729}
730
731static __inline qword si_clgti(qword a, unsigned int b)
732{
733  return ((qword)(vec_cmpgt((vec_uint4)(a),
734			    vec_splat((vec_uint4)(si_from_uint(b)), 0))));
735}
736
/* Test special values: classify each double element of a according to
   the flag bits in b (0x40 NaN, 0x20 +inf, 0x10 -inf, 0x8 +0, 0x4 -0,
   0x2 +denorm, 0x1 -denorm).  Each result doubleword is all ones when
   the element matches any requested class, else zero.  */
static __inline qword si_dftsv(qword a, char b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 result = (vec_uint4){0};
  vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
  vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift amount for vec_slo: 4 bytes (32 bits), expressed in bits.  */
  x.i[3] = 4 << 3;

  /* Nan or +inf or -inf  */
  if (b & 0x70)
  {
    vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
    vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
     /* NaN  */
     if (b & 0x40)
     {
       vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
       a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
       a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
       result = vec_or(result, a_nan);
     }
     /* inf  */
     if (b & 0x30)
     {
       a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
       a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
        /* +inf  */
        if (b & 0x20)
          result = vec_or(vec_andc(a_inf, sign), result);
        /* -inf  */
        if (b & 0x10)
          result = vec_or(vec_and(a_inf, sign), result);
     }
  }
  /* 0 or denorm  */
  if (b & 0xF)
  {
    vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
    iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
    /* denorm  */
    if (b & 0x3)
    {
      vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
      vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
      isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
      /* +denorm  */
     if (b & 0x2)
        result = vec_or(vec_andc(isdenorm, sign), result);
      /* -denorm  */
     if (b & 0x1)
        result = vec_or(vec_and(isdenorm, sign), result);
    }
    /* 0  */
    if (b & 0xC)
    {
      iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
      /* +0  */
     if (b & 0x8)
        result = vec_or(vec_andc(iszero, sign), result);
      /* -0  */
     if (b & 0x4)
        result = vec_or(vec_and(iszero, sign), result);
    }
  }
  return ((qword)result);
}
812
813
/* Carry generate.
 * si_cg:  per-word carry-out bit of a + b.
 * si_cgx: carry generate extended; the carry-in is the
 *         least-significant bit of each word of _c.
 */
#define si_cg(_a, _b)		((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))

#define si_cgx(_a, _b, _c)	((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), 		\
						vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),	\
							 vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
821
822
823/* Count ones for bytes
824 */
825static __inline qword si_cntb(qword a)
826{
827  vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
828  vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
829  vec_uchar16 av;
830
831  av = (vec_uchar16)(a);
832
833  return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
834			  vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
835}
836
/* Count leading zeros of each word
 */
/* Count leading zeros of each 32-bit word.  Per-byte leading-zero
   counts come from a nibble lookup table; a byte's count extends into
   the next byte only when the byte is all zero (count == 8), and the
   final shift right by 24 leaves each word's total in its low byte.  */
static __inline qword si_clz(qword a)
{
  vec_uchar16 av;
  vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
  vec_uchar16 four    = vec_splat_u8(4);
  vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 eight   = vec_splat_u8(8);
  vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
  vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};

  av = (vec_uchar16)(a);

  /* Leading-zero count of each nibble; low-nibble count only matters
     when the high nibble is zero (its count is 4).  */
  cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
  cnt_lo = vec_perm(nib_cnt, nib_cnt, av);

  cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));

  /* Bring each following byte's count alongside its predecessor.  */
  tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
  tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
  tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));

  cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
  cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
  cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));

  return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
}
866
/* Convert to float (unsigned/signed fixed-point, scale immediate _b).
 */
#define si_cuflt(_a, _b)	((qword)(vec_ctf((vec_uint4)(_a), _b)))
#define si_csflt(_a, _b)	((qword)(vec_ctf((vec_int4)(_a), _b)))

/* Convert float to signed fixed-point int (scale immediate _b).
 */
#define si_cflts(_a, _b)	((qword)(vec_cts((vec_float4)(_a), _b)))

/* Convert float to unsigned fixed-point int (scale immediate _b).
 */
#define si_cfltu(_a, _b)	((qword)(vec_ctu((vec_float4)(_a), _b)))
879
/* Synchronize.  No mapping is provided on the PPU side; these expand
 * to nothing.
 */
#define si_dsync()		/* do nothing */
#define si_sync()		/* do nothing */
#define si_syncc()		/* do nothing */
885
886
887/* Equivalence
888 */
889static __inline qword si_eqv(qword a, qword b)
890{
891  vec_uchar16 d;
892
893  d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
894  return ((qword)(vec_nor(d, d)));
895}
896
897/* Extend
898 */
/* Sign-extend the odd-indexed bytes of a to halfwords: gather bytes
   1,3,...,15 into the first eight byte slots, then vec_unpackh
   sign-extends them to 16-bit elements.  */
static __inline qword si_xsbh(qword a)
{
  vec_char16 av;

  av = (vec_char16)(a);
  return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
						              0, 0, 0, 0, 0, 0, 0, 0})))));
}
907
/* Sign-extend the low halfword of each word of a to a full word:
   gather bytes 2-3, 6-7, 10-11, 14-15 into the first four halfword
   slots, then vec_unpackh sign-extends them to 32-bit elements.  */
static __inline qword si_xshw(qword a)
{
  vec_short8 av;

  av = (vec_short8)(a);
  return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
					                      10,11,14,15,
							      0, 0, 0, 0,
						              0, 0, 0, 0})))));
}
918
/* Sign-extend words 1 and 3 of a to doublewords: the high half of each
   result doubleword is taken from the arithmetic shift-right-by-31
   copy (all sign bits), the low half from the original word.  */
static __inline qword si_xswd(qword a)
{
  vec_int4 av;

  av = (vec_int4)(a);
  return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
			   ((vec_uchar16){20, 21, 22, 23,
					   4,  5,  6,  7,
				          28, 29, 30, 31,
				          12, 13, 14, 15}))));
}
930
931static __inline qword si_fesd(qword a)
932{
933  union {
934    double d[2];
935    vec_double2	vd;
936  } out;
937  union {
938    float f[4];
939    vec_float4 vf;
940  } in;
941
942  in.vf = (vec_float4)(a);
943  out.d[0] = (double)(in.f[0]);
944  out.d[1] = (double)(in.f[2]);
945  return ((qword)(out.vd));
946}
947
948/* Gather
949 */
/* Gather bits from bytes: collect the least-significant bit of each of
   the 16 bytes of a into a 16-bit value.  Each byte's lsb is shifted
   to its position, partial sums are built with vec_sum4s/vec_sum2s,
   and the permute assembles the final mask in the preferred word.  */
static __inline qword si_gbb(qword a)
{
  vec_uchar16 bits;
  vec_uint4   bytes;

  bits  = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
								            7, 6, 5, 4, 3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));

  return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
					                0, 0, 0, 0, 0, 0, 0, 0}))));
}
962
963
/* Gather bits from halfwords: collect the least-significant bit of
   each of the 8 halfwords of a into an 8-bit value, summed into the
   last element and rotated into the preferred word with vec_sld.  */
static __inline qword si_gbh(qword a)
{
  vec_ushort8 bits;
  vec_uint4   bytes;

  bits  = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));

  bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});

  return ((qword)(vec_sld(bytes, bytes, 12)));
}
975
/* Gather bits from words: collect the least-significant bit of each of
   the 4 words of a into a 4-bit value, summed into the last element
   and rotated into the preferred word with vec_sld.  */
static __inline qword si_gb(qword a)
{
  vec_uint4 bits;
  vec_uint4 bytes;

  bits  = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
  return ((qword)(vec_sld(bytes, bytes, 12)));
}
985
986
987/* Compare and halt
988 */
989static __inline void si_heq(qword a, qword b)
990{
991  union {
992    vector unsigned int v;
993    unsigned int i[4];
994  } aa, bb;
995
996  aa.v = (vector unsigned int)(a);
997  bb.v = (vector unsigned int)(b);
998
999  if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
1000}
1001
1002static __inline void si_heqi(qword a, unsigned int b)
1003{
1004  union {
1005    vector unsigned int v;
1006    unsigned int i[4];
1007  } aa;
1008
1009  aa.v = (vector unsigned int)(a);
1010
1011  if (aa.i[0] == b) { SPU_HALT_ACTION; };
1012}
1013
1014static __inline void si_hgt(qword a, qword b)
1015{
1016  union {
1017    vector signed int v;
1018    signed int i[4];
1019  } aa, bb;
1020
1021  aa.v = (vector signed int)(a);
1022  bb.v = (vector signed int)(b);
1023
1024  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1025}
1026
1027static __inline void si_hgti(qword a, signed int b)
1028{
1029  union {
1030    vector signed int v;
1031    signed int i[4];
1032  } aa;
1033
1034  aa.v = (vector signed int)(a);
1035
1036  if (aa.i[0] > b) { SPU_HALT_ACTION; };
1037}
1038
1039static __inline void si_hlgt(qword a, qword b)
1040{
1041  union {
1042    vector unsigned int v;
1043    unsigned int i[4];
1044  } aa, bb;
1045
1046  aa.v = (vector unsigned int)(a);
1047  bb.v = (vector unsigned int)(b);
1048
1049  if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1050}
1051
1052static __inline void si_hlgti(qword a, unsigned int b)
1053{
1054  union {
1055    vector unsigned int v;
1056    unsigned int i[4];
1057  } aa;
1058
1059  aa.v = (vector unsigned int)(a);
1060
1061  if (aa.i[0] > b) { SPU_HALT_ACTION; };
1062}
1063
1064
1065/* Multiply and Add
1066 */
1067static __inline qword si_mpya(qword a, qword b, qword c)
1068{
1069  return ((qword)(vec_msum(vec_and((vec_short8)(a),
1070				   ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
1071			   (vec_short8)(b), (vec_int4)(c))));
1072}
1073
1074static __inline qword si_fma(qword a, qword b, qword c)
1075{
1076  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1077}
1078
1079static __inline qword si_dfma(qword a, qword b, qword c)
1080{
1081  union {
1082    vec_double2 v;
1083    double d[2];
1084  } aa, bb, cc, dd;
1085
1086  aa.v = (vec_double2)(a);
1087  bb.v = (vec_double2)(b);
1088  cc.v = (vec_double2)(c);
1089  dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
1090  dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
1091  return ((qword)(dd.v));
1092}
1093
/* Form Mask.
 * si_fsmbi: form a byte select mask from a 16-bit immediate (one
 * result byte per mask bit); delegates to si_fsmb.
 */
#define si_fsmbi(_a)	si_fsmb(si_from_int(_a))
1097
/* Form select mask for bytes: bit i of the preferred halfword of a
   expands to byte i of the result (0xFF when set, 0x00 when clear).
   Each source bit is moved to a byte's msb by the shift, then smeared
   across the byte by the arithmetic shift right.  */
static __inline qword si_fsmb(qword a)
{
  vec_char16 mask;
  vec_ushort8 in;

  in = (vec_ushort8)(a);
  mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
					              3, 3, 3, 3, 3, 3, 3, 3})));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
				                      0, 1, 2, 3, 4, 5, 6, 7})),
			  vec_splat_u8(7))));
}
1110
1111
1112static __inline qword si_fsmh(qword a)
1113{
1114  vec_uchar16 in;
1115  vec_short8 mask;
1116
1117  in = (vec_uchar16)(a);
1118  mask = (vec_short8)(vec_splat(in, 3));
1119  return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
1120			  vec_splat_u16(15))));
1121}
1122
1123static __inline qword si_fsm(qword a)
1124{
1125  vec_uchar16 in;
1126  vec_int4 mask;
1127
1128  in = (vec_uchar16)(a);
1129  mask = (vec_int4)(vec_splat(in, 3));
1130  return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
1131			  ((vec_uint4){31,31,31,31}))));
1132}
1133
/* Move from/to registers
 */
/* The SPU FP status/control register and special-purpose registers
   are not modelled here: reads yield a zero quadword and writes
   expand to nothing (the value is discarded).  */
#define si_fscrrd()		((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg)		((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
1141
1142/* Multiply High High Add
1143 */
1144static __inline qword si_mpyhha(qword a, qword b, qword c)
1145{
1146  return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
1147}
1148
1149static __inline qword si_mpyhhau(qword a, qword b, qword c)
1150{
1151  return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
1152}
1153
1154/* Multiply Subtract
1155 */
1156static __inline qword si_fms(qword a, qword b, qword c)
1157{
1158  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
1159			   vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
1160}
1161
1162static __inline qword si_dfms(qword a, qword b, qword c)
1163{
1164  union {
1165    vec_double2 v;
1166    double d[2];
1167  } aa, bb, cc, dd;
1168
1169  aa.v = (vec_double2)(a);
1170  bb.v = (vec_double2)(b);
1171  cc.v = (vec_double2)(c);
1172  dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
1173  dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
1174  return ((qword)(dd.v));
1175}
1176
1177/* Multiply
1178 */
1179static __inline qword si_fm(qword a, qword b)
1180{
1181  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
1182}
1183
1184static __inline qword si_dfm(qword a, qword b)
1185{
1186  union {
1187    vec_double2 v;
1188    double d[2];
1189  } aa, bb, dd;
1190
1191  aa.v = (vec_double2)(a);
1192  bb.v = (vec_double2)(b);
1193  dd.d[0] = aa.d[0] * bb.d[0];
1194  dd.d[1] = aa.d[1] * bb.d[1];
1195  return ((qword)(dd.v));
1196}
1197
/* Multiply High
 */
/* (high halfword of A) * (low halfword of B), shifted left 16.
   Shifting B left 16 bits moves its odd (low) halfword into the even
   slot so vec_mule pairs it with A's even (high) halfword; the
   32-bit product is then shifted left 16 bits.  */
static __inline qword si_mpyh(qword a, qword b)
{
  vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
}
1206
1207
1208/* Multiply High High
1209 */
1210static __inline qword si_mpyhh(qword a, qword b)
1211{
1212  return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
1213}
1214
1215static __inline qword si_mpyhhu(qword a, qword b)
1216{
1217  return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
1218}
1219
1220/* Multiply Odd
1221 */
1222static __inline qword si_mpy(qword a, qword b)
1223{
1224  return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
1225}
1226
1227static __inline qword si_mpyu(qword a, qword b)
1228{
1229  return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
1230}
1231
1232static __inline qword si_mpyi(qword a, short b)
1233{
1234  return ((qword)(vec_mulo((vec_short8)(a),
1235			   vec_splat((vec_short8)(si_from_short(b)), 1))));
1236}
1237
1238static __inline qword si_mpyui(qword a, unsigned short b)
1239{
1240  return ((qword)(vec_mulo((vec_ushort8)(a),
1241			   vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1242}
1243
/* Multiply and Shift Right
 */
/* Signed odd (low) halfword product of A and B, arithmetically
   shifted right 16 bits so the high halfword of each product
   remains.  */
static __inline qword si_mpys(qword a, qword b)
{
  return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
}
1250
1251/* Nand
1252 */
1253static __inline qword si_nand(qword a, qword b)
1254{
1255  vec_uchar16 d;
1256
1257  d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
1258  return ((qword)(vec_nor(d, d)));
1259}
1260
1261/* Negative Multiply Add
1262 */
1263static __inline qword si_dfnma(qword a, qword b, qword c)
1264{
1265  union {
1266    vec_double2 v;
1267    double d[2];
1268  } aa, bb, cc, dd;
1269
1270  aa.v = (vec_double2)(a);
1271  bb.v = (vec_double2)(b);
1272  cc.v = (vec_double2)(c);
1273  dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
1274  dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
1275  return ((qword)(dd.v));
1276}
1277
1278/* Negative Multiply and Subtract
1279 */
1280static __inline qword si_fnms(qword a, qword b, qword c)
1281{
1282  return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1283}
1284
1285static __inline qword si_dfnms(qword a, qword b, qword c)
1286{
1287  union {
1288    vec_double2 v;
1289    double d[2];
1290  } aa, bb, cc, dd;
1291
1292  aa.v = (vec_double2)(a);
1293  bb.v = (vec_double2)(b);
1294  cc.v = (vec_double2)(c);
1295  dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
1296  dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
1297  return ((qword)(dd.v));
1298}
1299
1300/* Nor
1301 */
1302static __inline qword si_nor(qword a, qword b)
1303{
1304  return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
1305}
1306
1307/* Or
1308 */
1309static __inline qword si_or(qword a, qword b)
1310{
1311  return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
1312}
1313
1314static __inline qword si_orbi(qword a, unsigned char b)
1315{
1316  return ((qword)(vec_or((vec_uchar16)(a),
1317			 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1318}
1319
1320static __inline qword si_orhi(qword a, unsigned short b)
1321{
1322  return ((qword)(vec_or((vec_ushort8)(a),
1323			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1324}
1325
1326static __inline qword si_ori(qword a, unsigned int b)
1327{
1328  return ((qword)(vec_or((vec_uint4)(a),
1329			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1330}
1331
1332/* Or Complement
1333 */
1334static __inline qword si_orc(qword a, qword b)
1335{
1336  return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
1337}
1338
1339
/* Or Across
 */
/* OR the four words of A together.  Two fold steps (shift-double by
   8 and then 4 bytes) leave the combined word in every slot; the
   final AND keeps it in word 0 and zeroes words 1-3.  */
static __inline qword si_orx(qword a)
{
  vec_uchar16 tmp;
  tmp = (vec_uchar16)(a);
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
  return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
				              0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
}
1351
1352
1353/* Estimates
1354 */
1355static __inline qword si_frest(qword a)
1356{
1357  return ((qword)(vec_re((vec_float4)(a))));
1358}
1359
1360static __inline qword si_frsqest(qword a)
1361{
1362  return ((qword)(vec_rsqrte((vec_float4)(a))));
1363}
1364
1365#define si_fi(_a, _d)		(_d)
1366
/* Channel Read and Write
 */
/* SPU channels cannot be mapped here: reads produce a zero quadword
   and writes are discarded.  */
#define si_rdch(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_rchcnt(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_wrch(_channel, _a)		/* not mappable */
1372
1373/* Rotate Left
1374 */
1375static __inline qword si_roth(qword a, qword b)
1376{
1377  return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
1378}
1379
1380static __inline qword si_rot(qword a, qword b)
1381{
1382  return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
1383}
1384
1385static __inline qword si_rothi(qword a, int b)
1386{
1387  return ((qword)(vec_rl((vec_ushort8)(a),
1388			 vec_splat((vec_ushort8)(si_from_int(b)), 1))));
1389}
1390
1391static __inline qword si_roti(qword a, int b)
1392{
1393  return ((qword)(vec_rl((vec_uint4)(a),
1394			 vec_splat((vec_uint4)(si_from_int(b)), 0))));
1395}
1396
/* Rotate Left with Mask
 */
/* si_rothm: logical shift right of each halfword by the negated B.
   VMX vec_sr uses the count modulo 16, so MASK — bit 4 of the negated
   count smeared across the halfword by the << 11 / sra 15 pair — is
   all ones when the count is 16 or greater, and vec_andc then forces
   the result to zero, matching the SPU clamp behavior.  */
static __inline qword si_rothm(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* Word variant of si_rothm: the << 26 / sra 31 pair smears bit 5 of
   the negated count, zeroing the result for counts of 32 or more.  */
static __inline qword si_rotm(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}

/* Immediate form of si_rothm (count negated at the call site).  */
static __inline qword si_rothmi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

/* Immediate form of si_rotm.  */
static __inline qword si_rotmi(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}
1438
1439
/* Rotate Left Algebraic with Mask
 */
/* si_rotmah: arithmetic shift right of each halfword by the negated
   B.  The out-of-range mask (all ones when bit 4 of the negated count
   is set, i.e. count >= 16) is OR'ed into the count, driving it to
   the maximum so vec_sra replicates the sign bit — the saturating
   behavior of the SPU algebraic shifts.  */
static __inline qword si_rotmah(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* Word variant of si_rotmah; mask set for counts of 32 or more.  */
static __inline qword si_rotma(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}


/* Immediate form of si_rotmah (count negated at the call site).  */
static __inline qword si_rotmahi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

/* Immediate form of si_rotma.  */
static __inline qword si_rotmai(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}
1482
1483
1484/* Rotate Left Quadword by Bytes with Mask
1485 */
1486static __inline qword si_rotqmbyi(qword a, int count)
1487{
1488  union {
1489    vec_uchar16 v;
1490    int i[4];
1491  } x;
1492  vec_uchar16 mask;
1493
1494  count = 0 - count;
1495  x.i[3] = count << 3;
1496  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1497
1498  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1499}
1500
1501
/* Shift the quadword A right by the byte count held (negated, per the
   SPU rotate-and-mask convention) in word 0 of COUNT.  */
static __inline qword si_rotqmby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  /* Negate the byte count and convert it to a bit count.  */
  x.v = (vec_uchar16)(count);
  x.i[0] = cnt = (0 - x.i[0]) << 3;

  /* Replicate byte 3 (low-order byte of word 0 on big-endian — TODO
     confirm endian assumption) so vec_sro sees the count.  */
  x.v = vec_splat(x.v, 3);
  /* 128 bits (16 bytes) or more shifts everything out.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1519
1520
1521/* Rotate Left Quadword by Bytes
1522 */
1523static __inline qword si_rotqbyi(qword a, int count)
1524{
1525  union {
1526    vec_uchar16 v;
1527    int i[4];
1528  } left, right;
1529
1530  count <<= 3;
1531  left.i[3] = count;
1532  right.i[3] = 0 - count;
1533  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
1534}
1535
/* Rotate A left by the byte count in byte 3 of COUNT.  LEFT holds the
   count converted to a bit count (<< 3 in every byte); RIGHT is its
   negation, so OR-ing the left and right octet shifts forms the
   rotate.  */
static __inline qword si_rotqby(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  right = vec_sub(vec_splat_u8(0), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
1544
/* Rotate Left Quadword by Bytes Bit Count
 */
/* Rotate A left by the whole bytes of the bit count in byte 3 of
   COUNT (vec_slo reads only bits 121:124, i.e. count/8 bytes).
   RIGHT = 7 - LEFT places the complementary (16 - n) byte count in
   those same bits, so OR-ing the two octet shifts forms the rotate.  */
static __inline qword si_rotqbybi(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_splat((vec_uchar16)(count), 3);
  right = vec_sub(vec_splat_u8(7), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
1555
1556
/* Rotate Left Quadword by Bytes Bit Count
 */
/* Rotate A left by 0-7 bits.  Y recovers the wrap-around bits: A is
   first shifted right 120 bits (vec_sro) so its leading byte lands in
   the final byte position, then shifted right (8 - count) bits so the
   COUNT high-order bits of A become the low-order bits of the result;
   OR-ing with the 128-bit left shift (vec_sll) completes the rotate.  */
static __inline qword si_rotqbii(qword a, int count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}

/* Register form of si_rotqbii: the 3-bit count comes from byte 3 of
   COUNT.  */
static __inline qword si_rotqbi(qword a, qword count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;

  x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));

  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}
1583
1584
1585/* Rotate Left Quadword and Mask by Bits
1586 */
1587static __inline qword si_rotqmbii(qword a, int count)
1588{
1589  return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
1590}
1591
1592static __inline qword si_rotqmbi(qword a, qword count)
1593{
1594  return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
1595}
1596
1597
/* Rotate Left Quadword and Mask by Bytes with Bit Count
 */
/* Shift A right by whole bytes of the (negated, per the SPU
   rotate-and-mask convention) bit count in word 0 of COUNT; the low
   3 bits of the count are discarded.  */
static __inline qword si_rotqmbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  /* Round the bit count down to a whole byte count and negate it.  */
  x.i[0] = cnt = 0 - (x.i[0] & ~7);
  x.v = vec_splat(x.v, 3);
  /* 128 bits (16 bytes) or more shifts everything out.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1616
1617
1618
1619
1620/* Round Double to Float
1621 */
1622static __inline qword si_frds(qword a)
1623{
1624  union {
1625    vec_float4 v;
1626    float f[4];
1627  } d;
1628  union {
1629    vec_double2 v;
1630    double d[2];
1631  } in;
1632
1633  in.v = (vec_double2)(a);
1634  d.v = (vec_float4){0.0f};
1635  d.f[0] = (float)in.d[0];
1636  d.f[2] = (float)in.d[1];
1637
1638  return ((qword)(d.v));
1639}
1640
1641/* Select Bits
1642 */
1643static __inline qword si_selb(qword a, qword b, qword c)
1644{
1645  return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
1646}
1647
1648
/* Shuffle Bytes
 */
/* vec_perm(a, b, pattern) handles the ordinary byte selectors.  The
   SPU special selector codes (pattern byte with the sign bit set,
   producing 0x00, 0xFF or 0x80) are handled by a second permute:
   PAT is the identity for normal bytes, but for sign-set bytes it is
   pattern >> 3, which indexes the constant vector of special values
   supplied as the second permute operand (0x80-0xBF -> 0x00,
   0xC0-0xDF -> 0xFF, 0xE0-0xFF -> 0x80).  */
static __inline qword si_shufb(qword a, qword b, qword pattern)
{
  vec_uchar16 pat;

  pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
		vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
		vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
  return ((qword)(vec_perm(vec_perm(a, b, pattern),
			   ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
				          0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
			   pat)));
}
1663
1664
/* Shift Left
 */
/* Shift each halfword of A left by B.  vec_sl uses the count modulo
   16, so MASK (bit 4 of the count smeared across the halfword by the
   << 11 / sra 15 pair) forces the result to zero for counts of 16 or
   more.  */
static __inline qword si_shlh(qword a, qword b)
{
  vec_ushort8 mask;

  mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
}

/* Word variant: the << 26 / sra 31 pair smears bit 5 of the count,
   zeroing the result for counts of 32 or more.  */
static __inline qword si_shl(qword a, qword b)
{
  vec_uint4 mask;

  mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
}


/* Immediate form of si_shlh.  */
static __inline qword si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 mask;
  vec_ushort8 bv;

  bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
}

/* Immediate form of si_shl.  */
static __inline qword si_shli(qword a, unsigned int b)
{
  vec_uint4 bv;
  vec_uint4 mask;

  bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
}
1703
1704
1705/* Shift Left Quadword
1706 */
1707static __inline qword si_shlqbii(qword a, unsigned int count)
1708{
1709  vec_uchar16 x;
1710
1711  x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
1712  return ((qword)(vec_sll((vec_uchar16)(a), x)));
1713}
1714
1715static __inline qword si_shlqbi(qword a, qword count)
1716{
1717  vec_uchar16 x;
1718
1719  x = vec_splat((vec_uchar16)(count), 3);
1720  return ((qword)(vec_sll((vec_uchar16)(a), x)));
1721}
1722
1723
1724/* Shift Left Quadword by Bytes
1725 */
1726static __inline qword si_shlqbyi(qword a, unsigned int count)
1727{
1728  union {
1729    vec_uchar16 v;
1730    int i[4];
1731  } x;
1732  vec_uchar16 mask;
1733
1734  x.i[3] = count << 3;
1735  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1736  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1737}
1738
/* Shift A left by the byte count in byte 3 of COUNT.  */
static __inline qword si_shlqby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  /* Replicate the count byte and convert it to a bit count (<< 3).  */
  x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  cnt = x.i[0];
  /* 128 bits (16 bytes) or more shifts everything out.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
1753
/* Shift Left Quadword by Bytes with Bit Count
 */
/* Shift A left by the whole bytes of the bit count in byte 3 of
   COUNT (vec_slo reads only bits 121:124, i.e. count/8 bytes).  */
static __inline qword si_shlqbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_splat((vec_uchar16)(count), 3);
  cnt = x.i[0];
  /* A bit count of 128 (16 bytes) or more shifts everything out.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
1770
1771
/* Stop and Signal
 */
/* The stop type and signal data cannot be conveyed here; both forms
   simply invoke the user-configurable SPU_STOP_ACTION.  */
#define si_stop(_type)		SPU_STOP_ACTION
#define si_stopd(a, b, c)	SPU_STOP_ACTION
1776
1777
1778/* Subtract
1779 */
1780static __inline qword si_sfh(qword a, qword b)
1781{
1782  return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
1783}
1784
1785static __inline qword si_sf(qword a, qword b)
1786{
1787  return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
1788}
1789
1790static __inline qword si_fs(qword a, qword b)
1791{
1792  return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
1793}
1794
1795static __inline qword si_dfs(qword a, qword b)
1796{
1797  union {
1798    vec_double2 v;
1799    double d[2];
1800  } aa, bb, dd;
1801
1802  aa.v = (vec_double2)(a);
1803  bb.v = (vec_double2)(b);
1804  dd.d[0] = aa.d[0] - bb.d[0];
1805  dd.d[1] = aa.d[1] - bb.d[1];
1806  return ((qword)(dd.v));
1807}
1808
1809static __inline qword si_sfhi(qword a, short b)
1810{
1811  return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
1812			  (vec_short8)(a))));
1813}
1814
1815static __inline qword si_sfi(qword a, int b)
1816{
1817  return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
1818			  (vec_int4)(a))));
1819}
1820
/* Subtract word extended
 */
/* _b + ~_a + (_c & 1): subtract with borrow, where the borrow-in is
   bit 0 of each word of _c.  */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b), 				\
							 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), 	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
1826
1827
/* Sum Bytes into Shorts
 */
/* vec_sum4s adds the four bytes of each word (plus a zero
   accumulator).  The permute then packs the low halfword of each
   32-bit sum so that every result word holds B's byte-sum in its
   high halfword and A's byte-sum in its low halfword.  */
static __inline qword si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;

  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
					                26, 27, 10, 11, 30, 31, 14, 15}))));
}
1841
1842/* Exclusive OR
1843 */
1844static __inline qword si_xor(qword a, qword b)
1845{
1846  return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
1847}
1848
1849static __inline qword si_xorbi(qword a, unsigned char b)
1850{
1851  return ((qword)(vec_xor((vec_uchar16)(a),
1852			  vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1853}
1854
1855static __inline qword si_xorhi(qword a, unsigned short b)
1856{
1857  return ((qword)(vec_xor((vec_ushort8)(a),
1858			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1859}
1860
1861static __inline qword si_xori(qword a, unsigned int b)
1862{
1863  return ((qword)(vec_xor((vec_uint4)(a),
1864			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1865}
1866
1867
1868/* Generate Controls for Sub-Quadword Insertion
1869 */
1870static __inline qword si_cbd(qword a, int imm)
1871{
1872  union {
1873    vec_uint4 v;
1874    unsigned char c[16];
1875  } shmask;
1876
1877  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1878  shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
1879  return ((qword)(shmask.v));
1880}
1881
1882static __inline qword si_cdd(qword a, int imm)
1883{
1884  union {
1885    vec_uint4 v;
1886    unsigned long long ll[2];
1887  } shmask;
1888
1889  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1890  shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
1891  return ((qword)(shmask.v));
1892}
1893
1894static __inline qword si_chd(qword a, int imm)
1895{
1896  union {
1897    vec_uint4 v;
1898    unsigned short s[8];
1899  } shmask;
1900
1901  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1902  shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
1903  return ((qword)(shmask.v));
1904}
1905
1906static __inline qword si_cwd(qword a, int imm)
1907{
1908  union {
1909    vec_uint4 v;
1910    unsigned int i[4];
1911  } shmask;
1912
1913  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1914  shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
1915  return ((qword)(shmask.v));
1916}
1917
1918static __inline qword si_cbx(qword a, qword b)
1919{
1920  union {
1921    vec_uint4 v;
1922    unsigned char c[16];
1923  } shmask;
1924
1925  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1926  shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
1927  return ((qword)(shmask.v));
1928}
1929
1930
1931static __inline qword si_cdx(qword a, qword b)
1932{
1933  union {
1934    vec_uint4 v;
1935    unsigned long long ll[2];
1936  } shmask;
1937
1938  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1939  shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
1940  return ((qword)(shmask.v));
1941}
1942
1943static __inline qword si_chx(qword a, qword b)
1944{
1945  union {
1946    vec_uint4 v;
1947    unsigned short s[8];
1948  } shmask;
1949
1950  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1951  shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
1952  return ((qword)(shmask.v));
1953}
1954
1955static __inline qword si_cwx(qword a, qword b)
1956{
1957  union {
1958    vec_uint4 v;
1959    unsigned int i[4];
1960  } shmask;
1961
1962  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1963  shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
1964  return ((qword)(shmask.v));
1965}
1966
1967
1968/* Constant Formation
1969 */
1970static __inline qword si_il(signed short imm)
1971{
1972  return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
1973}
1974
1975
1976static __inline qword si_ila(unsigned int imm)
1977{
1978  return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
1979}
1980
1981static __inline qword si_ilh(signed short imm)
1982{
1983  return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
1984}
1985
1986static __inline qword si_ilhu(signed short imm)
1987{
1988  return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
1989}
1990
1991static __inline qword si_iohl(qword a, unsigned short imm)
1992{
1993  return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
1994}
1995
/* No Operation
 */
/* Scheduling nops are meaningless here; both expand to nothing.  */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */
2000
2001
2002/* Memory Load and Store
2003 */
2004static __inline qword si_lqa(unsigned int imm)
2005{
2006  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2007}
2008
2009static __inline qword si_lqd(qword a, unsigned int imm)
2010{
2011  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
2012}
2013
2014static __inline qword si_lqr(unsigned int imm)
2015{
2016  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2017}
2018
2019static __inline qword si_lqx(qword a, qword b)
2020{
2021  return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
2022}
2023
2024static __inline void si_stqa(qword a, unsigned int imm)
2025{
2026  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2027}
2028
2029static __inline void si_stqd(qword a, qword b, unsigned int imm)
2030{
2031  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
2032}
2033
2034static __inline void si_stqr(qword a, unsigned int imm)
2035{
2036  vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2037}
2038
2039static __inline void si_stqx(qword a, qword b, qword c)
2040{
2041  vec_st((vec_uchar16)(a),
2042	 si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
2043	 (vector unsigned char *)(0));
2044}
2045
2046#endif /* !__SPU__ */
2047#endif /* !_SI2VMX_H_ */
2048
2049