1#include <nmmintrin.h>
2#include <string.h>
3
/* Bit positions used by cmp_flags to report the emulated status flags.
   Each flag occupies its own bit so they can be OR-ed together.  */
#define CFLAG (1 << 0)  /* carry flag */
#define ZFLAG (1 << 1)  /* zero flag */
#define SFLAG (1 << 2)  /* sign flag */
#define OFLAG (1 << 3)  /* overflow flag */
#define AFLAG (1 << 4)  /* auxiliary flag */
#define PFLAG (1 << 5)  /* parity flag */
10
/* Fill the comparison matrix RES for the "equal" aggregation modes:
   RES[j][i] is set to 1 when X[i] == Y[j] and to 0 otherwise.

   X and Y are arrays (or pointers) covering one 16-byte vector; the
   number of elements is derived from the element size (16 for 1-byte
   elements, 8 for 2-byte elements).  Only the top-left N x N corner of
   RES is written.

   Every argument is parenthesized so that arbitrary expressions may be
   passed, and the body is wrapped in do/while (0) so that the macro
   behaves as a single statement.  */
#define PCMPSTR_EQ(X, Y, RES) \
  do                                                                    \
    {                                                                   \
      const int pcmpstr_n_ = (int) (16 / sizeof (*(X)));                \
      int pcmpstr_i_, pcmpstr_j_;                                       \
      for (pcmpstr_j_ = 0; pcmpstr_j_ < pcmpstr_n_; pcmpstr_j_++)       \
        for (pcmpstr_i_ = 0; pcmpstr_i_ < pcmpstr_n_; pcmpstr_i_++)     \
          (RES)[pcmpstr_j_][pcmpstr_i_]                                 \
            = ((X)[pcmpstr_i_] == (Y)[pcmpstr_j_]);                     \
    }                                                                   \
  while (0)
19
/* Fill the comparison matrix RES for the "ranges" aggregation mode.
   X is read as consecutive (lower, upper) pairs.  For each element
   Y[j] and each pair starting at even index i:
     RES[j][i]     = (Y[j] >= X[i])      -- lower bound check
     RES[j][i + 1] = (Y[j] <= X[i + 1])  -- upper bound check

   X and Y are arrays (or pointers) covering one 16-byte vector; the
   number of elements is derived from the element size (16 for 1-byte
   elements, 8 for 2-byte elements).  Comparisons use the element type
   of the arguments, so signedness follows the arrays passed in.

   Every argument is parenthesized so that arbitrary expressions may be
   passed, and the body is wrapped in do/while (0) so that the macro
   behaves as a single statement.  */
#define PCMPSTR_RNG(X, Y, RES) \
  do                                                                    \
    {                                                                   \
      const int pcmpstr_n_ = (int) (16 / sizeof (*(X)));                \
      int pcmpstr_lo_, pcmpstr_j_;                                      \
      for (pcmpstr_j_ = 0; pcmpstr_j_ < pcmpstr_n_; pcmpstr_j_++)       \
        for (pcmpstr_lo_ = 0; pcmpstr_lo_ + 1 < pcmpstr_n_;             \
             pcmpstr_lo_ += 2)                                          \
          {                                                             \
            (RES)[pcmpstr_j_][pcmpstr_lo_]                              \
              = ((Y)[pcmpstr_j_] >= (X)[pcmpstr_lo_]);                  \
            (RES)[pcmpstr_j_][pcmpstr_lo_ + 1]                          \
              = ((Y)[pcmpstr_j_] <= (X)[pcmpstr_lo_ + 1]);              \
          }                                                             \
    }                                                                   \
  while (0)
31
/* Overwrite the entries of the comparison matrix RES that involve
   elements beyond the end of either string.

   RES[j][i] relates element i of the first operand (length LA) to
   element j of the second operand (length LB).  Only the top-left
   DIM x DIM corner is visited.

     - i valid, j valid:    entry is left untouched.
     - i valid, j invalid:  entry is forced to 0.
     - i invalid:           entry depends on the aggregation mode in
                            bits 2-3 of MODE:
                              equal-any / ranges: 0
                              equal-each:         1 if j is invalid too,
                                                  else 0
                              equal-ordered:      1  */
static void
override_invalid (unsigned char res[16][16], int la, int lb,
                  const int mode, int dim)
{
  const int aggregation = mode & 0x0C;
  int row, col;

  for (row = 0; row < dim; row++)
    {
      const int b_valid = row < lb;

      for (col = 0; col < dim; col++)
        {
          const int a_valid = col < la;

          /* Both elements are inside their strings: keep the result.  */
          if (a_valid && b_valid)
            continue;

          if (a_valid)
            {
              res[row][col] = 0;
              continue;
            }

          /* Element of the first operand is past its end.  */
          if (aggregation == _SIDD_CMP_EQUAL_ORDERED)
            res[row][col] = 1;
          else if (aggregation == _SIDD_CMP_EQUAL_EACH)
            res[row][col] = b_valid ? 0 : 1;
          else
            res[row][col] = 0;
        }
    }
}
57
/* Build the element-by-element comparison matrix RES for operands A
   (length LA) and B (length LB).

   Bits 0-1 of MODE select how the vectors are viewed (unsigned/signed
   bytes or words); bits 2-3 select whether the range comparison or the
   equality comparison is used.  RES[j][i] holds the comparison of
   element i of A with element j of B.  Afterwards the entries that
   refer to elements past the end of either string are rewritten by
   override_invalid.  */
static void
calc_matrix (__m128i a, int la, __m128i b, int lb, const int mode,
             unsigned char res[16][16])
{
  union
    {
      __m128i x;
      signed char sc[16];
      unsigned char uc[16];
      signed short ss[8];
      unsigned short us[8];
    } lhs, rhs;
  const int use_ranges = (mode & 0x0C) == _SIDD_CMP_RANGES;

  lhs.x = a;
  rhs.x = b;

  if (use_ranges)
    switch (mode & 3)
      {
      case _SIDD_UBYTE_OPS:
        PCMPSTR_RNG (lhs.uc, rhs.uc, res);
        break;
      case _SIDD_UWORD_OPS:
        PCMPSTR_RNG (lhs.us, rhs.us, res);
        break;
      case _SIDD_SBYTE_OPS:
        PCMPSTR_RNG (lhs.sc, rhs.sc, res);
        break;
      case _SIDD_SWORD_OPS:
        PCMPSTR_RNG (lhs.ss, rhs.ss, res);
        break;
      }
  else
    switch (mode & 3)
      {
      case _SIDD_UBYTE_OPS:
        PCMPSTR_EQ (lhs.uc, rhs.uc, res);
        break;
      case _SIDD_UWORD_OPS:
        PCMPSTR_EQ (lhs.us, rhs.us, res);
        break;
      case _SIDD_SBYTE_OPS:
        PCMPSTR_EQ (lhs.sc, rhs.sc, res);
        break;
      case _SIDD_SWORD_OPS:
        PCMPSTR_EQ (lhs.ss, rhs.ss, res);
        break;
      }

  /* Bit 0 of MODE set means word elements (8 per vector).  */
  override_invalid (res, la, lb, mode, (mode & 1) ? 8 : 16);
}
120
/* Compute the post-polarity result bit mask of a string comparison of
   A (length LA) against B (length LB) under control byte MODE.

   LA and LB may be negative; their magnitude is used and saturated to
   the number of elements per vector (16 for bytes, 8 for words).  The
   saturation is done before negating so that no signed overflow can
   occur, even for INT_MIN.

   Bits 2-3 of MODE select how the comparison matrix is aggregated into
   one bit per element, bits 4-5 select the polarity applied afterwards.
   The returned value has only the low 16 (bytes) or 8 (words) bits
   possibly set.  */
static int
calc_res (__m128i a, int la, __m128i b, int lb, const int mode)
{
  unsigned char mtx[16][16];
  int i, j, k, dim, res = 0;

  memset (mtx, 0, sizeof (mtx));

  dim = (mode & 1) == 0 ? 16 : 8;

  /* Saturate |la| and |lb| to dim.  Out-of-range values are clamped
     first, so the negation below only sees values in (-dim, 0).  */
  if (la < -dim || la > dim)
    la = dim;
  else if (la < 0)
    la = -la;

  if (lb < -dim || lb > dim)
    lb = dim;
  else if (lb < 0)
    lb = -lb;

  /* mtx[j][i] relates element i of A to element j of B.  */
  calc_matrix (a, la, b, lb, mode, mtx);

  switch ((mode & 0x0C))
    {
    case _SIDD_CMP_EQUAL_ANY:
      /* Bit i: element i of B matches any element of A.  */
      for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
          if (mtx[i][j])
            res |= (1 << i);
      break;

    case _SIDD_CMP_RANGES:
      /* Bit j: element j of B satisfies both bounds of some pair.  */
      for (i = 0; i < dim; i += 2)
        for (j = 0; j < dim; j++)
          if (mtx[j][i] && mtx[j][i + 1])
            res |= (1 << j);
      break;

    case _SIDD_CMP_EQUAL_EACH:
      /* Bit i: the diagonal entry for position i is set.  */
      for (i = 0; i < dim; i++)
        if (mtx[i][i])
          res |= (1 << i);
      break;

    case _SIDD_CMP_EQUAL_ORDERED:
      /* Bit i: every entry on the diagonal starting at row i,
         column 0 is set.  */
      for (i = 0; i < dim; i++)
        {
          unsigned char val = 1;

          for (j = 0, k = i; j < dim - i && k < dim; j++, k++)
            val &= mtx[k][j];

          if (val)
            res |= (1 << i);
        }
      break;
    }

  switch ((mode & 0x30))
    {
    case _SIDD_POSITIVE_POLARITY:
    case _SIDD_MASKED_POSITIVE_POLARITY:
      break;

    case _SIDD_NEGATIVE_POLARITY:
      res ^= -1;
      break;

    case _SIDD_MASKED_NEGATIVE_POLARITY:
      /* Invert only the bits below lb; lb is in [0, dim] here, so the
         shift amount is at most 16.  */
      res ^= (1 << lb) - 1;
      break;
    }

  return res & ((dim == 8) ? 0xFF : 0xFFFF);
}
204
205static int
206cmp_flags (__m128i a, int la, __m128i b, int lb,
207	   int mode, int res2, int is_implicit)
208{
209  int i;
210  int flags = 0;
211  int is_bytes_mode = (mode & 1) == 0;
212  union
213    {
214      __m128i x;
215      unsigned char uc[16];
216      unsigned short us[8];
217    } d, s;
218
219  d.x = a;
220  s.x = b;
221
222  /* CF: reset if (RES2 == 0), set otherwise.  */
223  if (res2 != 0)
224    flags |= CFLAG;
225
226  if (is_implicit)
227    {
228      /* ZF: set if any byte/word of src xmm operand is null, reset
229	 otherwise.
230	 SF: set if any byte/word of dst xmm operand is null, reset
231	 otherwise.  */
232
233      if (is_bytes_mode)
234	{
235	  for (i = 0; i < 16; i++)
236	    {
237	      if (s.uc[i] == 0)
238		flags |= ZFLAG;
239	      if (d.uc[i] == 0)
240		flags |= SFLAG;
241            }
242	}
243      else
244	{
245	  for (i = 0; i < 8; i++)
246	    {
247	      if (s.us[i] == 0)
248		flags |= ZFLAG;
249	      if (d.us[i] == 0)
250		flags |= SFLAG;
251            }
252        }
253    }
254  else
255    {
256      /* ZF: set if abs value of EDX/RDX < 16 (8), reset otherwise.
257	 SF: set if abs value of EAX/RAX < 16 (8), reset otherwise.  */
258      int max_ind = is_bytes_mode ? 16 : 8;
259
260      if (la < 0)
261	la = -la;
262      if (lb < 0)
263	lb = -lb;
264
265      if (lb < max_ind)
266	flags |= ZFLAG;
267      if (la < max_ind)
268	flags |= SFLAG;
269    }
270
271  /* OF: equal to RES2[0].  */
272  if ((res2 & 0x1))
273    flags |= OFLAG;
274
275  /* AF: Reset.
276     PF: Reset.  */
277  return flags;
278}
279
/* Compare A (length LA) with B (length LB) under MODE and return the
   index of a set bit in the result mask.

   When bit 6 of MODE is set the highest set bit is reported, otherwise
   the lowest.  If no bit is set, the number of elements per vector
   (16 for bytes, 8 for words) is returned.  The full result mask is
   stored in *RES2.  */
static int
cmp_indexed (__m128i a, int la, __m128i b, int lb,
             const int mode, int *res2)
{
  const int width = (mode & 1) ? 8 : 16;
  const int bits = calc_res (a, la, b, lb, mode);
  int pos;

  if (mode & _SIDD_MOST_SIGNIFICANT)
    {
      /* Scan downwards from the top element.  */
      int probe = width;

      pos = width;
      while (probe-- > 0)
        if ((bits >> probe) & 1)
          {
            pos = probe;
            break;
          }
    }
  else
    {
      /* Scan upwards; stops at width when nothing is set.  */
      pos = 0;
      while (pos < width && !((bits >> pos) & 1))
        pos++;
    }

  *res2 = bits;
  return pos;
}
313
/* Compare A (length LA) with B (length LB) under MODE and return the
   result as a vector mask.

   When bit 6 of MODE is set, each element of the returned vector is
   all-ones if the corresponding result bit is set and zero otherwise.
   When it is clear, the leading bytes of the result mask (two bytes in
   byte mode, one byte in word mode) are copied to the start of an
   otherwise zero vector.  The full result mask is stored in *RES2.  */
static __m128i
cmp_masked (__m128i a, int la, __m128i b, int lb,
            const int mode, int *res2)
{
  union
    {
      __m128i x;
      char c[16];
      short s[8];
    } out;
  const int words = (mode & 1) != 0;
  const int bits = calc_res (a, la, b, lb, mode);
  int n;

  memset (&out, 0, sizeof (out));

  if (!(mode & _SIDD_UNIT_MASK))
    {
      /* Bit mask: copy the first bytes of the result's object
         representation into the first bytes of the vector.  */
      memcpy (&out, &bits, words ? sizeof (char) : sizeof (short));
    }
  else if (words)
    {
      for (n = 0; n < 8; n++)
        out.s[n] = ((bits >> n) & 1) ? -1 : 0;
    }
  else
    {
      for (n = 0; n < 16; n++)
        out.c[n] = ((bits >> n) & 1) ? -1 : 0;
    }

  *res2 = bits;

  return out.x;
}
357
/* Return the number of elements of A that precede the first zero
   element.  Elements are 16-bit words when bit 0 of MODE is set and
   bytes otherwise.  If no element is zero, the element count of the
   vector (8 or 16) is returned.  */
static int
calc_str_len (__m128i a, const int mode)
{
  union
    {
      __m128i x;
      char bytes[16];
      short words[8];
    } v;
  int len = 0;

  v.x = a;

  if (mode & 1)
    while (len < 8 && v.words[len] != 0)
      len++;
  else
    while (len < 16 && v.bytes[len] != 0)
      len++;

  return len;
}
387
/* Explicit-length comparison returning an index.  Compares *A (length
   LA) with *B (length LB) under MODE via cmp_indexed and returns the
   resulting index.  If FLAGS is non-null, the status flags computed by
   cmp_flags (explicit-length variant) are stored in *FLAGS.  */
static inline int
cmp_ei (__m128i *a, int la, __m128i *b, int lb,
        const int mode, int *flags)
{
  int mask_bits;
  const int found = cmp_indexed (*a, la, *b, lb, mode, &mask_bits);

  if (flags == NULL)
    return found;

  *flags = cmp_flags (*a, la, *b, lb, mode, mask_bits, 0);
  return found;
}
400
/* Implicit-length comparison returning an index.  The lengths of *A
   and *B are determined with calc_str_len, then the operands are
   compared under MODE via cmp_indexed and the resulting index is
   returned.  If FLAGS is non-null, the status flags computed by
   cmp_flags (implicit-length variant) are stored in *FLAGS.  */
static inline int
cmp_ii (__m128i *a, __m128i *b, const int mode, int *flags)
{
  const int len_a = calc_str_len (*a, mode);
  const int len_b = calc_str_len (*b, mode);
  int mask_bits;
  const int found = cmp_indexed (*a, len_a, *b, len_b, mode, &mask_bits);

  if (flags == NULL)
    return found;

  *flags = cmp_flags (*a, len_a, *b, len_b, mode, mask_bits, 1);
  return found;
}
418
419static inline __m128i
420cmp_em (__m128i *a, int la, __m128i *b, int lb,
421	const int mode, int *flags )
422{
423  int res2;
424  __m128i mask = cmp_masked (*a, la, *b, lb, mode, &res2);
425
426  if (flags != NULL)
427    *flags = cmp_flags (*a, la, *b, lb, mode, res2, 0);
428
429  return mask;
430}
431
432static inline __m128i
433cmp_im (__m128i *a, __m128i *b, const int mode, int *flags)
434{
435  int la, lb;
436  int res2;
437  __m128i mask;
438
439  la = calc_str_len (*a, mode);
440  lb = calc_str_len (*b, mode);
441
442  mask = cmp_masked (*a, la, *b, lb, mode, &res2);
443  if (flags != NULL)
444    *flags = cmp_flags (*a, la, *b, lb, mode, res2, 1);
445
446  return mask;
447}
448