(* Common code for ARM NEON header file, documentation and test case
   generators.

   Copyright (C) 2006-2015 Free Software Foundation, Inc.
   Contributed by CodeSourcery.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  *)

(* Shorthand types for vector elements.  *)
type elts = S8 | S16 | S32 | S64 | F16 | F32 | U8 | U16 | U32 | U64 | P8 | P16
          | P64 | P128 | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64
          | Conv of elts * elts | Cast of elts * elts | NoElts
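
(* Conversion elements are read as Conv (destination, source); e.g. the
   Vcvt entries in "ops" below use Conv (S32, F32) for the F32 -> S32
   conversions (see the "conv" type-generating function further down).  *)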

type eltclass = Signed | Unsigned | Float | Poly | Int | Bits
              | ConvClass of eltclass * eltclass | NoType

(* These vector types correspond directly to C types.  *)
type vectype = T_int8x8    | T_int8x16
             | T_int16x4   | T_int16x8
             | T_int32x2   | T_int32x4
             | T_int64x1   | T_int64x2
             | T_uint8x8   | T_uint8x16
             | T_uint16x4  | T_uint16x8
             | T_uint32x2  | T_uint32x4
             | T_uint64x1  | T_uint64x2
             | T_float16x4
             | T_float32x2 | T_float32x4
             | T_poly8x8   | T_poly8x16
             | T_poly16x4  | T_poly16x8
             | T_immediate of int * int
             | T_int8      | T_int16
             | T_int32     | T_int64
             | T_uint8     | T_uint16
             | T_uint32    | T_uint64
             | T_poly8     | T_poly16
             | T_poly64    | T_poly64x1
             | T_poly64x2  | T_poly128
             | T_float16   | T_float32
             | T_arrayof of int * vectype
             | T_ptrto of vectype | T_const of vectype
             | T_void      | T_intQI
             | T_intHI     | T_intSI
             | T_intDI     | T_intTI
             | T_floatHF   | T_floatSF

(* The meanings of the following are:
     TImode : "Tetra", two registers (four words).
     EImode : "hExa", three registers (six words).
     OImode : "Octa", four registers (eight words).
     CImode : "dodeCa", six registers (twelve words).
     XImode : "heXadeca", eight registers (sixteen words).
*)

type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode

type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt
               | PtrTo of shape_elt | CstPtrTo of shape_elt
               (* These next ones are used only in the test generator.  *)
               | Element_of_dreg        (* Used for "lane" variants.  *)
               | Element_of_qreg        (* Likewise.  *)
               | All_elements_of_dreg   (* Used for "dup" variants.  *)
               | Alternatives of shape_elt list (* Used for multiple valid operands.  *)

type shape_form = All of int * shape_elt
                | Long
                | Long_noreg of shape_elt
                | Wide
                | Wide_noreg of shape_elt
                | Narrow
                | Long_imm
                | Narrow_imm
                | Binary_imm of shape_elt
                | Use_operands of shape_elt array
                | By_scalar of shape_elt
                | Unary_scalar of shape_elt
                | Wide_lane
                | Wide_scalar
                | Pair_result of shape_elt

type arity = Arity0 of vectype
           | Arity1 of vectype * vectype
           | Arity2 of vectype * vectype * vectype
           | Arity3 of vectype * vectype * vectype * vectype
           | Arity4 of vectype * vectype * vectype * vectype * vectype

type vecmode = V8QI | V4HI | V4HF | V2SI | V2SF | DI
             | V16QI | V8HI | V4SI | V4SF | V2DI | TI
             | QI | HI | SI | SF

type opcode =
  (* Binary ops.  *)
    Vadd
  | Vmul
  | Vmla
  | Vmls
  | Vfma
  | Vfms
  | Vsub
  | Vceq
  | Vcge
  | Vcgt
  | Vcle
  | Vclt
  | Vcage
  | Vcagt
  | Vcale
  | Vcalt
  | Vtst
  | Vabd
  | Vaba
  | Vmax
  | Vmin
  | Vpadd
  | Vpada
  | Vpmax
  | Vpmin
  | Vrecps
  | Vrsqrts
  | Vshl
  | Vshr_n
  | Vshl_n
  | Vsra_n
  | Vsri
  | Vsli
  (* Logic binops.  *)
  | Vand
  | Vorr
  | Veor
  | Vbic
  | Vorn
  | Vbsl
  (* Ops with scalar.  *)
  | Vmul_lane
  | Vmla_lane
  | Vmls_lane
  | Vmul_n
  | Vmla_n
  | Vmls_n
  | Vmull_n
  | Vmull_lane
  | Vqdmull_n
  | Vqdmull_lane
  | Vqdmulh_n
  | Vqdmulh_lane
  (* Unary ops.  *)
  | Vrintn
  | Vrinta
  | Vrintp
  | Vrintm
  | Vrintz
  | Vabs
  | Vneg
  | Vcls
  | Vclz
  | Vcnt
  | Vrecpe
  | Vrsqrte
  | Vmvn
  (* Vector extract.  *)
  | Vext
  (* Reverse elements.  *)
  | Vrev64
  | Vrev32
  | Vrev16
  (* Transposition ops.  *)
  | Vtrn
  | Vzip
  | Vuzp
  (* Loads and stores (VLD1/VST1/VLD2...), elements and structures.  *)
  | Vldx of int
  | Vstx of int
  | Vldx_lane of int
  | Vldx_dup of int
  | Vstx_lane of int
  (* Set/extract lanes from a vector.  *)
  | Vget_lane
  | Vset_lane
  (* Initialize vector from bit pattern.  *)
  | Vcreate
  (* Set all lanes to same value.  *)
  | Vdup_n
  | Vmov_n  (* Is this the same?  *)
  (* Duplicate scalar to all lanes of vector.  *)
  | Vdup_lane
  (* Combine vectors.  *)
  | Vcombine
  (* Get quadword high/low parts.  *)
  | Vget_high
  | Vget_low
  (* Convert vectors.  *)
  | Vcvt
  | Vcvt_n
  (* Narrow/lengthen vectors.  *)
  | Vmovn
  | Vmovl
  (* Table lookup.  *)
  | Vtbl of int
  | Vtbx of int
  (* Reinterpret casts.  *)
  | Vreinterp

let rev_elems revsize elsize nelts _ =
  let mask = (revsize / elsize) - 1 in
  let arr = Array.init nelts
    (fun i -> i lxor mask) in
  Array.to_list arr
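
(* Illustration: for vrev64 on 8-bit elements in a D register,
   "rev_elems 64 8 8 `lo" has mask = 64/8 - 1 = 7, so the shuffle indices
   come out as [7; 6; 5; 4; 3; 2; 1; 0] (the part selector is ignored).  *)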

let permute_range i stride nelts increment =
  let rec build i = function
    0 -> []
  | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in
  build i nelts
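
(* For example, "permute_range 0 8 4 1" is [0; 8; 1; 9; 2; 10; 3; 11]: each
   step pairs index I with index I + STRIDE, then advances I by INCREMENT.  *)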

(* Generate a list of integers suitable for vzip.  *)
let zip_range i stride nelts = permute_range i stride nelts 1

(* Generate a list of integers suitable for vuzp.  *)
let uzip_range i stride nelts = permute_range i stride nelts 4

(* Generate a list of integers suitable for vtrn.  *)
let trn_range i stride nelts = permute_range i stride nelts 2

let zip_elems _ nelts part =
  match part with
    `lo -> zip_range 0 nelts (nelts / 2)
  | `hi -> zip_range (nelts / 2) nelts (nelts / 2)

let uzip_elems _ nelts part =
  match part with
    `lo -> uzip_range 0 2 (nelts / 2)
  | `hi -> uzip_range 1 2 (nelts / 2)

let trn_elems _ nelts part =
  match part with
    `lo -> trn_range 0 nelts (nelts / 2)
  | `hi -> trn_range 1 nelts (nelts / 2)
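
(* Concretely, with 8 elements per vector "zip_elems 8 8 `lo" is
   [0; 8; 1; 9; 2; 10; 3; 11] and "uzip_elems 8 8 `lo" is
   [0; 2; 4; 6; 8; 10; 12; 14]; __builtin_shuffle treats indices of NELTS
   and above as selecting from the second input vector.  *)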

(* Features used for documentation, to distinguish between some instruction
   variants, and to signal special requirements (e.g. swapping arguments).  *)

type features =
    Halving
  | Rounding
  | Saturating
  | Dst_unsign
  | High_half
  | Doubling
  | Flipped of string  (* Builtin name to use with flipped arguments.  *)
  | InfoWord  (* Pass an extra word for signage/rounding etc. (always passed
                 for All _, Long, Wide and Narrow shape_forms).  *)
    (* Implement builtin as shuffle.  The parameter is a function which returns
       masks suitable for __builtin_shuffle: arguments are (element size,
       number of elements, high/low part selector).  *)
  | Use_shuffle of (int -> int -> [`lo|`hi] -> int list)
    (* A specification as to the shape of instruction expected upon
       disassembly, used if it differs from the shape used to build the
       intrinsic prototype.  Multiple entries in the constructor's argument
       indicate that the intrinsic expands to more than one assembly
       instruction, each with a corresponding shape specified here.  *)
  | Disassembles_as of shape_form list
  | Builtin_name of string  (* Override the name of the builtin.  *)
    (* Override the name of the instruction.  If more than one name
       is specified, it means that the instruction can have any of those
       names.  *)
  | Instruction_name of string list
    (* Mark that the intrinsic yields no instructions, or expands to yield
       behavior that the test generator cannot test.  *)
  | No_op
    (* Mark that the intrinsic has constant arguments that cannot be set
       to the defaults (zero for pointers and one otherwise) in the test
       cases.  The function supplied must return the integer to be written
       into the testcase for the argument number (0-based) supplied to it.  *)
  | Const_valuator of (int -> int)
  | Fixed_vector_reg
  | Fixed_core_reg
    (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined.  *)
  | Requires_feature of string
    (* Mark that the intrinsic requires a particular architecture version.  *)
  | Requires_arch of int
    (* Mark that the intrinsic requires a particular bit in __ARM_FP to
       be set.  *)
  | Requires_FP_bit of int
    (* Compiler optimization level for the test.  *)
  | Compiler_optim of string

exception MixedMode of elts * elts

let rec elt_width = function
    S8 | U8 | P8 | I8 | B8 -> 8
  | S16 | U16 | P16 | I16 | B16 | F16 -> 16
  | S32 | F32 | U32 | I32 | B32 -> 32
  | S64 | U64 | P64 | I64 | B64 -> 64
  | P128 -> 128
  | Conv (a, b) ->
      let wa = elt_width a and wb = elt_width b in
      if wa = wb then wa else raise (MixedMode (a, b))
  | Cast (a, b) -> raise (MixedMode (a, b))
  | NoElts -> failwith "No elts"

let rec elt_class = function
    S8 | S16 | S32 | S64 -> Signed
  | U8 | U16 | U32 | U64 -> Unsigned
  | P8 | P16 | P64 | P128 -> Poly
  | F16 | F32 -> Float
  | I8 | I16 | I32 | I64 -> Int
  | B8 | B16 | B32 | B64 -> Bits
  | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b)
  | NoElts -> NoType

let elt_of_class_width c w =
  match c, w with
    Signed, 8 -> S8
  | Signed, 16 -> S16
  | Signed, 32 -> S32
  | Signed, 64 -> S64
  | Float, 16 -> F16
  | Float, 32 -> F32
  | Unsigned, 8 -> U8
  | Unsigned, 16 -> U16
  | Unsigned, 32 -> U32
  | Unsigned, 64 -> U64
  | Poly, 8 -> P8
  | Poly, 16 -> P16
  | Poly, 64 -> P64
  | Poly, 128 -> P128
  | Int, 8 -> I8
  | Int, 16 -> I16
  | Int, 32 -> I32
  | Int, 64 -> I64
  | Bits, 8 -> B8
  | Bits, 16 -> B16
  | Bits, 32 -> B32
  | Bits, 64 -> B64
  | _ -> failwith "Bad element type"

(* Return unsigned integer element the same width as argument.  *)
let unsigned_of_elt elt =
  elt_of_class_width Unsigned (elt_width elt)

let signed_of_elt elt =
  elt_of_class_width Signed (elt_width elt)

(* Return untyped bits element the same width as argument.  *)
let bits_of_elt elt =
  elt_of_class_width Bits (elt_width elt)
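
(* So, for example, "unsigned_of_elt S16" is U16 and "bits_of_elt F32" is
   B32; the comparison and bit-twiddling type-generating functions below
   use these to build result types.  *)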

let non_signed_variant = function
    S8 -> I8
  | S16 -> I16
  | S32 -> I32
  | S64 -> I64
  | U8 -> I8
  | U16 -> I16
  | U32 -> I32
  | U64 -> I64
  | x -> x

let poly_unsigned_variant v =
  let elclass = match elt_class v with
    Poly -> Unsigned
  | x -> x in
  elt_of_class_width elclass (elt_width v)

let widen_elt elt =
  let w = elt_width elt
  and c = elt_class elt in
  elt_of_class_width c (w * 2)

let narrow_elt elt =
  let w = elt_width elt
  and c = elt_class elt in
  elt_of_class_width c (w / 2)

(* If we're trying to find a mode from a "Use_operands" instruction, use the
   last vector operand as the dominant mode used to invoke the correct builtin.
   We must stick to this rule in neon.md.  *)
let find_key_operand operands =
  let rec scan opno =
    match operands.(opno) with
      Qreg -> Qreg
    | Dreg -> Dreg
    | VecArray (_, Qreg) -> Qreg
    | VecArray (_, Dreg) -> Dreg
    | _ -> scan (opno - 1)
  in
    scan ((Array.length operands) - 1)
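
(* For example, scanning the vmul_n operands [| Dreg; Dreg; Corereg |] from
   the right skips the Corereg and returns Dreg, so the D-register mode
   drives builtin selection.  *)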

(* Find a vecmode from a shape_elt ELT for an instruction with shape_form
   SHAPE.  For a Use_operands shape, if ARGPOS is passed then return the mode
   for the given argument position, else determine which argument to return a
   mode for automatically.  *)

let rec mode_of_elt ?argpos elt shape =
  let flt = match elt_class elt with
    Float | ConvClass (_, Float) -> true | _ -> false in
  let idx =
    match elt_width elt with
      8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3 | 128 -> 4
    | _ -> failwith "Bad element width"
  in match shape with
    All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg
  | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg ->
      if flt then
        [| V8QI; V4HF; V2SF; DI |].(idx)
      else
        [| V8QI; V4HI; V2SI; DI |].(idx)
  | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg
  | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg ->
      [| V16QI; V8HI; (if flt then V4SF else V4SI); V2DI; TI |].(idx)
  | All (_, (Corereg | PtrTo _ | CstPtrTo _)) ->
      [| QI; HI; (if flt then SF else SI); DI |].(idx)
  | Long | Wide | Wide_lane | Wide_scalar
  | Long_imm ->
      [| V8QI; V4HI; V2SI; DI |].(idx)
  | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx)
  | Use_operands ops ->
      begin match argpos with
        None -> mode_of_elt ?argpos elt (All (0, find_key_operand ops))
      | Some pos -> mode_of_elt ?argpos elt (All (0, ops.(pos)))
      end
  | _ -> failwith "invalid shape"
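
(* For instance, "mode_of_elt S16 (All (3, Dreg))" is V4HI (four 16-bit
   elements in a 64-bit D register); the same element with a Qreg shape
   gives V8HI.  *)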

(* Modify an element type dependent on the shape of the instruction and the
   operand number.  *)

let shapemap shape no =
  let ident = fun x -> x in
  match shape with
    All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _
  | Binary_imm _ -> ident
  | Long | Long_noreg _ | Wide_scalar | Long_imm ->
      [| widen_elt; ident; ident |].(no)
  | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no)
  | Wide_lane -> [| widen_elt; ident; ident; ident |].(no)
  | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no)
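
(* E.g. a Long shape at S16 widens operand 0 (the result) to S32 while the
   sources stay S16; this matches widening instructions such as vaddl.s16.  *)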

(* Register type (D/Q) of an operand, based on shape and operand number.  *)

let regmap shape no =
  match shape with
    All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg
  | Long -> [| Qreg; Dreg; Dreg |].(no)
  | Wide -> [| Qreg; Qreg; Dreg |].(no)
  | Narrow -> [| Dreg; Qreg; Qreg |].(no)
  | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no)
  | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no)
  | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no)
  | Unary_scalar reg -> [| reg; Dreg; Immed |].(no)
  | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no)
  | Binary_imm reg -> [| reg; reg; Immed |].(no)
  | Long_imm -> [| Qreg; Dreg; Immed |].(no)
  | Narrow_imm -> [| Dreg; Qreg; Immed |].(no)
  | Use_operands these -> these.(no)
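
(* Each row reads as (result, operand 1, operand 2, ...).  A Narrow shape,
   for example, returns its result in a D register and reads Q-register
   sources, as in vaddhn.  *)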

let type_for_elt shape elt no =
  let elt = (shapemap shape no) elt in
  let reg = regmap shape no in
  let rec type_for_reg_elt reg elt =
    match reg with
      Dreg ->
        begin match elt with
          S8 -> T_int8x8
        | S16 -> T_int16x4
        | S32 -> T_int32x2
        | S64 -> T_int64x1
        | U8 -> T_uint8x8
        | U16 -> T_uint16x4
        | U32 -> T_uint32x2
        | U64 -> T_uint64x1
        | P64 -> T_poly64x1
        | P128 -> T_poly128
        | F16 -> T_float16x4
        | F32 -> T_float32x2
        | P8 -> T_poly8x8
        | P16 -> T_poly16x4
        | _ -> failwith "Bad elt type for Dreg"
        end
    | Qreg ->
        begin match elt with
          S8 -> T_int8x16
        | S16 -> T_int16x8
        | S32 -> T_int32x4
        | S64 -> T_int64x2
        | U8 -> T_uint8x16
        | U16 -> T_uint16x8
        | U32 -> T_uint32x4
        | U64 -> T_uint64x2
        | F32 -> T_float32x4
        | P8 -> T_poly8x16
        | P16 -> T_poly16x8
        | P64 -> T_poly64x2
        | P128 -> T_poly128
        | _ -> failwith "Bad elt type for Qreg"
        end
    | Corereg ->
        begin match elt with
          S8 -> T_int8
        | S16 -> T_int16
        | S32 -> T_int32
        | S64 -> T_int64
        | U8 -> T_uint8
        | U16 -> T_uint16
        | U32 -> T_uint32
        | U64 -> T_uint64
        | P8 -> T_poly8
        | P16 -> T_poly16
        | P64 -> T_poly64
        | P128 -> T_poly128
        | F32 -> T_float32
        | _ -> failwith "Bad elt type for Corereg"
        end
    | Immed ->
        T_immediate (0, 0)
    | VecArray (num, sub) ->
        T_arrayof (num, type_for_reg_elt sub elt)
    | PtrTo x ->
        T_ptrto (type_for_reg_elt x elt)
    | CstPtrTo x ->
        T_ptrto (T_const (type_for_reg_elt x elt))
    (* Anything else is solely for the use of the test generator.  *)
    | _ -> assert false
  in
    type_for_reg_elt reg elt
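
(* Putting the pieces together: "type_for_elt (All (3, Dreg)) U8 0" is
   T_uint8x8, and a VecArray (2, Dreg) operand at the same element type
   yields T_arrayof (2, T_uint8x8) (presented as uint8x8x2_t in the
   generated header).  *)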

(* Return size of a vector type, in bits.  *)
let vectype_size = function
    T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1
  | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1
  | T_float32x2 | T_poly8x8 | T_poly64x1 | T_poly16x4 | T_float16x4 -> 64
  | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2
  | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2
  | T_float32x4 | T_poly8x16 | T_poly64x2 | T_poly16x8 -> 128
  | _ -> raise Not_found

let inttype_for_array num elttype =
  let eltsize = vectype_size elttype in
  let numwords = (num * eltsize) / 32 in
  match numwords with
    4 -> B_TImode
  | 6 -> B_EImode
  | 8 -> B_OImode
  | 12 -> B_CImode
  | 16 -> B_XImode
  | _ -> failwith ("no int type for size " ^ string_of_int numwords)
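
(* Example: "inttype_for_array 2 T_int32x4" computes 2 * 128 / 32 = 8 words,
   giving B_OImode, the opaque mode for a pair of Q registers (cf. the mode
   table above).  *)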

(* These functions return pairs of (internal, external) types, where "internal"
   types are those seen by GCC, and "external" are those seen by the assembler.
   These types aren't necessarily the same, since the intrinsics can munge more
   than one C type into each assembler opcode.  *)

let make_sign_invariant func shape elt =
  let arity, elt' = func shape elt in
  arity, non_signed_variant elt'

(* Don't restrict any types.  *)

let elts_same make_arity shape elt =
  let vtype = type_for_elt shape elt in
  make_arity vtype, elt

(* As sign_invar_*, but when sign matters.  *)
let elts_same_io_lane =
  elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3))

let elts_same_io =
  elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2))

let elts_same_2_lane =
  elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3))

let elts_same_3 = elts_same_2_lane

let elts_same_2 =
  elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2))

let elts_same_1 =
  elts_same (fun vtype -> Arity1 (vtype 0, vtype 1))

(* Use for signed/unsigned invariant operations (i.e. where the operation
   doesn't depend on the sign of the data).  *)

let sign_invar_io_lane = make_sign_invariant elts_same_io_lane
let sign_invar_io = make_sign_invariant elts_same_io
let sign_invar_2_lane = make_sign_invariant elts_same_2_lane
let sign_invar_2 = make_sign_invariant elts_same_2
let sign_invar_1 = make_sign_invariant elts_same_1

(* Sign-sensitive comparison.  *)

let cmp_sign_matters shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
  Arity2 (rtype, vtype 1, vtype 2), elt
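
(* E.g. vcge at S32: the operands are signed vectors but the result is the
   unsigned vector type of the same width (all-ones or all-zeros per lane),
   hence unsigned_of_elt for operand 0.  *)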

(* Signed/unsigned invariant comparison.  *)

let cmp_sign_invar shape elt =
  let shape', elt' = cmp_sign_matters shape elt in
  let elt'' =
    match non_signed_variant elt' with
      P8 -> I8
    | x -> x
  in
    shape', elt''

(* Comparison (VTST) where only the element width matters.  *)

let cmp_bits shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0
  and bits_only = bits_of_elt elt in
  Arity2 (rtype, vtype 1, vtype 2), bits_only

let reg_shift shape elt =
  let vtype = type_for_elt shape elt
  and op2type = type_for_elt shape (signed_of_elt elt) 2 in
  Arity2 (vtype 0, vtype 1, op2type), elt

(* Genericised constant-shift type-generating function.  *)

let const_shift mkimm ?arity ?result shape elt =
  let op2type = (shapemap shape 2) elt in
  let op2width = elt_width op2type in
  let op2 = mkimm op2width
  and op1 = type_for_elt shape elt 1
  and r_elt =
    match result with
      None -> elt
    | Some restriction -> restriction elt in
  let rtype = type_for_elt shape r_elt 0 in
  match arity with
    None -> Arity2 (rtype, op1, op2), elt
  | Some mkarity -> mkarity rtype op1 op2, elt
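
(* MKIMM receives the (shape-mapped) element width and builds the
   T_immediate range for operand 2.  For shift_right below at S16 this is
   T_immediate (1, 16), i.e. valid shift counts 1..16; shift_left instead
   builds T_immediate (0, 15), as immediate left shifts by the full element
   width are not valid.  *)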

(* Use for immediate right-shifts.  *)

let shift_right shape elt =
  const_shift (fun imm -> T_immediate (1, imm)) shape elt

let shift_right_acc shape elt =
  const_shift (fun imm -> T_immediate (1, imm))
    ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt

(* Use for immediate right-shifts when the operation doesn't care about
   signedness.  *)

let shift_right_sign_invar =
  make_sign_invariant shift_right

(* Immediate right-shift; result is unsigned even when operand is signed.  *)

let shift_right_to_uns shape elt =
  const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt
    shape elt

(* Immediate left-shift.  *)

let shift_left shape elt =
  const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt

(* Immediate left-shift, unsigned result.  *)

let shift_left_to_uns shape elt =
  const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt
    shape elt

(* Immediate left-shift, don't care about signs.  *)

let shift_left_sign_invar =
  make_sign_invariant shift_left

(* Shift left/right and insert: only element size matters.  *)

let shift_insert shape elt =
  let arity, elt =
    const_shift (fun imm -> T_immediate (1, imm))
    ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in
  arity, bits_of_elt elt

(* Get/set lane.  *)

let get_lane shape elt =
  let vtype = type_for_elt shape elt in
  Arity2 (vtype 0, vtype 1, vtype 2),
    (match elt with P8 -> U8 | P16 -> U16 | S32 | U32 | F32 -> B32 | x -> x)

let set_lane shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt

let set_lane_notype shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts

let create_vector shape elt =
  let vtype = type_for_elt shape U64 1
  and rtype = type_for_elt shape elt 0 in
  Arity1 (rtype, vtype), elt

let conv make_arity shape elt =
  let edest, esrc = match elt with
    Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc
  | _ -> failwith "Non-conversion element in conversion" in
  let vtype = type_for_elt shape esrc
  and rtype = type_for_elt shape edest 0 in
  make_arity rtype vtype, elt

let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1))
let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2))
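
(* A sketch of the intended use: with elt = Conv (S32, F32) and an
   All (2, Dreg) shape, conv_1 pairs a float32x2_t source with an int32x2_t
   result, which is how (for example) the vcvt_s32_f32 prototype in the
   generated header comes about.  *)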

(* Operation has an unsigned result even if operands are signed.  *)

let dst_unsign make_arity shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
  make_arity rtype vtype, elt

let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1))

let make_bits_only func shape elt =
  let arity, elt' = func shape elt in
  arity, bits_of_elt elt'

(* Extend operation.  *)

let extend shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt

(* Table look-up operations.  Operand 2 is signed/unsigned for signed/unsigned
   integer ops respectively, or unsigned for polynomial ops.  *)

let table mkarity shape elt =
  let vtype = type_for_elt shape elt in
  let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in
  mkarity vtype op2, bits_of_elt elt

let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2))
let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2))

(* Operations where only bits matter.  *)

let bits_1 = make_bits_only elts_same_1
let bits_2 = make_bits_only elts_same_2
let bits_3 = make_bits_only elts_same_3

(* Store insns.  *)
let store_1 shape elt =
  let vtype = type_for_elt shape elt in
  Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt

let store_3 shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt

let make_notype func shape elt =
  let arity, _ = func shape elt in
  arity, NoElts

let notype_1 = make_notype elts_same_1
let notype_2 = make_notype elts_same_2
let notype_3 = make_notype elts_same_3

(* Bit-select operations (first operand is unsigned int).  *)

let bit_select shape elt =
  let vtype = type_for_elt shape elt
  and itype = type_for_elt shape (unsigned_of_elt elt) in
  Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts

(* Common lists of supported element types.  *)

let s_8_32 = [S8; S16; S32]
let u_8_32 = [U8; U16; U32]
let su_8_32 = [S8; S16; S32; U8; U16; U32]
let su_8_64 = S64 :: U64 :: su_8_32
let su_16_64 = [S16; S32; S64; U16; U32; U64]
let pf_su_8_16 = [P8; P16; S8; S16; U8; U16]
let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
let suf_32 = [S32; U32; F32]
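
(* For reference, the consed lists above expand to:
     su_8_64    = [S64; U64; S8; S16; S32; U8; U16; U32]
     pf_su_8_32 = [P8; P16; F32; S8; S16; S32; U8; U16; U32]
     pf_su_8_64 = [P8; P16; F32; S64; U64; S8; S16; S32; U8; U16; U32]
*)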

let ops =
  [
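    (* Each entry is read as: opcode, feature flags, shape, intrinsic base
       name, type-generating function and supported element types.  The
       first Vadd line below, for instance, covers the three-operand
       D-register vadd variants (vadd_f32, vadd_s8, ..., vadd_u32).  *)
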
    (* Addition.  *)
    Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32;
    Vadd, [No_op], All (3, Dreg), "vadd", sign_invar_2, [S64; U64];
    Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
    Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
    Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
    Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32;
    Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32;
    Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
      All (3, Dreg), "vRhadd", elts_same_2, su_8_32;
    Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
      All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32;
    Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64;
    Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64;
    Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64;
    Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half],
      Narrow, "vRaddhn", sign_invar_2, su_16_64;

    (* Multiplication.  *)
    Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32;
    Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32;
    Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh",
      elts_same_2, [S16; S32];
    Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ",
      elts_same_2, [S16; S32];
    Vmul,
      [Saturating; Rounding; Doubling; High_half;
       Instruction_name ["vqrdmulh"]],
      All (3, Dreg), "vqRdmulh",
      elts_same_2, [S16; S32];
    Vmul,
      [Saturating; Rounding; Doubling; High_half;
       Instruction_name ["vqrdmulh"]],
      All (3, Qreg), "vqRdmulhQ",
      elts_same_2, [S16; S32];
    Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32;
    Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32];

    (* Multiply-accumulate.  *)
    Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32;
    Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32;
    Vmla, [], Long, "vmlal", elts_same_io, su_8_32;
    Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32];

    (* Multiply-subtract.  *)
    Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32;
    Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32;
    Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
    Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];

    (* Fused-multiply-accumulate.  *)
    Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
    Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
    Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
    Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];

    (* Round to integral.  *)
    Vrintn, [Builtin_name "vrintn"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrndn", elts_same_1, [F32];
    Vrintn, [Builtin_name "vrintn"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqn", elts_same_1, [F32];
    Vrinta, [Builtin_name "vrinta"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrnda", elts_same_1, [F32];
    Vrinta, [Builtin_name "vrinta"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqa", elts_same_1, [F32];
    Vrintp, [Builtin_name "vrintp"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrndp", elts_same_1, [F32];
    Vrintp, [Builtin_name "vrintp"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqp", elts_same_1, [F32];
    Vrintm, [Builtin_name "vrintm"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrndm", elts_same_1, [F32];
    Vrintm, [Builtin_name "vrintm"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqm", elts_same_1, [F32];
    Vrintz, [Builtin_name "vrintz"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrnd", elts_same_1, [F32];
    Vrintz, [Builtin_name "vrintz"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndq", elts_same_1, [F32];

    (* Subtraction.  *)
    Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
    Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64];
    Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
    Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
    Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
    Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32;
    Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32;
    Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64;
    Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64;
    Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64;
    Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half],
      Narrow, "vRsubhn", sign_invar_2, su_16_64;

    (* Comparison, equal.  *)
    Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32;
    Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32;

    (* Comparison, greater-than or equal.  *)
    Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: s_8_32;
    Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"],
      All (3, Dreg), "vcge", cmp_sign_matters,
      u_8_32;
    Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: s_8_32;
    Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"],
      All (3, Qreg), "vcgeQ", cmp_sign_matters,
      u_8_32;

    (* Comparison, less-than or equal.  *)
    Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters,
      F32 :: s_8_32;
    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeu"],
      All (3, Dreg), "vcle", cmp_sign_matters,
      u_8_32;
    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"],
      All (3, Qreg), "vcleQ", cmp_sign_matters,
      F32 :: s_8_32;
    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeuQ"],
      All (3, Qreg), "vcleQ", cmp_sign_matters,
      u_8_32;

    (* Comparison, greater-than.  *)
    Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: s_8_32;
    Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"],
      All (3, Dreg), "vcgt", cmp_sign_matters,
      u_8_32;
    Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: s_8_32;
    Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"],
      All (3, Qreg), "vcgtQ", cmp_sign_matters,
      u_8_32;

    (* Comparison, less-than.  *)
    Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters,
      F32 :: s_8_32;
    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtu"],
      All (3, Dreg), "vclt", cmp_sign_matters,
      u_8_32;
    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"],
      All (3, Qreg), "vcltQ", cmp_sign_matters,
      F32 :: s_8_32;
    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtuQ"],
      All (3, Qreg), "vcltQ", cmp_sign_matters,
      u_8_32;

    (* Compare absolute greater-than or equal.  *)
    Vcage, [Instruction_name ["vacge"]],
      All (3, Dreg), "vcage", cmp_sign_matters, [F32];
    Vcage, [Instruction_name ["vacge"]],
      All (3, Qreg), "vcageQ", cmp_sign_matters, [F32];

    (* Compare absolute less-than or equal.  *)
    Vcale, [Instruction_name ["vacge"]; Flipped "vcage"],
      All (3, Dreg), "vcale", cmp_sign_matters, [F32];
    Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"],
      All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32];

    (* Compare absolute greater-than.  *)
    Vcagt, [Instruction_name ["vacgt"]],
      All (3, Dreg), "vcagt", cmp_sign_matters, [F32];
    Vcagt, [Instruction_name ["vacgt"]],
      All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32];

    (* Compare absolute less-than.  *)
    Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"],
      All (3, Dreg), "vcalt", cmp_sign_matters, [F32];
    Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"],
      All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32];

    (* Test bits.  *)
    Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32;
    Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32;

    (* Absolute difference.  *)
    Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32;
    Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32;
    Vabd, [], Long, "vabdl", elts_same_2, su_8_32;

    (* Absolute difference and accumulate.  *)
    Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32;
    Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32;
    Vaba, [], Long, "vabal", elts_same_io, su_8_32;

    (* Max.  *)
    Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32;
    Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32;

    (* Min.  *)
    Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32;
    Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32;

    (* Pairwise add.  *)
    Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32;
    Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32;
    Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32;

    (* Pairwise add, widen and accumulate.  *)
    Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32;
    Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32;

    (* Folding maximum, minimum.  *)
    Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32;
    Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32;

    (* Reciprocal step.  *)
    Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32];
    Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32];
    Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32];
    Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32];

    (* Vector shift left.  *)
    Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64;
    Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vrshl"]; Rounding],
      All (3, Dreg), "vRshl", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vrshl"]; Rounding],
      All (3, Qreg), "vRshlQ", reg_shift, su_8_64;
    Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64;
    Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
      All (3, Dreg), "vqRshl", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
      All (3, Qreg), "vqRshlQ", reg_shift, su_8_64;

    (* Vector shift right by constant.  *)
    Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64;
    Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64;
    Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg,
      "vRshr_n", shift_right, su_8_64;
    Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg,
      "vRshrQ_n", shift_right, su_8_64;
    Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64;
    Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n",
      shift_right_sign_invar, su_16_64;
    Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64;
    Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm,
      "vqRshrn_n", shift_right, su_16_64;
    Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n",
      shift_right_to_uns, [S16; S32; S64];
    Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding],
      Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64];

    (* Vector shift left by constant.  *)
    Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64;
    Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64;
    Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64;
    Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64;
    Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n",
      shift_left_to_uns, [S8; S16; S32; S64];
    Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n",
      shift_left_to_uns, [S8; S16; S32; S64];
    Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32;

    (* Vector shift right by constant and accumulate.  *)
    Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64;
    Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64;
    Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg,
      "vRsra_n", shift_right_acc, su_8_64;
    Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg,
      "vRsraQ_n", shift_right_acc, su_8_64;

    (* Vector shift right and insert.  *)
    Vsri, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |],
      "vsri_n", shift_insert, [P64];
    Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
      P8 :: P16 :: su_8_64;
    Vsri, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |],
      "vsriQ_n", shift_insert, [P64];
    Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
      P8 :: P16 :: su_8_64;

    (* Vector shift left and insert.  *)
    Vsli, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |],
      "vsli_n", shift_insert, [P64];
    Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
      P8 :: P16 :: su_8_64;
    Vsli, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |],
      "vsliQ_n", shift_insert, [P64];
    Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
      P8 :: P16 :: su_8_64;

    (* Absolute value.  *)
    Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32];
    Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32];
    Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32];
    Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32];

    (* Negate.  *)
    Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32];
    Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32];
    Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32];
    Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32];

    (* Bitwise not.  *)
    Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32;
    Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32;

    (* Count leading sign bits.  *)
    Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32];
    Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32];

    (* Count leading zeros.  *)
    Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32;
    Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32;

    (* Count number of set bits.  *)
    Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8];
    Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8];

    (* Reciprocal estimate.  *)
    Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32];
    Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32];

    (* Reciprocal square-root estimate.  *)
    Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32];
    Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32];

    (* Get lanes from a vector.  *)
    Vget_lane,
      [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
       Instruction_name ["vmov"]],
      Use_operands [| Corereg; Dreg; Immed |],
      "vget_lane", get_lane, pf_su_8_32;
    Vget_lane,
      [No_op;
       InfoWord;
       Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
       Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
      Use_operands [| Corereg; Dreg; Immed |],
      "vget_lane", notype_2, [S64; U64];
    Vget_lane,
      [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
       Instruction_name ["vmov"]],
      Use_operands [| Corereg; Qreg; Immed |],
      "vgetQ_lane", get_lane, pf_su_8_32;
    Vget_lane,
      [InfoWord;
       Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
       Instruction_name ["vmov"; "fmrrd"]; Const_valuator (fun _ -> 0);
       Fixed_core_reg],
      Use_operands [| Corereg; Qreg; Immed |],
      "vgetQ_lane", notype_2, [S64; U64];

    (* Set lanes in a vector.  *)
    Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
                Instruction_name ["vmov"]],
      Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
      set_lane, pf_su_8_32;
    Vset_lane, [No_op;
                Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
                Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
      set_lane_notype, [S64; U64];
    Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
                Instruction_name ["vmov"]],
      Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
      set_lane, pf_su_8_32;
    Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
                Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
      Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
      set_lane_notype, [S64; U64];

    (* Create vector from literal bit pattern.  *)
    Vcreate,
      [Requires_feature "CRYPTO";
       No_op], (* Not really, but it can yield various things that are too
                  hard for the test generator at this time.  *)
      Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
      [P64];
    Vcreate,
      [No_op], (* Not really, but it can yield various things that are too
                  hard for the test generator at this time.  *)
      Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
      pf_su_8_64;

    (* Set all lanes to the same value.  *)
    Vdup_n,
      [Disassembles_as [Use_operands [| Dreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
      pf_su_8_32;
    Vdup_n,
      [No_op; Requires_feature "CRYPTO";
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
      [P64];
    Vdup_n,
      [No_op;
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
      [S64; U64];
    Vdup_n,
      [No_op; Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| Qreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
      [P64];
    Vdup_n,
      [Disassembles_as [Use_operands [| Qreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
      pf_su_8_32;
    Vdup_n,
      [No_op;
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
                        Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1,
      [S64; U64];

    (* These are just aliases for the above.  *)
    Vmov_n,
      [Builtin_name "vdup_n";
       Disassembles_as [Use_operands [| Dreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Dreg; Corereg |],
      "vmov_n", bits_1, pf_su_8_32;
    Vmov_n,
      [No_op;
       Builtin_name "vdup_n";
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Dreg; Corereg |],
      "vmov_n", notype_1, [S64; U64];
    Vmov_n,
      [Builtin_name "vdupQ_n";
       Disassembles_as [Use_operands [| Qreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Qreg; Corereg |],
      "vmovQ_n", bits_1, pf_su_8_32;
    Vmov_n,
      [No_op;
       Builtin_name "vdupQ_n";
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
                        Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Qreg; Corereg |],
      "vmovQ_n", notype_1, [S64; U64];

    (* Duplicate, lane version.  We can't use Use_operands here because the
       rightmost register (always Dreg) would be picked up by find_key_operand,
       when we want the leftmost register to be used in this case (otherwise
       the modes are indistinguishable in neon.md, etc.).  *)
    Vdup_lane,
      [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]],
      Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32;
    Vdup_lane,
      [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
      Unary_scalar Dreg, "vdup_lane", bits_2, [P64];
    Vdup_lane,
      [No_op; Const_valuator (fun _ -> 0)],
      Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64];
    Vdup_lane,
      [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]],
      Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32;
    Vdup_lane,
      [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
      Unary_scalar Qreg, "vdupQ_lane", bits_2, [P64];
    Vdup_lane,
      [No_op; Const_valuator (fun _ -> 0)],
      Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64];

    (* Combining vectors.  *)
    Vcombine, [Requires_feature "CRYPTO"; No_op],
      Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
      [P64];
    Vcombine, [No_op],
      Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
      pf_su_8_64;

    (* Splitting vectors.  *)
    Vget_high, [Requires_feature "CRYPTO"; No_op],
      Use_operands [| Dreg; Qreg |], "vget_high",
      notype_1, [P64];
    Vget_high, [No_op],
      Use_operands [| Dreg; Qreg |], "vget_high",
      notype_1, pf_su_8_64;
    Vget_low, [Instruction_name ["vmov"];
               Disassembles_as [Use_operands [| Dreg; Dreg |]];
               Fixed_vector_reg],
      Use_operands [| Dreg; Qreg |], "vget_low",
      notype_1, pf_su_8_32;
    Vget_low, [Requires_feature "CRYPTO"; No_op],
      Use_operands [| Dreg; Qreg |], "vget_low",
      notype_1, [P64];
    Vget_low, [No_op],
      Use_operands [| Dreg; Qreg |], "vget_low",
      notype_1, [S64; U64];

    (* Conversions.  *)
    Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
    Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
    Vcvt, [Builtin_name "vcvt"; Requires_FP_bit 1],
      Use_operands [| Dreg; Qreg |], "vcvt", conv_1, [Conv (F16, F32)];
    Vcvt, [Builtin_name "vcvt"; Requires_FP_bit 1],
      Use_operands [| Qreg; Dreg |], "vcvt", conv_1, [Conv (F32, F16)];
    Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
    Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];

    (* Move, narrowing.  *)
    Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]],
      Narrow, "vmovn", sign_invar_1, su_16_64;
    Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating],
      Narrow, "vqmovn", elts_same_1, su_16_64;
    Vmovn,
      [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign],
      Narrow, "vqmovun", dst_unsign_1,
      [S16; S32; S64];

    (* Move, long.  *)
    Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]],
      Long, "vmovl", elts_same_1, su_8_32;

    (* Table lookup.  *)
    Vtbl 1,
      [Instruction_name ["vtbl"];
       Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
      Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8];
    Vtbl 2, [Instruction_name ["vtbl"]],
      Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2,
      [U8; S8; P8];
    Vtbl 3, [Instruction_name ["vtbl"]],
      Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2,
      [U8; S8; P8];
    Vtbl 4, [Instruction_name ["vtbl"]],
      Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2,
      [U8; S8; P8];

    (* Extended table lookup.  *)
    Vtbx 1,
      [Instruction_name ["vtbx"];
       Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
      Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8];
    Vtbx 2, [Instruction_name ["vtbx"]],
      Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io,
      [U8; S8; P8];
    Vtbx 3, [Instruction_name ["vtbx"]],
      Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io,
      [U8; S8; P8];
    Vtbx 4, [Instruction_name ["vtbx"]],
      Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io,
      [U8; S8; P8];

    (* Multiply, lane.  (Note: these were undocumented at the time of
       writing.)  *)
    Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane,
      [S16; S32; U16; U32; F32];
    Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane,
      [S16; S32; U16; U32; F32];

    (* Multiply-accumulate, lane.  *)
    Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane,
      [S16; S32; U16; U32];
    Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane",
      elts_same_io_lane, [S16; S32];

    (* Multiply-subtract, lane.  *)
    Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane,
      [S16; S32; U16; U32];
    Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane",
      elts_same_io_lane, [S16; S32];

    (* Long multiply, lane.  *)
    Vmull_lane, [],
      Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32];

    (* Saturating doubling long multiply, lane.  *)
    Vqdmull_lane, [Saturating; Doubling],
      Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32];

    (* Saturating doubling long multiply high, lane.  *)
    Vqdmulh_lane, [Saturating; Halving],
      By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32];
    Vqdmulh_lane, [Saturating; Halving],
      By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32];
    Vqdmulh_lane, [Saturating; Halving; Rounding;
                   Instruction_name ["vqrdmulh"]],
      By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32];
    Vqdmulh_lane, [Saturating; Halving; Rounding;
                   Instruction_name ["vqrdmulh"]],
      By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32];

    (* Vector multiply by scalar.  *)
    Vmul_n, [InfoWord;
             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |], "vmul_n",
      sign_invar_2, [S16; S32; U16; U32; F32];
    Vmul_n, [InfoWord;
             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n",
      sign_invar_2, [S16; S32; U16; U32; F32];

    (* Vector long multiply by scalar.  *)
    Vmull_n, [Instruction_name ["vmull"];
              Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]],
      Wide_scalar, "vmull_n",
      elts_same_2, [S16; S32; U16; U32];

    (* Vector saturating doubling long multiply by scalar.  *)
    Vqdmull_n, [Saturating; Doubling;
                Disassembles_as [Use_operands [| Qreg; Dreg;
                                                 Element_of_dreg |]]],
      Wide_scalar, "vqdmull_n",
      elts_same_2, [S16; S32];

    (* Vector saturating doubling long multiply high by scalar.  *)
    Vqdmulh_n,
      [Saturating; Halving; InfoWord;
       Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |],
      "vqdmulhQ_n", elts_same_2, [S16; S32];
    Vqdmulh_n,
      [Saturating; Halving; InfoWord;
       Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |],
      "vqdmulh_n", elts_same_2, [S16; S32];
    Vqdmulh_n,
      [Saturating; Halving; Rounding; InfoWord;
       Instruction_name ["vqrdmulh"];
       Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |],
      "vqRdmulhQ_n", elts_same_2, [S16; S32];
    Vqdmulh_n,
      [Saturating; Halving; Rounding; InfoWord;
       Instruction_name ["vqrdmulh"];
       Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |],
      "vqRdmulh_n", elts_same_2, [S16; S32];

    (* Vector multiply-accumulate by scalar.  *)
    Vmla_n, [InfoWord;
             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |], "vmla_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmla_n, [InfoWord;
             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32];
    Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io,
      [S16; S32];

    (* Vector multiply-subtract by scalar.  *)
    Vmls_n, [InfoWord;
             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |], "vmls_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmls_n, [InfoWord;
             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32];
    Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io,
      [S16; S32];

    (* Vector extract.  *)
    Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
      [P64];
    Vext, [Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
      pf_su_8_64;
    Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
      Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
      [P64];
    Vext, [Const_valuator (fun _ -> 0)],
      Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
      pf_su_8_64;

    (* Reverse elements.  *)
    Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1,
1484      P8 :: P16 :: F32 :: su_8_32;
1485    Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1,
1486      P8 :: P16 :: F32 :: su_8_32;
1487    Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1,
1488      [P8; P16; S8; U8; S16; U16];
1489    Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1,
1490      [P8; P16; S8; U8; S16; U16];
1491    Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1,
1492      [P8; S8; U8];
1493    Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1,
1494      [P8; S8; U8];
1495
1496    (* Bit selection.  *)
1497    Vbsl,
1498      [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"];
1499       Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
1500      Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
1501      [P64];
1502    Vbsl,
1503      [Instruction_name ["vbsl"; "vbit"; "vbif"];
1504       Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
1505      Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
1506      pf_su_8_64;
1507    Vbsl,
1508      [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"];
1509       Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
1510      Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
1511      [P64];
1512    Vbsl,
1513      [Instruction_name ["vbsl"; "vbit"; "vbif"];
1514       Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
1515      Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
1516      pf_su_8_64;
1517
    (* Transpose elements.  *)
    Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2,
      pf_su_8_16;
    Vtrn, [Use_shuffle trn_elems; Instruction_name ["vuzp"]], Pair_result Dreg,
      "vtrn", bits_2, suf_32;
    Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2,
      pf_su_8_32;

    (* Zip elements.  *)
    Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2,
      pf_su_8_16;
    Vzip, [Use_shuffle zip_elems; Instruction_name ["vuzp"]], Pair_result Dreg,
      "vzip", bits_2, suf_32;
    Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2,
      pf_su_8_32;

    (* Unzip elements.  *)
    Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2,
      pf_su_8_32;
    Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2,
      pf_su_8_32;

    (* Element/structure loads.  VLD1 variants.  *)
    Vldx 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
      [P64];
    Vldx 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
      pf_su_8_64;
    Vldx 1, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
      [P64];
    Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
      pf_su_8_64;

    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
      "vld1_lane", bits_3, pf_su_8_32;
    Vldx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
      "vld1_lane", bits_3, [P64];
    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
      "vld1_lane", bits_3, [S64; U64];
    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
      "vld1Q_lane", bits_3, pf_su_8_32;
    Vldx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
      "vld1Q_lane", bits_3, [P64];
    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
      "vld1Q_lane", bits_3, [S64; U64];

    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
      bits_1, pf_su_8_32;
    Vldx_dup 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
      bits_1, [P64];
    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
      bits_1, [S64; U64];
    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
      bits_1, pf_su_8_32;
    (* Treated identically to vld1_dup above as we now
       do a single load followed by a duplicate.  *)
    Vldx_dup 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
      bits_1, [P64];
    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
      bits_1, [S64; U64];

    (* VST1 variants.  *)
    Vstx 1, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Dreg |], "vst1",
      store_1, [P64];
    Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Dreg |], "vst1",
      store_1, pf_su_8_64;
    Vstx 1, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
      store_1, [P64];
    Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
      store_1, pf_su_8_64;

    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Dreg; Immed |],
      "vst1_lane", store_3, pf_su_8_32;
    Vstx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| PtrTo Corereg; Dreg; Immed |],
      "vst1_lane", store_3, [P64];
    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| PtrTo Corereg; Dreg; Immed |],
      "vst1_lane", store_3, [U64; S64];
    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg; Immed |],
      "vst1Q_lane", store_3, pf_su_8_32;
    Vstx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg; Immed |],
      "vst1Q_lane", store_3, [P64];
    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg; Immed |],
      "vst1Q_lane", store_3, [U64; S64];

    (* VLD2 variants.  *)
    Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2", bits_1, pf_su_8_32;
    Vldx 2, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2", bits_1, [P64];
    Vldx 2, [Instruction_name ["vld1"]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2", bits_1, [S64; U64];
    Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |];
                              Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |],
      "vld2Q", bits_1, pf_su_8_32;

    Vldx_lane 2,
      [Disassembles_as [Use_operands
        [| VecArray (2, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg;
                      VecArray (2, Dreg); Immed |],
      "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
    Vldx_lane 2,
      [Disassembles_as [Use_operands
        [| VecArray (2, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg;
                      VecArray (2, Qreg); Immed |],
      "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];

    Vldx_dup 2,
      [Disassembles_as [Use_operands
        [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2_dup", bits_1, pf_su_8_32;
    Vldx_dup 2,
      [Requires_feature "CRYPTO";
       Instruction_name ["vld1"]; Disassembles_as [Use_operands
        [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2_dup", bits_1, [P64];
    Vldx_dup 2,
      [Instruction_name ["vld1"]; Disassembles_as [Use_operands
        [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2_dup", bits_1, [S64; U64];

    (* VST2 variants.  *)
    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
      store_1, pf_su_8_32;
    Vstx 2, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
      store_1, [P64];
    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
      store_1, [S64; U64];
    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |];
                              Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q",
      store_1, pf_su_8_32;

    Vstx_lane 2,
      [Disassembles_as [Use_operands
        [| VecArray (2, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane",
      store_3, P8 :: P16 :: F32 :: su_8_32;
    Vstx_lane 2,
      [Disassembles_as [Use_operands
        [| VecArray (2, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane",
      store_3, [P16; F32; U16; U32; S16; S32];

    (* VLD3 variants.  *)
    Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3", bits_1, pf_su_8_32;
    Vldx 3, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3", bits_1, [P64];
    Vldx 3, [Instruction_name ["vld1"]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3", bits_1, [S64; U64];
    Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
                                              CstPtrTo Corereg |];
                              Use_operands [| VecArray (3, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |],
      "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32;

    Vldx_lane 3,
      [Disassembles_as [Use_operands
        [| VecArray (3, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg;
                      VecArray (3, Dreg); Immed |],
      "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
    Vldx_lane 3,
      [Disassembles_as [Use_operands
        [| VecArray (3, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg;
                      VecArray (3, Qreg); Immed |],
      "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];

    Vldx_dup 3,
      [Disassembles_as [Use_operands
        [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3_dup", bits_1, pf_su_8_32;
    Vldx_dup 3,
      [Requires_feature "CRYPTO";
       Instruction_name ["vld1"]; Disassembles_as [Use_operands
        [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3_dup", bits_1, [P64];
    Vldx_dup 3,
      [Instruction_name ["vld1"]; Disassembles_as [Use_operands
        [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3_dup", bits_1, [S64; U64];

    (* VST3 variants.  *)
    Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
      store_1, pf_su_8_32;
    Vstx 3, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
      store_1, [P64];
    Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
      store_1, [S64; U64];
    Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
                                              PtrTo Corereg |];
                              Use_operands [| VecArray (3, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q",
      store_1, pf_su_8_32;

    Vstx_lane 3,
      [Disassembles_as [Use_operands
        [| VecArray (3, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane",
      store_3, P8 :: P16 :: F32 :: su_8_32;
    Vstx_lane 3,
      [Disassembles_as [Use_operands
        [| VecArray (3, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane",
      store_3, [P16; F32; U16; U32; S16; S32];

    (* VLD4/VST4 variants.  *)
    Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4", bits_1, pf_su_8_32;
    Vldx 4, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4", bits_1, [P64];
    Vldx 4, [Instruction_name ["vld1"]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4", bits_1, [S64; U64];
    Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              CstPtrTo Corereg |];
                              Use_operands [| VecArray (4, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |],
      "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32;

    Vldx_lane 4,
      [Disassembles_as [Use_operands
        [| VecArray (4, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg;
                      VecArray (4, Dreg); Immed |],
      "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
    Vldx_lane 4,
      [Disassembles_as [Use_operands
        [| VecArray (4, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg;
                      VecArray (4, Qreg); Immed |],
      "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];

    Vldx_dup 4,
      [Disassembles_as [Use_operands
        [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4_dup", bits_1, pf_su_8_32;
    Vldx_dup 4,
      [Requires_feature "CRYPTO";
       Instruction_name ["vld1"]; Disassembles_as [Use_operands
        [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4_dup", bits_1, [P64];
    Vldx_dup 4,
      [Instruction_name ["vld1"]; Disassembles_as [Use_operands
        [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4_dup", bits_1, [S64; U64];

    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
      store_1, pf_su_8_32;
    Vstx 4, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
      store_1, [P64];
    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
      store_1, [S64; U64];
    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |];
                              Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q",
      store_1, pf_su_8_32;

    Vstx_lane 4,
      [Disassembles_as [Use_operands
        [| VecArray (4, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane",
      store_3, P8 :: P16 :: F32 :: su_8_32;
    Vstx_lane 4,
      [Disassembles_as [Use_operands
        [| VecArray (4, Element_of_dreg);
           CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane",
      store_3, [P16; F32; U16; U32; S16; S32];

    (* Logical operations. And.  *)
    Vand, [], All (3, Dreg), "vand", notype_2, su_8_32;
    Vand, [No_op], All (3, Dreg), "vand", notype_2, [S64; U64];
    Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64;

    (* Or.  *)
    Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_32;
    Vorr, [No_op], All (3, Dreg), "vorr", notype_2, [S64; U64];
    Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64;

    (* Eor.  *)
    Veor, [], All (3, Dreg), "veor", notype_2, su_8_32;
    Veor, [No_op], All (3, Dreg), "veor", notype_2, [S64; U64];
    Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64;

    (* Bic (And-not).  *)
    Vbic, [Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2, su_8_32;
    Vbic, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2,
      [S64; U64];
    Vbic, [Compiler_optim "-O2"], All (3, Qreg), "vbicQ", notype_2, su_8_64;

    (* Or-not.  *)
    Vorn, [Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2, su_8_32;
    Vorn, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2,
      [S64; U64];
    Vorn, [Compiler_optim "-O2"], All (3, Qreg), "vornQ", notype_2, su_8_64;
  ]

let type_in_crypto_only t
  = (t == P64) || (t == P128)

let cross_product s1 s2
  = List.filter (fun (e, e') -> e <> e')
                (List.concat
                   (List.map (fun e1 -> List.map (fun e2 -> (e1,e2)) s1) s2))
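
(* A quick illustration (not used by the generator): the identity pairs are
   filtered out, so e.g.
     cross_product [S8; U8] [S8; U8] = [(S8, U8); (U8, S8)]  *)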

let reinterp =
  let elems = P8 :: P16 :: F32 :: P64 :: su_8_64 in
  let casts = cross_product elems elems in
  List.map
    (fun (convto, convfrom) ->
       Vreinterp,
       (if type_in_crypto_only convto || type_in_crypto_only convfrom
        then [Requires_feature "CRYPTO"] else []) @ [No_op],
       Use_operands [| Dreg; Dreg |], "vreinterpret", conv_1,
       [Cast (convto, convfrom)])
    casts

let reinterpq =
  let elems = P8 :: P16 :: F32 :: P64 :: P128 :: su_8_64 in
  let casts = cross_product elems elems in
  List.map
    (fun (convto, convfrom) ->
       Vreinterp,
       (if type_in_crypto_only convto || type_in_crypto_only convfrom
        then [Requires_feature "CRYPTO"] else []) @ [No_op],
       Use_operands [| Qreg; Qreg |], "vreinterpretQ", conv_1,
       [Cast (convto, convfrom)])
    casts
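
(* For illustration: since P64 is a crypto-only type, the (P64, S8) pair
   above expands to an entry equivalent to
     Vreinterp, [Requires_feature "CRYPTO"; No_op],
       Use_operands [| Dreg; Dreg |], "vreinterpret", conv_1,
       [Cast (P64, S8)]
   while pairs of non-crypto types get just [No_op].  *)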

(* Output routines.  *)

let rec string_of_elt = function
    S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64"
  | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64"
  | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64"
  | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64"
  | F16 -> "f16" | F32 -> "f32" | P8 -> "p8" | P16 -> "p16"
  | P64 -> "p64" | P128 -> "p128"
  | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b
  | NoElts -> failwith "No elts"

let string_of_elt_dots elt =
  match elt with
    Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b
  | _ -> string_of_elt elt
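
(* For illustration: string_of_elt (Cast (S8, U8)) = "s8_u8", whereas
   string_of_elt_dots (Cast (S8, U8)) = "s8.u8".  *)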

let string_of_vectype vt =
  let rec name affix = function
    T_int8x8 -> affix "int8x8"
  | T_int8x16 -> affix "int8x16"
  | T_int16x4 -> affix "int16x4"
  | T_int16x8 -> affix "int16x8"
  | T_int32x2 -> affix "int32x2"
  | T_int32x4 -> affix "int32x4"
  | T_int64x1 -> affix "int64x1"
  | T_int64x2 -> affix "int64x2"
  | T_uint8x8 -> affix "uint8x8"
  | T_uint8x16 -> affix "uint8x16"
  | T_uint16x4 -> affix "uint16x4"
  | T_uint16x8 -> affix "uint16x8"
  | T_uint32x2 -> affix "uint32x2"
  | T_uint32x4 -> affix "uint32x4"
  | T_uint64x1 -> affix "uint64x1"
  | T_uint64x2 -> affix "uint64x2"
  | T_float16x4 -> affix "float16x4"
  | T_float32x2 -> affix "float32x2"
  | T_float32x4 -> affix "float32x4"
  | T_poly8x8 -> affix "poly8x8"
  | T_poly8x16 -> affix "poly8x16"
  | T_poly16x4 -> affix "poly16x4"
  | T_poly16x8 -> affix "poly16x8"
  | T_int8 -> affix "int8"
  | T_int16 -> affix "int16"
  | T_int32 -> affix "int32"
  | T_int64 -> affix "int64"
  | T_uint8 -> affix "uint8"
  | T_uint16 -> affix "uint16"
  | T_uint32 -> affix "uint32"
  | T_uint64 -> affix "uint64"
  | T_poly8 -> affix "poly8"
  | T_poly16 -> affix "poly16"
  | T_poly64 -> affix "poly64"
  | T_poly64x1 -> affix "poly64x1"
  | T_poly64x2 -> affix "poly64x2"
  | T_poly128 -> affix "poly128"
  | T_float16 -> affix "float16"
  | T_float32 -> affix "float32"
  | T_immediate _ -> "const int"
  | T_void -> "void"
  | T_intQI -> "__builtin_neon_qi"
  | T_intHI -> "__builtin_neon_hi"
  | T_intSI -> "__builtin_neon_si"
  | T_intDI -> "__builtin_neon_di"
  | T_intTI -> "__builtin_neon_ti"
  | T_floatHF -> "__builtin_neon_hf"
  | T_floatSF -> "__builtin_neon_sf"
  | T_arrayof (num, base) ->
      let basename = name (fun x -> x) base in
      affix (Printf.sprintf "%sx%d" basename num)
  | T_ptrto x ->
      let basename = name affix x in
      Printf.sprintf "%s *" basename
  | T_const x ->
      let basename = name affix x in
      Printf.sprintf "const %s" basename
  in
    name (fun x -> x ^ "_t") vt
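
(* For illustration: T_arrayof rebuilds the element name without its "_t"
   suffix before appending the count, while pointer and const wrappers keep
   the affixed name, so e.g.
     string_of_vectype (T_arrayof (4, T_uint16x4)) = "uint16x4x4_t"
     string_of_vectype (T_ptrto (T_const T_int8x8)) = "const int8x8_t *"  *)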

let string_of_inttype = function
    B_TImode -> "__builtin_neon_ti"
  | B_EImode -> "__builtin_neon_ei"
  | B_OImode -> "__builtin_neon_oi"
  | B_CImode -> "__builtin_neon_ci"
  | B_XImode -> "__builtin_neon_xi"

let string_of_mode = function
    V8QI -> "v8qi" | V4HI -> "v4hi" | V4HF  -> "v4hf"  | V2SI -> "v2si"
  | V2SF -> "v2sf" | DI   -> "di"   | V16QI -> "v16qi" | V8HI -> "v8hi"
  | V4SI -> "v4si" | V4SF -> "v4sf" | V2DI  -> "v2di"  | QI   -> "qi"
  | HI -> "hi" | SI -> "si" | SF -> "sf" | TI -> "ti"
(* Uppercase characters mark letters that form part of the intrinsic name
   but should be omitted from the builtin name (that information is passed
   in an extra argument instead).  *)
let intrinsic_name name = String.lowercase name
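
(* For illustration: intrinsic_name "vmulQ_n" = "vmulq_n" -- the "Q" is kept
   (lowercased) in the intrinsic name but dropped from the builtin name (see
   builtin_name below).  *)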

(* Allow the name of the builtin to be overridden by entries (e.g. Flipped
   or Builtin_name) found in the features list.  *)
let builtin_name features name =
  let name = List.fold_right
               (fun el name ->
                 match el with
                   Flipped x | Builtin_name x -> x
                 | _ -> name)
               features name in
  let islower x = let str = String.make 1 x in (String.lowercase str) = str
  and buf = Buffer.create (String.length name) in
  String.iter (fun c -> if islower c then Buffer.add_char buf c) name;
  Buffer.contents buf
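
(* For illustration: builtin_name [] "vmulQ_n" = "vmul_n" -- the uppercase
   "Q" is filtered out, while lowercase letters, digits and underscores are
   kept.  *)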

(* Transform an arity into a list of strings.  *)
let strings_of_arity a =
  match a with
  | Arity0 vt -> [string_of_vectype vt]
  | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2]
  | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1;
                               string_of_vectype vt2;
                               string_of_vectype vt3]
  | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1;
                                    string_of_vectype vt2;
                                    string_of_vectype vt3;
                                    string_of_vectype vt4]
  | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1;
                                         string_of_vectype vt2;
                                         string_of_vectype vt3;
                                         string_of_vectype vt4;
                                         string_of_vectype vt5]

(* Suffixes on the end of builtin names that are to be stripped in order
   to obtain the name used as an instruction.  They are only stripped if
   immediately preceded by an underscore.  *)
let suffixes_to_strip = [ "n"; "lane"; "dup" ]

(* Get the possible names of an instruction corresponding to a "name" from
   the ops table.  This is done by getting the equivalent builtin name and
   stripping any suffixes from the suffixes_to_strip list above, unless the
   features list contains an Instruction_name entry, in which case that is
   used, or a Flipped entry, in which case that is used.  If both such
   entries are present, the first in the list wins.  *)
let get_insn_names features name =
  let names = try
  begin
    match List.find (fun feature -> match feature with
                                      Instruction_name _ -> true
                                    | Flipped _ -> true
                                    | _ -> false) features
    with
      Instruction_name names -> names
    | Flipped name -> [name]
    | _ -> assert false
  end
  with Not_found -> [builtin_name features name]
  in
  begin
    List.map (fun name' ->
      try
        let underscore = String.rindex name' '_' in
        let our_suffix = String.sub name' (underscore + 1)
                                    ((String.length name') - underscore - 1)
        in
          let rec strip remaining_suffixes =
            match remaining_suffixes with
              [] -> name'
            | s::ss when our_suffix = s -> String.sub name' 0 underscore
            | _::ss -> strip ss
          in
            strip suffixes_to_strip
      with (Not_found | Invalid_argument _) -> name') names
  end
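
(* For illustration: get_insn_names [] "vmla_n" = ["vmla"], since the "_n"
   suffix is stripped from the builtin name, whereas an explicit entry wins:
     get_insn_names [Instruction_name ["vmov"]] "vset_lane" = ["vmov"]  *)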

(* Apply a function to each element of a list and then comma-separate
   the resulting strings.  *)
let rec commas f elts acc =
  match elts with
    [] -> acc
  | [elt] -> acc ^ (f elt)
  | elt::elts ->
    commas f elts (acc ^ (f elt) ^ ", ")
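
(* For illustration: commas string_of_int [1; 2; 3] "" = "1, 2, 3".  *)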

(* Given a list of features and the shape specified in the "ops" table, apply
   a function to each possible shape that the instruction may have.
   By default, this is the "shape" entry in "ops".  If the features list
   contains a Disassembles_as entry, the shapes contained in that entry are
   mapped to corresponding outputs and returned in a list.  If there is more
   than one Disassembles_as entry, only the first is used.  *)
let analyze_all_shapes features shape f =
  try
    match List.find (fun feature ->
                       match feature with Disassembles_as _ -> true
                                        | _ -> false)
                    features with
      Disassembles_as shapes -> List.map f shapes
    | _ -> assert false
  with Not_found -> [f shape]
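
(* For illustration: analyze_all_shapes [] shape f = [f shape], whereas
   analyze_all_shapes [Disassembles_as [s1; s2]] shape f = [f s1; f s2].  *)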

(* The crypto intrinsics have unconventional shapes and are too few to be
   worth the trouble of encoding in the ops table above, so we implement
   them explicitly here.  *)
let crypto_intrinsics =
"
#ifdef __ARM_FEATURE_CRYPTO

__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vldrq_p128 (poly128_t const * __ptr)
{
#ifdef __ARM_BIG_ENDIAN
  poly64_t* __ptmp = (poly64_t*) __ptr;
  poly64_t __d0 = vld1_p64 (__ptmp);
  poly64_t __d1 = vld1_p64 (__ptmp + 1);
  return vreinterpretq_p128_p64 (vcombine_p64 (__d1, __d0));
#else
  return vreinterpretq_p128_p64 (vld1q_p64 ((poly64_t*) __ptr));
#endif
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vstrq_p128 (poly128_t * __ptr, poly128_t __val)
{
#ifdef __ARM_BIG_ENDIAN
  poly64x2_t __tmp = vreinterpretq_p64_p128 (__val);
  poly64_t __d0 = vget_high_p64 (__tmp);
  poly64_t __d1 = vget_low_p64 (__tmp);
  vst1q_p64 ((poly64_t*) __ptr, vcombine_p64 (__d0, __d1));
#else
  vst1q_p64 ((poly64_t*) __ptr, vreinterpretq_p64_p128 (__val));
#endif
}

/* The vceq_p64 intrinsic does not map to a single instruction.
   Instead we emulate it by performing a 32-bit variant of the vceq
   and applying a pairwise min reduction to the result.
   vceq_u32 will produce two 32-bit halves, each of which will contain either
   all ones or all zeros depending on whether the corresponding 32-bit
   halves of the poly64_t were equal.  The whole poly64_t values are equal
   if and only if both halves are equal, i.e. vceq_u32 returns all ones.
   If the result is all zeroes for any half then the whole result is zeroes.
   This is what the pairwise min reduction achieves.  */
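
/* For example, if the two poly64_t values differ only in their high
   32-bit halves, vceq_u32 yields { 0xffffffff, 0x00000000 }; the
   pairwise minimum then gives zero in both lanes, i.e. "not equal".  */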

__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vceq_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmin_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}

/* The vtst_p64 intrinsic does not map to a single instruction.
   We emulate it in a way similar to vceq_p64 above, but here we reduce
   with max: if any pair of corresponding bits in the two poly64_t values
   is set in both, the whole result must be all ones.  */
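
/* For example, if the two inputs have set bits in common only in their low
   32-bit halves, vtst_u32 yields { 0xffffffff, 0x00000000 }; the pairwise
   maximum then gives all ones in both lanes, i.e. "true".  */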

__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vtst_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vtst_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmax_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
{
  return __builtin_arm_crypto_aese (__data, __key);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaesdq_u8 (uint8x16_t __data, uint8x16_t __key)
{
  return __builtin_arm_crypto_aesd (__data, __key);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaesmcq_u8 (uint8x16_t __data)
{
  return __builtin_arm_crypto_aesmc (__data);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaesimcq_u8 (uint8x16_t __data)
{
  return __builtin_arm_crypto_aesimc (__data);
}

__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vsha1h_u32 (uint32_t __hash_e)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  __t = __builtin_arm_crypto_sha1h (__t);
  return vgetq_lane_u32 (__t, 0);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11)
{
  return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15)
{
  return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
{
  return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
{
  return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7)
{
  return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15)
{
  return __builtin_arm_crypto_sha256su1 (__tw0_3, __w8_11, __w12_15);
}

__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vmull_p64 (poly64_t __a, poly64_t __b)
{
  return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b);
}

__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
{
  poly64_t __t1 = vget_high_p64 (__a);
  poly64_t __t2 = vget_high_p64 (__b);

  return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __t1, (uint64_t) __t2);
}

#endif
"
