1(*
2    Copyright David C. J. Matthews 1989, 2000, 2009-10, 2012-13, 2015-20
3    
4    Based on original code:    
5    Copyright (c) 2000
6        Cambridge University Technical Services Limited
7
8    This library is free software; you can redistribute it and/or
9    modify it under the terms of the GNU Lesser General Public
10    License version 2.1 as published by the Free Software Foundation.
11    
12    This library is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    Lesser General Public License for more details.
16    
17    You should have received a copy of the GNU Lesser General Public
18    License along with this library; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20*)
21
22(*
23    Title:      Code Generator Routines.
24    Author:     Dave Matthews, Cambridge University Computer Laboratory
25    Copyright   Cambridge University 1989
26*)
27
28(* This module contains the code vector and operations to insert code into
29   it. Each procedure is compiled into a separate segment. Initially it is
30   compiled into a fixed size segment, and then copied into a segment of the
31   correct size at the end.
32   This module contains all the definitions of the X86 opCodes and registers.
33   It uses "codeseg" to create and operate on the segment itself.
34 *)
35
36functor X86OUTPUTCODE (
37structure DEBUG: DEBUG
38structure PRETTY: PRETTYSIG (* for compilerOutTag *)
39structure CODE_ARRAY: CODEARRAYSIG
40
41) : X86CODESIG =
42
43struct
44    open CODE_ARRAY
45    open DEBUG
46    open Address
47    open Misc
48
49    (* May be targeted at native 32-bit, native 64-bit or X86/64 with 32-bit words
50       and addresses as object Ids. *)
51    datatype targetArch = Native32Bit | Native64Bit | ObjectId32Bit
52        
53    val targetArch =
54        case PolyML.architecture() of
55            "I386"      => Native32Bit
56        |   "X86_64"    => Native64Bit
57        |   "X86_64_32" => ObjectId32Bit
58        |   _ => raise InternalError "Unknown target architecture"
59    
60    (* Some checks - *)
61    val () =
62        case (targetArch, wordSize, nativeWordSize) of
63            (Native32Bit, 0w4, 0w4) => ()
64        |   (Native64Bit, 0w8, 0w8) => ()
65        |   (ObjectId32Bit, 0w4, 0w8) => ()
66        |   _ => raise InternalError "Mismatch of architecture and word-length"
67
68    val hostIsX64 = targetArch <> Native32Bit
69
70    infix 5 << <<+ <<- >> >>+ >>- ~>> ~>>+ ~>>- (* Shift operators *)
71    infix 3 andb orb xorb andbL orbL xorbL andb8 orb8 xorb8
72    
73    val op << = Word.<< and op >> = Word.>>
74    val (*op <<+ = LargeWord.<< and *) op >>+ = LargeWord.>>
75    val op <<- = Word8.<< and op >>- = Word8.>>
76
77    val op orb8 = Word8.orb
78    val op andb8 = Word8.andb
79
80    val op andb = Word.andb (* and op andbL = LargeWord.andb *)
81    and op orb  = Word.orb
82
83    val wordToWord8 = Word8.fromLargeWord o Word.toLargeWord
84    (*and word8ToWord = Word.fromLargeWord o Word8.toLargeWord*)
85
86    val exp2_16 =        0x10000
87    val exp2_31 =        0x80000000: LargeInt.int
88
89    (* Returns true if this a 32-bit machine or if the constant is within 32-bits.
90       This is exported to the higher levels.  N.B.  The test for not isX64
91       avoids a significant overhead with arbitrary precision arithmetic on
92       X86/32. *)
93    fun is32bit v = not hostIsX64 orelse ~exp2_31 <= v andalso v < exp2_31
94
95    (* tag a short constant *)
96    fun tag c = 2 * c + 1;
97
98    fun is8BitL (n: LargeInt.int) = ~ 0x80 <= n andalso n < 0x80
99
100    local
101        val shift =
102            if wordSize = 0w4
103            then 0w2
104            else if wordSize = 0w8
105            then 0w3
106            else raise InternalError "Invalid word size for x86_32 or x86+64"
107    in
108        fun wordsToBytes n = n << shift
109        and bytesToWords n = n >> shift
110    end
111
112    infix 6 addrPlus addrMinus;
113  
114    (* All indexes into the code vector have type "addrs". This is really a legacy. *)
115    type addrs = Word.word
116  
117    val addrZero = 0w0
118 
119    (* This is the external label type used when constructing operations. *)
120    datatype label = Label of { labelNo: int }
121   
122  (* Constants which are too large to go inline in the code are put in
123     a list and put at the end of the code. They are arranged so that
124     the garbage collector can find them and change them as necessary.
125     A reference to a constant is treated like a forward reference to a
126     label. *)
127
128    datatype code =
129    Code of 
130    {
131        procName:       string,         (* Name of the procedure. *)
132        printAssemblyCode:bool,            (* Whether to print the code when we finish. *)
133        printStream:    string->unit,   (* The stream to use *)
134        lowLevelOptimise: bool,         (* Whether to do the low-level optimisation pass *)
135        profileObject   : machineWord  (* The profile object for this code. *)
136    }
137
138    (* Exported functions *)
139    fun lowLevelOptimise(Code{lowLevelOptimise, ...}) = lowLevelOptimise
140
141  (* EBP/RBP points to a structure that interfaces to the RTS.  These are
142     offsets into that structure. *)
143    val memRegLocalMPointer       = 0 (* Not used in 64-bit *)
144    and memRegHandlerRegister     = Word.toInt nativeWordSize
145    and memRegLocalMbottom        = 2 * Word.toInt nativeWordSize
146    and memRegStackLimit          = 3 * Word.toInt nativeWordSize
147    and memRegExceptionPacket     = 4 * Word.toInt nativeWordSize
148    and memRegCStackPtr           = 6 * Word.toInt nativeWordSize
149    and memRegThreadSelf          = 7 * Word.toInt nativeWordSize
150    and memRegStackPtr            = 8 * Word.toInt nativeWordSize
151    and memRegHeapOverflowCall    = 10 * Word.toInt nativeWordSize
152    and memRegStackOverflowCall   = 11 * Word.toInt nativeWordSize
153    and memRegStackOverflowCallEx = 12 * Word.toInt nativeWordSize
154    and memRegSavedRbx            = 14 * Word.toInt nativeWordSize (* Heap base in 32-in-64. *)
155
156    (* create and initialise a code segment *)
157    fun codeCreate (name : string, profObj, parameters) : code =
158    let
159        val printStream = PRETTY.getSimplePrinter(parameters, [])
160    in
161        Code
162        { 
163            procName       = name,
164            printAssemblyCode = DEBUG.getParameter DEBUG.assemblyCodeTag parameters,
165            printStream    = printStream,
166            lowLevelOptimise = DEBUG.getParameter DEBUG.lowlevelOptimiseTag parameters,
167            profileObject  = profObj
168        }
169    end
170
171    (* Put 1 unsigned byte at a given offset in the segment. *)
172    fun set8u (b, addr, seg) = byteVecSet (seg, addr,  b)
173 
174    (* Put 4 bytes at a given offset in the segment. *)
175    (* b0 is the least significant byte. *)
176    fun set4Bytes (b3, b2, b1, b0, addr, seg) =
177    let
178        val a = addr;
179    in
180        (* Little-endian *)
181        byteVecSet (seg, a,     b0);
182        byteVecSet (seg, a + 0w1, b1);
183        byteVecSet (seg, a + 0w2, b2);
184        byteVecSet (seg, a + 0w3, b3)
185    end;
186
187    (* Put 1 unsigned word at a given offset in the segment. *)
188    fun set32u (ival: LargeWord.word, addr, seg) : unit =
189    let
190        val b3       = Word8.fromLargeWord (ival >>+ 0w24)
191        val b2       = Word8.fromLargeWord (ival >>+ 0w16)
192        val b1       = Word8.fromLargeWord (ival >>+ 0w8)
193        val b0       = Word8.fromLargeWord ival
194    in
195        set4Bytes (b3, b2, b1, b0, addr, seg)
196    end
197
198    (* Put 1 signed word at a given offset in the segment. *)
199    fun set32s (ival: LargeInt.int, addr, seg) = set32u(LargeWord.fromLargeInt ival, addr, seg)
200    
201    fun byteSigned ival =
202        if ~0x80 <= ival andalso ival < 0x80
203        then Word8.fromInt ival
204        else raise InternalError "byteSigned: invalid byte"
205    
206    (* Convert a large-word value to a little-endian byte sequence. *)
207    fun largeWordToBytes(_, 0) = []
208    |   largeWordToBytes(ival: LargeWord.word, n) =
209            Word8.fromLargeWord ival :: largeWordToBytes(ival >>+ 0w8, n-1)
210
211    fun word32Unsigned(ival: LargeWord.word) = largeWordToBytes(ival, 4)
212
213    fun int32Signed(ival: LargeInt.int) =
214        if is32bit ival
215        then word32Unsigned(LargeWord.fromLargeInt ival)
216        else raise InternalError "int32Signed: invalid word"
217
218    (* Registers. *)
219    datatype genReg = GeneralReg of Word8.word * bool
220    and fpReg = FloatingPtReg of Word8.word
221    and xmmReg = SSE2Reg of Word8.word
222    
223    datatype reg =
224        GenReg of genReg
225    |   FPReg of fpReg
226    |   XMMReg of xmmReg
227
228    (* These are the real registers we have.  The AMD extension encodes the
229       additional registers through the REX prefix. *)
230    val rax = GeneralReg (0w0, false)
231    val rcx = GeneralReg (0w1, false)
232    val rdx = GeneralReg (0w2, false)
233    val rbx = GeneralReg (0w3, false)
234    val rsp = GeneralReg (0w4, false)
235    val rbp = GeneralReg (0w5, false)
236    val rsi = GeneralReg (0w6, false)
237    val rdi = GeneralReg (0w7, false)
238    val eax = rax and ecx = rcx and edx = rdx and ebx = rbx
239    and esp = rsp and ebp = rbp and esi = rsi and edi = rdi
240    val r8  = GeneralReg (0w0, true)
241    val r9  = GeneralReg (0w1, true)
242    val r10 = GeneralReg (0w2, true)
243    val r11 = GeneralReg (0w3, true)
244    val r12 = GeneralReg (0w4, true)
245    val r13 = GeneralReg (0w5, true)
246    val r14 = GeneralReg (0w6, true)
247    val r15 = GeneralReg (0w7, true)
248
249    (* Floating point "registers".  Actually entries on the floating point stack.
250       The X86 has a floating point stack with eight entries. *)
251    val fp0 = FloatingPtReg 0w0
252    and fp1 = FloatingPtReg 0w1
253    and fp2 = FloatingPtReg 0w2
254    and fp3 = FloatingPtReg 0w3
255    and fp4 = FloatingPtReg 0w4
256    and fp5 = FloatingPtReg 0w5
257    and fp6 = FloatingPtReg 0w6
258    and fp7 = FloatingPtReg 0w7
259
260    (* SSE2 Registers.  These are used for floating point in 64-bity mode.
261       We only use XMM0-6 because the others are callee save and we don't
262       currently save them. *)
263    val xmm0 = SSE2Reg 0w0
264    and xmm1 = SSE2Reg 0w1
265    and xmm2 = SSE2Reg 0w2
266    and xmm3 = SSE2Reg 0w3
267    and xmm4 = SSE2Reg 0w4
268    and xmm5 = SSE2Reg 0w5
269    and xmm6 = SSE2Reg 0w6
270    and xmm7 = SSE2Reg 0w7
271
272    fun getReg (GeneralReg r) = r
273    fun mkReg  n      = GeneralReg n  (* reg.up   *)
274  
275    (* The maximum size of the register vectors and masks.  Although the
276       X86/32 has a floating point stack with eight entries it's much simpler
277       to treat it as having seven "real" registers.  Items are pushed to the
278       stack and then stored and popped into the current location.  It may be
279       possible to improve the code by some peephole optimisation. *)
280    val regs = 30 (* Include the X86/64 registers even if this is 32-bit. *)
281
282    (* The nth register (counting from 0). *)
283    (* Profiling shows that applying the constructors here creates a lot of
284       garbage.  Create the entries once and then use vector indexing instead. *)
285    local
286        fun regN i =
287            if i < 8
288            then GenReg(GeneralReg(Word8.fromInt i, false))
289            else if i < 16
290            then GenReg(GeneralReg(Word8.fromInt(i-8), true))
291            else if i < 23
292            then FPReg(FloatingPtReg(Word8.fromInt(i-16)))
293            else XMMReg(SSE2Reg(Word8.fromInt(i-23)))
294        val regVec = Vector.tabulate(regs, regN)
295    in
296        fun regN i = Vector.sub(regVec, i) handle Subscript => raise InternalError "Bad register number"
297    end
298 
299    (* The number of the register. *)
300    fun nReg(GenReg(GeneralReg(r, false))) = Word8.toInt r
301    |   nReg(GenReg(GeneralReg(r, true))) = Word8.toInt r + 8
302    |   nReg(FPReg(FloatingPtReg r)) = Word8.toInt r + 16
303    |   nReg(XMMReg(SSE2Reg r)) = Word8.toInt r + 23
304        
305    datatype opsize = SZByte | SZWord | SZDWord | SZQWord
306    
307    (* Default size when printing regs. *)
308    val sz32_64 = if hostIsX64 then SZQWord else SZDWord
309
310    fun genRegRepr(GeneralReg (0w0, false), SZByte) = "al"
311    |   genRegRepr(GeneralReg (0w1, false), SZByte) = "cl"
312    |   genRegRepr(GeneralReg (0w2, false), SZByte) = "dl"
313    |   genRegRepr(GeneralReg (0w3, false), SZByte) = "bl"
314    |   genRegRepr(GeneralReg (0w4, false), SZByte) = "ah"
315    |   genRegRepr(GeneralReg (0w5, false), SZByte) = "ch"
316    |   genRegRepr(GeneralReg (0w6, false), SZByte) = "sil" (* Assume there's a Rex code that forces low-order reg *)
317    |   genRegRepr(GeneralReg (0w7, false), SZByte) = "dil"
318    |   genRegRepr(GeneralReg (reg, true),  SZByte) = "r" ^ Int.toString(Word8.toInt reg +8) ^ "b"
319    |   genRegRepr(GeneralReg (0w0, false), SZDWord) = "eax"
320    |   genRegRepr(GeneralReg (0w1, false), SZDWord) = "ecx"
321    |   genRegRepr(GeneralReg (0w2, false), SZDWord) = "edx"
322    |   genRegRepr(GeneralReg (0w3, false), SZDWord) = "ebx"
323    |   genRegRepr(GeneralReg (0w4, false), SZDWord) = "esp"
324    |   genRegRepr(GeneralReg (0w5, false), SZDWord) = "ebp"
325    |   genRegRepr(GeneralReg (0w6, false), SZDWord) = "esi"
326    |   genRegRepr(GeneralReg (0w7, false), SZDWord) = "edi"
327    |   genRegRepr(GeneralReg (reg, true),  SZDWord) = "r" ^ Int.toString(Word8.toInt reg +8) ^ "d"
328    |   genRegRepr(GeneralReg (0w0, false), SZQWord) = "rax"
329    |   genRegRepr(GeneralReg (0w1, false), SZQWord) = "rcx"
330    |   genRegRepr(GeneralReg (0w2, false), SZQWord) = "rdx"
331    |   genRegRepr(GeneralReg (0w3, false), SZQWord) = "rbx"
332    |   genRegRepr(GeneralReg (0w4, false), SZQWord) = "rsp"
333    |   genRegRepr(GeneralReg (0w5, false), SZQWord) = "rbp"
334    |   genRegRepr(GeneralReg (0w6, false), SZQWord) = "rsi"
335    |   genRegRepr(GeneralReg (0w7, false), SZQWord) = "rdi"
336    |   genRegRepr(GeneralReg (reg, true),  SZQWord) = "r" ^ Int.toString(Word8.toInt reg +8)
337    |   genRegRepr(GeneralReg (0w0, false), SZWord) = "ax"
338    |   genRegRepr(GeneralReg (0w1, false), SZWord) = "cx"
339    |   genRegRepr(GeneralReg (0w2, false), SZWord) = "dx"
340    |   genRegRepr(GeneralReg (0w3, false), SZWord) = "bx"
341    |   genRegRepr(GeneralReg (0w4, false), SZWord) = "sp"
342    |   genRegRepr(GeneralReg (0w5, false), SZWord) = "bp"
343    |   genRegRepr(GeneralReg (0w6, false), SZWord) = "si"
344    |   genRegRepr(GeneralReg (0w7, false), SZWord) = "di"
345    |   genRegRepr(GeneralReg (reg, true),  SZWord) = "r" ^ Int.toString(Word8.toInt reg +8) ^ "w"
346    |   genRegRepr _ = "unknown" (* Suppress warning because word values are not exhaustive. *)
347
348    and fpRegRepr(FloatingPtReg n) = "fp" ^ Word8.toString n
349    
350    and xmmRegRepr(SSE2Reg n) = "xmm" ^ Word8.toString n
351
352    fun regRepr(GenReg r) = genRegRepr (r, sz32_64)
353    |   regRepr(FPReg r) = fpRegRepr r
354    |   regRepr(XMMReg r) = xmmRegRepr r
355
356    (* Install a pretty printer.  This is simply for when this code is being
357       run under the debugger.  N.B. We need PolyML.PrettyString here. *)
358    val () = PolyML.addPrettyPrinter(fn _ => fn _ => fn r => PolyML.PrettyString(regRepr r))
359    
360    datatype argType = ArgGeneral | ArgFP
361    
362    (* Size of operand.  OpSize64 is only valid in 64-bit mode. *)
363    datatype opSize = OpSize32 | OpSize64
364
365    structure RegSet =
366    struct
367        (* Implement a register set as a bit mask. *)
368        datatype regSet = RegSet of word
369        fun singleton r = RegSet(0w1 << Word.fromInt(nReg r))
370        fun regSetUnion(RegSet r1, RegSet r2) = RegSet(Word.orb(r1, r2))
371        fun regSetIntersect(RegSet r1, RegSet r2) = RegSet(Word.andb(r1, r2))
372
373        local
374            fun addReg(acc, n) =
375                if n = regs then acc else addReg(regSetUnion(acc, singleton(regN n)), n+1)
376        in
377            val allRegisters = addReg(RegSet 0w0, 0)
378        end
379
380        val noRegisters = RegSet 0w0
381
382        fun inSet(r, rs) = regSetIntersect(singleton r, rs) <> noRegisters
383        
384        fun regSetMinus(RegSet s1, RegSet s2) = RegSet(Word.andb(s1, Word.notb s2))
385        
386        val listToSet = List.foldl (fn(r, rs) => regSetUnion(singleton r, rs)) noRegisters
387
388        local
389            val regs =
390                case targetArch of
391                    Native32Bit => [eax, ecx, edx, ebx, esi, edi]
392                |   Native64Bit => [eax, ecx, edx, ebx, esi, edi, r8, r9, r10, r11, r12, r13, r14]
393                |   ObjectId32Bit => [eax, ecx, edx, esi, edi, r8, r9, r10, r11, r12, r13, r14]
394        in
395            val generalRegisters = listToSet(map GenReg regs)
396        end
397        
398        (* The floating point stack.  Note that this excludes one item so it is always
399           possible to load a value onto the top of the FP stack. *)
400        val floatingPtRegisters =
401            listToSet(map FPReg [fp0, fp1, fp2, fp3, fp4, fp5, fp6(*, fp7*)])
402
403        val sse2Registers =
404            listToSet(map XMMReg [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6])
405
406        fun isAllRegs rs = rs = allRegisters
407
408        fun setToList (RegSet regSet)=
409        let
410            fun testBit (n, bit, res) =
411                if n = regs
412                then res
413                else testBit(n+1, bit << 0w1, 
414                        if (regSet andb bit) <> 0w0
415                        then regN n :: res else res)
416        in
417            testBit(0, 0w1, [])
418        end
419
420        val cardinality = List.length o setToList
421
422        (* Choose one of the set.  This chooses the least value which means that
423           the ordering of the registers is significant.  This is a hot-spot
424           so is coded directly with the word operations. *)
425        fun oneOf(RegSet regSet) =
426        let
427            fun find(n, bit) =
428                if n = Word.fromInt regs then raise InternalError "oneOf: empty"
429                else if Word.andb(bit, regSet) <> 0w0 then n
430                else find(n+0w1, Word.<<(bit, 0w1))
431        in
432            regN(Word.toInt(find(0w0, 0w1)))
433        end
434        
435        fun regSetRepr regSet =
436        let
437            val regs = setToList regSet
438        in
439            "[" ^ String.concatWith "," (List.map regRepr regs) ^ "]"
440        end
441        
442        (* Install a pretty printer for when this code is being debugged. *)
443        val () = PolyML.addPrettyPrinter(fn _ => fn _ => fn r => PolyML.PrettyString(regSetRepr r))
444     end
445
446    open RegSet
447
448    datatype arithOp = ADD | OR (*|ADC | SBB*) | AND | SUB | XOR | CMP
449  
450    fun arithOpToWord ADD = 0w0: Word8.word
451    |   arithOpToWord OR  = 0w1
452    |   arithOpToWord AND = 0w4
453    |   arithOpToWord SUB = 0w5
454    |   arithOpToWord XOR = 0w6
455    |   arithOpToWord CMP = 0w7
456
457    fun arithOpRepr ADD = "Add"
458    |   arithOpRepr OR  = "Or"
459    |   arithOpRepr AND = "And"
460    |   arithOpRepr SUB = "Sub"
461    |   arithOpRepr XOR = "Xor"
462    |   arithOpRepr CMP = "Cmp"
463
464    datatype shiftType = SHL | SHR | SAR
465
466    fun shiftTypeToWord SHL = 0w4: Word8.word
467    |   shiftTypeToWord SHR = 0w5
468    |   shiftTypeToWord SAR = 0w7
469
470    fun shiftTypeRepr SHL = "Shift Left Logical"
471    |   shiftTypeRepr SHR = "Shift Right Logical"
472    |   shiftTypeRepr SAR = "Shift Right Arithemetic"
473
474    datatype repOps = CMPS8 | MOVS8 | MOVS32 | STOS8 | STOS32 | MOVS64 | STOS64
475    
476    fun repOpsToWord CMPS8  = 0wxa6: Word8.word
477    |   repOpsToWord MOVS8  = 0wxa4
478    |   repOpsToWord MOVS32 = 0wxa5
479    |   repOpsToWord MOVS64 = 0wxa5 (* Plus Rex.w *)
480    |   repOpsToWord STOS8  = 0wxaa
481    |   repOpsToWord STOS32 = 0wxab
482    |   repOpsToWord STOS64 = 0wxab (* Plus Rex.w *)
483
484    fun repOpsRepr CMPS8    = "CompareBytes"
485    |   repOpsRepr MOVS8    = "MoveBytes"
486    |   repOpsRepr MOVS32   = "MoveWords32"
487    |   repOpsRepr MOVS64   = "MoveWords64"
488    |   repOpsRepr STOS8    = "StoreBytes"
489    |   repOpsRepr STOS32   = "StoreWords32"
490    |   repOpsRepr STOS64   = "StoreWords64"
491
492    datatype fpOps = FADD | FMUL | FCOM | FCOMP | FSUB | FSUBR | FDIV | FDIVR
493
494    fun fpOpToWord FADD  = 0w0: Word8.word
495    |   fpOpToWord FMUL  = 0w1
496    |   fpOpToWord FCOM  = 0w2
497    |   fpOpToWord FCOMP = 0w3
498    |   fpOpToWord FSUB  = 0w4
499    |   fpOpToWord FSUBR = 0w5
500    |   fpOpToWord FDIV  = 0w6
501    |   fpOpToWord FDIVR = 0w7
502
503    fun fpOpRepr FADD  = "FPAdd"
504    |   fpOpRepr FMUL  = "FPMultiply"
505    |   fpOpRepr FCOM  = "FPCompare"
506    |   fpOpRepr FCOMP = "FPCompareAndPop"
507    |   fpOpRepr FSUB  = "FPSubtract"
508    |   fpOpRepr FSUBR = "FPReverseSubtract"
509    |   fpOpRepr FDIV  = "FPDivide"
510    |   fpOpRepr FDIVR = "FPReverseDivide"
511
512    datatype fpUnaryOps = FCHS | FABS | FLD1 | FLDZ
513    
514    fun fpUnaryToWords FCHS   = {rm=0w0:Word8.word, nnn=0w4: Word8.word}
515    |   fpUnaryToWords FABS   = {rm=0w1, nnn=0w4}
516    |   fpUnaryToWords FLD1   = {rm=0w0, nnn=0w5}
517    |   fpUnaryToWords FLDZ   = {rm=0w6, nnn=0w5}
518
519    fun fpUnaryRepr FCHS   = "FPChangeSign"
520    |   fpUnaryRepr FABS   = "FPAbs"
521    |   fpUnaryRepr FLD1   = "FPLoadOne"
522    |   fpUnaryRepr FLDZ   = "FPLoadZero"
523
524    datatype branchOps = JO | JNO | JE | JNE | JL | JGE | JLE | JG | JB | JNB | JNA | JA | JP | JNP
525
526    fun branchOpToWord JO   = 0wx0: Word8.word
527    |   branchOpToWord JNO  = 0wx1
528    |   branchOpToWord JB   = 0wx2
529    |   branchOpToWord JNB  = 0wx3
530    |   branchOpToWord JE   = 0wx4
531    |   branchOpToWord JNE  = 0wx5
532    |   branchOpToWord JNA  = 0wx6
533    |   branchOpToWord JA   = 0wx7
534    |   branchOpToWord JP   = 0wxa
535    |   branchOpToWord JNP  = 0wxb
536    |   branchOpToWord JL   = 0wxc
537    |   branchOpToWord JGE  = 0wxd
538    |   branchOpToWord JLE  = 0wxe
539    |   branchOpToWord JG   = 0wxf
540 
541    fun branchOpRepr JO = "Overflow"
542    |   branchOpRepr JNO = "NotOverflow"
543    |   branchOpRepr JE = "Equal"
544    |   branchOpRepr JNE = "NotEqual"
545    |   branchOpRepr JL = "Less"
546    |   branchOpRepr JGE = "GreaterOrEqual"
547    |   branchOpRepr JLE = "LessOrEqual"
548    |   branchOpRepr JG = "Greater"
549    |   branchOpRepr JB = "Before"
550    |   branchOpRepr JNB= "NotBefore"
551    |   branchOpRepr JNA = "NotAfter"
552    |   branchOpRepr JA = "After"
553    |   branchOpRepr JP = "Parity"
554    |   branchOpRepr JNP = "NoParity"
555    
556    (* Invert a test.  This is used if we want to change the
557       sense of a test from jumping if the condition is true to
558       jumping if it is false. *)
559    fun invertTest JE  = JNE
560    |   invertTest JNE = JE
561    |   invertTest JA  = JNA
562    |   invertTest JB  = JNB
563    |   invertTest JNA = JA
564    |   invertTest JNB = JB
565    |   invertTest JL  = JGE
566    |   invertTest JG  = JLE
567    |   invertTest JLE = JG
568    |   invertTest JGE = JL
569    |   invertTest JO  = JNO
570    |   invertTest JNO = JO
571    |   invertTest JP  = JNP
572    |   invertTest JNP = JP
573
574    datatype sse2Operations =
575        SSE2MoveDouble | SSE2MoveFloat | SSE2CompDouble | SSE2AddDouble |
576        SSE2SubDouble | SSE2MulDouble | SSE2DivDouble |
577        SSE2Xor | SSE2And | SSE2FloatToDouble | SSE2DoubleToFloat |
578        SSE2CompSingle | SSE2AddSingle | SSE2SubSingle | SSE2MulSingle | SSE2DivSingle
579    
580    fun sse2OpRepr SSE2MoveDouble   = "SSE2MoveDouble"
581    |   sse2OpRepr SSE2MoveFloat    = "SSE2MoveFloat"
582    |   sse2OpRepr SSE2CompDouble   = "SSE2CompDouble"
583    |   sse2OpRepr SSE2AddDouble    = "SSE2AddDouble"
584    |   sse2OpRepr SSE2SubDouble    = "SSE2SubDouble"
585    |   sse2OpRepr SSE2MulDouble    = "SSE2MulDouble"
586    |   sse2OpRepr SSE2DivDouble    = "SSE2DivDouble"
587    |   sse2OpRepr SSE2Xor          = "SSE2Xor"
588    |   sse2OpRepr SSE2And          = "SSE2And"
589    |   sse2OpRepr SSE2CompSingle   = "SSE2CompSingle"
590    |   sse2OpRepr SSE2AddSingle    = "SSE2AddSingle"
591    |   sse2OpRepr SSE2SubSingle    = "SSE2SubSingle"
592    |   sse2OpRepr SSE2MulSingle    = "SSE2MulSingle"
593    |   sse2OpRepr SSE2DivSingle    = "SSE2DivSingle"
594    |   sse2OpRepr SSE2FloatToDouble = "SSE2FloatToDouble"
595    |   sse2OpRepr SSE2DoubleToFloat = "SSE2DoubleToFloat"
596
597    (* Primary opCodes.  N.B. only opCodes actually used are listed here.
598       If new instruction are added check they will be handled by the
599       run-time system in the event of trap. *)    
600    datatype opCode =
601        Group1_8_A32
602    |   Group1_8_A64
603    |   Group1_32_A32
604    |   Group1_32_A64
605    |   Group1_8_a
606    |   JMP_8
607    |   JMP_32
608    |   CALL_32
609    |   MOVL_A_R32
610    |   MOVL_A_R64
611    |   MOVL_R_A32
612    |   MOVL_R_A64
613    |   MOVL_R_A16
614    |   MOVB_R_A32
615    |   MOVB_R_A64 of {forceRex: bool}
616    |   PUSH_R of Word8.word
617    |   POP_R  of Word8.word
618    |   Group5
619    |   NOP
620    |   LEAL32
621    |   LEAL64
622    |   MOVL_32_R of Word8.word
623    |   MOVL_64_R of Word8.word
624    |   MOVL_32_A32
625    |   MOVL_32_A64
626    |   MOVB_8_A
627    |   POP_A
628    |   RET
629    |   RET_16
630    |   CondJump of branchOps
631    |   CondJump32 of branchOps
632    |   SetCC of branchOps
633    |   Arith32 of arithOp * Word8.word
634    |   Arith64 of arithOp * Word8.word
635    |   Group3_A32
636    |   Group3_A64
637    |   Group3_a
638    |   Group2_8_A32
639    |   Group2_8_A64
640    |   Group2_CL_A32
641    |   Group2_CL_A64
642    |   Group2_1_A32
643    |   Group2_1_A64
644    |   PUSH_8
645    |   PUSH_32
646    |   TEST_ACC8
647    |   LOCK_XADD32
648    |   LOCK_XADD64
649    |   FPESC of Word8.word
650    |   XCHNG32
651    |   XCHNG64
652    |   REP (* Rep prefix *)
653    |   MOVZB (* Needs escape code. *)
654    |   MOVZW (* Needs escape code. *)
655    |   MOVSXB32 (* Needs escape code. *)
656    |   MOVSXW32 (* Needs escape code. *)
657    |   MOVSXB64 (* Needs escape code. *)
658    |   MOVSXW64 (* Needs escape code. *)
659    |   IMUL32 (* Needs escape code. *)
660    |   IMUL64 (* Needs escape code. *)
661    |   SSE2StoreSingle (* movss with memory destination - needs escape sequence. *)
662    |   SSE2StoreDouble (* movsd with memory destination - needs escape sequence. *)
663    |   CQO_CDQ32 (* Sign extend before divide.. *)
664    |   CQO_CDQ64 (* Sign extend before divide.. *)
665    |   SSE2Ops of sse2Operations (* SSE2 instructions. *)
666    |   CVTSI2SD32
667    |   CVTSI2SD64
668    |   HLT     (* End of code marker. *)
669    |   IMUL_C8_32
670    |   IMUL_C8_64
671    |   IMUL_C32_32
672    |   IMUL_C32_64
673    |   MOVDFromXMM (* move 32 bit value from XMM to general reg. *)
674    |   MOVQToXMM   (* move 64 bit value from general reg.to XMM *)
675    |   PSRLDQ (* Shift XMM register *)
676    |   LDSTMXCSR
677    |   CVTSD2SI32 (* Double to 32-bit int *)
678    |   CVTSD2SI64 (* Double to 64-bit int *)
679    |   CVTSS2SI32 (* Single to 32-bit int *)
680    |   CVTSS2SI64 (* Single to 64-bit int *)
681    |   CVTTSD2SI32 (* Double to 32-bit int - truncate towards zero *)
682    |   CVTTSD2SI64 (* Double to 64-bit int - truncate towards zero *)
683    |   CVTTSS2SI32 (* Single to 32-bit int - truncate towards zero *)
684    |   CVTTSS2SI64 (* Single to 64-bit int - truncate towards zero *)
685    |   MOVSXD
686    |   CMOV32 of branchOps
687    |   CMOV64 of branchOps
688
689
690    fun opToInt Group1_8_A32    =  0wx83
691    |   opToInt Group1_8_A64    =  0wx83
692    |   opToInt Group1_32_A32   =  0wx81
693    |   opToInt Group1_32_A64   =  0wx81
694    |   opToInt Group1_8_a      =  0wx80
695    |   opToInt JMP_8           =  0wxeb
696    |   opToInt JMP_32          =  0wxe9
697    |   opToInt CALL_32         =  0wxe8
698    |   opToInt MOVL_A_R32      =  0wx8b
699    |   opToInt MOVL_A_R64      =  0wx8b
700    |   opToInt MOVL_R_A32      =  0wx89
701    |   opToInt MOVL_R_A64      =  0wx89
702    |   opToInt MOVL_R_A16      =  0wx89 (* Also has an OPSIZE prefix. *)
703    |   opToInt MOVB_R_A32      =  0wx88
704    |   opToInt (MOVB_R_A64 _)  =  0wx88
705    |   opToInt (PUSH_R reg)    =  0wx50 + reg
706    |   opToInt (POP_R  reg)    =  0wx58 + reg
707    |   opToInt Group5          =  0wxff
708    |   opToInt NOP             =  0wx90
709    |   opToInt LEAL32          =  0wx8d
710    |   opToInt LEAL64          =  0wx8d
711    |   opToInt (MOVL_32_R reg) =  0wxb8 + reg
712    |   opToInt (MOVL_64_R reg) =  0wxb8 + reg
713    |   opToInt MOVL_32_A32     =  0wxc7
714    |   opToInt MOVL_32_A64     =  0wxc7
715    |   opToInt MOVB_8_A        =  0wxc6
716    |   opToInt POP_A           =  0wx8f
717    |   opToInt RET             = 0wxc3
718    |   opToInt RET_16          = 0wxc2
719    |   opToInt (CondJump opc)  = 0wx70 + branchOpToWord opc
720    |   opToInt (CondJump32 opc) = 0wx80 + branchOpToWord opc (* Needs 0F prefix *)
721    |   opToInt (SetCC opc)     = 0wx90 + branchOpToWord opc (* Needs 0F prefix *)
722    |   opToInt (Arith32 (ao,dw)) = arithOpToWord ao * 0w8 + dw
723    |   opToInt (Arith64 (ao,dw)) = arithOpToWord ao * 0w8 + dw
724    |   opToInt Group3_A32      = 0wxf7
725    |   opToInt Group3_A64      = 0wxf7
726    |   opToInt Group3_a        = 0wxf6
727    |   opToInt Group2_8_A32    = 0wxc1
728    |   opToInt Group2_8_A64    = 0wxc1
729    |   opToInt Group2_1_A32    = 0wxd1
730    |   opToInt Group2_1_A64    = 0wxd1
731    |   opToInt Group2_CL_A32   = 0wxd3
732    |   opToInt Group2_CL_A64   = 0wxd3
733    |   opToInt PUSH_8          = 0wx6a
734    |   opToInt PUSH_32         = 0wx68
735    |   opToInt TEST_ACC8       = 0wxa8
736    |   opToInt LOCK_XADD32     = 0wxC1 (* Needs lock and escape prefixes. *)
737    |   opToInt LOCK_XADD64     = 0wxC1 (* Needs lock and escape prefixes. *)
738    |   opToInt (FPESC n)       = 0wxD8 orb8 n
739    |   opToInt XCHNG32         = 0wx87
740    |   opToInt XCHNG64         = 0wx87
741    |   opToInt REP             = 0wxf3
742    |   opToInt MOVZB           = 0wxb6 (* Needs escape code. *)
743    |   opToInt MOVZW           = 0wxb7 (* Needs escape code. *)
744    |   opToInt MOVSXB32        = 0wxbe (* Needs escape code. *)
745    |   opToInt MOVSXW32        = 0wxbf (* Needs escape code. *)
746    |   opToInt MOVSXB64        = 0wxbe (* Needs escape code. *)
747    |   opToInt MOVSXW64        = 0wxbf (* Needs escape code. *)
748    |   opToInt IMUL32          = 0wxaf (* Needs escape code. *)
749    |   opToInt IMUL64          = 0wxaf (* Needs escape code. *)
750    |   opToInt SSE2StoreSingle = 0wx11 (* Needs F3 0F escape. *)
751    |   opToInt SSE2StoreDouble = 0wx11 (* Needs F2 0F escape. *)
752    |   opToInt CQO_CDQ32       = 0wx99
753    |   opToInt CQO_CDQ64       = 0wx99
754    |   opToInt (SSE2Ops SSE2MoveDouble) = 0wx10 (* Needs F2 0F escape. *)
755    |   opToInt (SSE2Ops SSE2MoveFloat) = 0wx10 (* Needs F3 0F escape. *)
756    |   opToInt (SSE2Ops SSE2CompDouble) = 0wx2E (* Needs 66 0F escape. *)
757    |   opToInt (SSE2Ops SSE2AddDouble)  = 0wx58 (* Needs F2 0F escape. *)
758    |   opToInt (SSE2Ops SSE2SubDouble)  = 0wx5c (* Needs F2 0F escape. *)
759    |   opToInt (SSE2Ops SSE2MulDouble)  = 0wx59 (* Needs F2 0F escape. *)
760    |   opToInt (SSE2Ops SSE2DivDouble)  = 0wx5e (* Needs F2 0F escape. *)
761    |   opToInt (SSE2Ops SSE2CompSingle) = 0wx2E (* Needs 0F escape. *)
762    |   opToInt (SSE2Ops SSE2AddSingle)  = 0wx58 (* Needs F3 0F escape. *)
763    |   opToInt (SSE2Ops SSE2SubSingle)  = 0wx5c (* Needs F3 0F escape. *)
764    |   opToInt (SSE2Ops SSE2MulSingle)  = 0wx59 (* Needs F3 0F escape. *)
765    |   opToInt (SSE2Ops SSE2DivSingle)  = 0wx5e (* Needs F3 0F escape. *)
766    |   opToInt (SSE2Ops SSE2And)  = 0wx54 (* Needs 66 0F escape. *)
767    |   opToInt (SSE2Ops SSE2Xor)  = 0wx57 (* Needs 66 0F escape. *)
768    |   opToInt (SSE2Ops SSE2FloatToDouble)  = 0wx5A (* Needs F3 0F escape. *)
769    |   opToInt (SSE2Ops SSE2DoubleToFloat)  = 0wx5A (* Needs F2 0F escape. *)
770    |   opToInt CVTSI2SD32      = 0wx2a (* Needs F2 0F escape. *)
771    |   opToInt CVTSI2SD64      = 0wx2a (* Needs F2 0F escape. *)
772    |   opToInt HLT             = 0wxf4
773    |   opToInt IMUL_C8_32      = 0wx6b
774    |   opToInt IMUL_C8_64      = 0wx6b
775    |   opToInt IMUL_C32_32     = 0wx69
776    |   opToInt IMUL_C32_64     = 0wx69
777    |   opToInt MOVDFromXMM     = 0wx7e  (* Needs 66 0F escape. *)
778    |   opToInt MOVQToXMM       = 0wx6e  (* Needs 66 0F escape. *)
779    |   opToInt PSRLDQ          = 0wx73  (* Needs 66 0F escape. *)
780    |   opToInt LDSTMXCSR       = 0wxae  (* Needs 0F prefix. *)
781    |   opToInt CVTSD2SI32      = 0wx2d  (* Needs F2 0F prefix. *)
782    |   opToInt CVTSD2SI64      = 0wx2d  (* Needs F2 0F prefix and rex.w. *)
783    |   opToInt CVTSS2SI32      = 0wx2d  (* Needs F3 0F prefix. *)
784    |   opToInt CVTSS2SI64      = 0wx2d  (* Needs F3 0F prefix and rex.w. *)
785    |   opToInt CVTTSD2SI32     = 0wx2c  (* Needs F2 0F prefix. *)
786    |   opToInt CVTTSD2SI64     = 0wx2c  (* Needs F2 0F prefix. *)
787    |   opToInt CVTTSS2SI32     = 0wx2c  (* Needs F3 0F prefix. *)
788    |   opToInt CVTTSS2SI64     = 0wx2c  (* Needs F3 0F prefix and rex.w. *)
789    |   opToInt MOVSXD          = 0wx63
790    |   opToInt (CMOV32 opc)    = 0wx40 + branchOpToWord opc (* Needs 0F prefix *)
791    |   opToInt (CMOV64 opc)    = 0wx40 + branchOpToWord opc (* Needs 0F prefix and rex.w *)
792
793    datatype mode =
794        Based0   (* mod = 0 *)
795    |   Based8   (* mod = 1 *)
796    |   Based32  (* mod = 2 *)
797    |   Register (* mod = 3 *) ;
798
799    (* Put together the three fields which make up the mod r/m byte. *)
800    fun modrm (md : mode, rg: Word8.word, rm : Word8.word) : Word8.word =
801    let
802        val _ = if rg > 0w7 then raise InternalError "modrm: bad rg" else ()
803        val _ = if rm > 0w7 then raise InternalError "modrm: bad rm" else ()
804        val modField: Word8.word = 
805            case md of 
806                Based0   => 0w0
807            |   Based8   => 0w1
808            |   Based32  => 0w2
809            |   Register => 0w3
810    in
811        (modField <<- 0w6) orb8 (rg <<- 0w3) orb8 rm
812    end
813
814    (* REX prefix *)
815    fun rex {w,r,x,b} =
816        0wx40 orb8 (if w then 0w8 else 0w0) orb8 (if r then 0w4 else 0w0) orb8
817            (if x then 0w2 else 0w0) orb8 (if b then 0w1 else 0w0)
818
819    (* The X86 has the option to include an index register and to scale it. *)
820    datatype indexType =
821        NoIndex | Index1 of genReg | Index2 of genReg | Index4 of genReg | Index8 of genReg
822
823    (* Lock, Opsize and REPNE prefixes come before the REX. *)
824    fun opcodePrefix LOCK_XADD32                = [0wxF0] (* Requires LOCK prefix. *)
825    |   opcodePrefix LOCK_XADD64                = [0wxF0] (* Requires LOCK prefix. *)
826    |   opcodePrefix MOVL_R_A16                 = [0wx66] (* Requires OPSIZE prefix. *)
827    |   opcodePrefix SSE2StoreSingle            = [0wxf3]
828    |   opcodePrefix SSE2StoreDouble            = [0wxf2]
829    |   opcodePrefix(SSE2Ops SSE2CompDouble)    = [0wx66]
830    |   opcodePrefix(SSE2Ops SSE2And)           = [0wx66]
831    |   opcodePrefix(SSE2Ops SSE2Xor)           = [0wx66]
832    |   opcodePrefix(SSE2Ops SSE2CompSingle)    = [] (* No prefix *)
833    |   opcodePrefix(SSE2Ops SSE2MoveDouble)    = [0wxf2]
834    |   opcodePrefix(SSE2Ops SSE2AddDouble)     = [0wxf2]
835    |   opcodePrefix(SSE2Ops SSE2SubDouble)     = [0wxf2]
836    |   opcodePrefix(SSE2Ops SSE2MulDouble)     = [0wxf2]
837    |   opcodePrefix(SSE2Ops SSE2DivDouble)     = [0wxf2]
838    |   opcodePrefix(SSE2Ops SSE2DoubleToFloat) = [0wxf2]
839    |   opcodePrefix(SSE2Ops SSE2MoveFloat)     = [0wxf3]
840    |   opcodePrefix(SSE2Ops SSE2AddSingle)     = [0wxf3]
841    |   opcodePrefix(SSE2Ops SSE2SubSingle)     = [0wxf3]
842    |   opcodePrefix(SSE2Ops SSE2MulSingle)     = [0wxf3]
843    |   opcodePrefix(SSE2Ops SSE2DivSingle)     = [0wxf3]
844    |   opcodePrefix(SSE2Ops SSE2FloatToDouble) = [0wxf3]
845    |   opcodePrefix CVTSI2SD32                 = [0wxf2]
846    |   opcodePrefix CVTSI2SD64                 = [0wxf2]
847    |   opcodePrefix MOVDFromXMM                = [0wx66]
848    |   opcodePrefix MOVQToXMM                  = [0wx66]
849    |   opcodePrefix PSRLDQ                     = [0wx66]
850    |   opcodePrefix CVTSD2SI32                 = [0wxf2]
851    |   opcodePrefix CVTSD2SI64                 = [0wxf2]
852    |   opcodePrefix CVTSS2SI32                 = [0wxf3]
853    |   opcodePrefix CVTSS2SI64                 = [0wxf3]
854    |   opcodePrefix CVTTSD2SI32                = [0wxf2]
855    |   opcodePrefix CVTTSD2SI64                = [0wxf2]
856    |   opcodePrefix CVTTSS2SI32                = [0wxf3]
857    |   opcodePrefix CVTTSS2SI64                = [0wxf3]
858    |   opcodePrefix _                          = []
859
860    (* A few instructions require an escape.  Escapes come after the REX. *)
861    fun escapePrefix MOVZB                      = [0wx0f]
862    |   escapePrefix MOVZW                      = [0wx0f]
863    |   escapePrefix MOVSXB32                   = [0wx0f]
864    |   escapePrefix MOVSXW32                   = [0wx0f]
865    |   escapePrefix MOVSXB64                   = [0wx0f]
866    |   escapePrefix MOVSXW64                   = [0wx0f]
867    |   escapePrefix LOCK_XADD32                = [0wx0f]
868    |   escapePrefix LOCK_XADD64                = [0wx0f]
869    |   escapePrefix IMUL32                     = [0wx0f]
870    |   escapePrefix IMUL64                     = [0wx0f]
871    |   escapePrefix(CondJump32 _)              = [0wx0f]
872    |   escapePrefix(SetCC _)                   = [0wx0f]
873    |   escapePrefix SSE2StoreSingle            = [0wx0f]
874    |   escapePrefix SSE2StoreDouble            = [0wx0f]
875    |   escapePrefix(SSE2Ops _)                 = [0wx0f]
876    |   escapePrefix CVTSI2SD32                 = [0wx0f]
877    |   escapePrefix CVTSI2SD64                 = [0wx0f]
878    |   escapePrefix MOVDFromXMM                = [0wx0f]
879    |   escapePrefix MOVQToXMM                  = [0wx0f]
880    |   escapePrefix PSRLDQ                     = [0wx0f]
881    |   escapePrefix LDSTMXCSR                  = [0wx0f]
882    |   escapePrefix CVTSD2SI32                 = [0wx0f]
883    |   escapePrefix CVTSD2SI64                 = [0wx0f]
884    |   escapePrefix CVTSS2SI32                 = [0wx0f]
885    |   escapePrefix CVTSS2SI64                 = [0wx0f]
886    |   escapePrefix CVTTSD2SI32                = [0wx0f]
887    |   escapePrefix CVTTSD2SI64                = [0wx0f]
888    |   escapePrefix CVTTSS2SI32                = [0wx0f]
889    |   escapePrefix CVTTSS2SI64                = [0wx0f]
890    |   escapePrefix(CMOV32 _)                  = [0wx0f]
891    |   escapePrefix(CMOV64 _)                  = [0wx0f]
892    |   escapePrefix _                          = []
893
894    (* Generate an opCode byte after doing any pending operations. *)
895    fun opCodeBytes(opb:opCode, rx) =
896    let
897        val rexByte = 
898            case rx of
899                NONE => []
900            |   SOME rxx =>
901                if hostIsX64 then [rex rxx]
902                else raise InternalError "opCodeBytes: rex prefix in 32 bit mode";
903    in
904        opcodePrefix opb @ rexByte @ escapePrefix opb @ [opToInt opb]
905    end
906
907    fun rexByte(opb, rrX, rbX, riX) =
908    let
909        (* We need a rex prefix if we need to set the length to 64-bit. *)
910        val need64bit =
911            case opb of
912                Group1_8_A64 => true (* Arithmetic operations - must be 64-bit *)
913            |   Group1_32_A64 => true (* Arithmetic operations - must be 64-bit *)
914            |   Group2_1_A64 => true (* 1-bit shifts - must be 64-bit *)
915            |   Group2_8_A64 => true (* n-bit shifts - must be 64-bit *)
916            |   Group2_CL_A64 => true (* Shifts by value in CL *)
917            |   Group3_A64 => true (* Test, Not, Mul etc. *)
918            |   Arith64 (_, _) => true
919            |   MOVL_A_R64 => true (* Needed *)
920            |   MOVL_R_A64 => true (* Needed *)
921            |   XCHNG64 => true
922            |   LEAL64 => true (* Needed to ensure the result is 64-bits *)
923            |   MOVL_64_R _ => true (* Needed *)
924            |   MOVL_32_A64 => true (* Needed *)
925            |   IMUL64 => true (* Needed to ensure the result is 64-bits *)
926            |   LOCK_XADD64 => true (* Needed to ensure the result is 64-bits *)
927            |   CQO_CDQ64 => true (* It's only CQO if there's a Rex prefix. *)
928            |   CVTSI2SD64 => true (* This affects the size of the integer source. *)
929            |   IMUL_C8_64 => true
930            |   IMUL_C32_64 => true
931            |   MOVQToXMM => true
932            |   CVTSD2SI64 => true (* This affects the size of the integer source. *)
933            |   CVTSS2SI64 => true 
934            |   CVTTSD2SI64 => true
935            |   CVTTSS2SI64 => true
936            |   MOVSXD      => true
937            |   CMOV64 _    => true
938            |   MOVSXB64   => true
939            |   MOVSXW64   => true
940            (* Group5 - We only use 2/4/6 and they don't need prefix *)
941            |   _ => false
942        (* If we are using MOVB_R_A with SIL or DIL we need to force a REX prefix.
943           That's only possible in 64-bit mode.  This also applies with Test and SetCC
944           but they are dealt with elsewhere. *)
945        val forceRex =
946            case opb of
947                MOVB_R_A64 {forceRex=true} => true (* This is allowed in X86/64 but not in X86/32. *)
948            |   _ => false
949    in
950        if need64bit orelse rrX orelse rbX orelse riX orelse forceRex
951        then [rex{w=need64bit, r=rrX, b=rbX, x = riX}]
952        else []
953    end
954
955    (* Register/register operation. *)
956    fun opReg(opb:opCode, (*dest*)GeneralReg(rrC, rrX), (*source*)GeneralReg(rbC, rbX)) =
957    let
958        val pref = opcodePrefix opb (* Any opsize or lock prefix. *)
959        val rex = rexByte(opb, rrX, rbX, false)
960        val esc = escapePrefix opb (* Generate the ESCAPE code if needed. *)
961        val opc = opToInt opb
962        val mdrm = modrm(Register, rrC, rbC)
963    in
964        pref @ rex @ esc @ [opc, mdrm]
965    end
966
967    (* Operations on a register where the second "register" is actually an operation code. *)
968    fun opRegPlus2(opb:opCode, rd: genReg, op2: Word8.word) =
969    let
970        val (rrC, rrX) = getReg rd
971        val pref = opcodePrefix opb (* Any opsize or lock prefix. *)
972        val rex = rexByte(opb, false, rrX, false)
973        val opc = opToInt opb
974        val mdrm = modrm(Register, op2, rrC)
975    in
976        pref @ rex @ [opc, mdrm]
977    end
978
979    local
980        (* General instruction form with modrm and optional sib bytes.  rb is an option since the
981           base register may be omitted.  This is used with LEA to tag integers. *)
982        fun opIndexedGen (opb:opCode, offset: LargeInt.int, rb: genReg option, ri: indexType, (rrC, rrX)) =
983        let
984            (* Base encoding.  (Based0, 0w5) means "no base" so if we need ebp as the
985               base we have to use Based8 at least. *)
986            val (offsetCode, rbC, rbX) =
987                case rb of
988                    NONE => (Based0, 0w5 (* no base register *), false)
989                |   SOME rb =>
990                    let
991                        val (rbC, rbX) = getReg rb
992                        val base =
993                            if offset = 0 andalso rbC <> 0wx5 (* Can't use ebp with Based0 *)
994                            then Based0    (* no disp field *)
995                            else if is8BitL offset
996                            then Based8   (* use 8-bit disp field *)
997                            else Based32 (* use 32-bit disp field *)
998                    in
999                        (base, rbC, rbX)
1000                    end
1001
1002            (* Index coding.  esp can't be used as an index so (0w4, false) means "no index".
1003               But r12 (0w4, true) CAN be. *)
1004            val ((riC, riX), scaleFactor) =
1005                case ri of
1006                    NoIndex  => ((0w4, false), 0w0)
1007                |   Index1 i => (getReg i, 0w0)
1008                |   Index2 i => (getReg i, 0w1)
1009                |   Index4 i => (getReg i, 0w2)
1010                |   Index8 i => (getReg i, 0w3)
1011            
1012            (* If the base register is esp or r12 we have to use a sib byte even if
1013               there's no index.  That's because 0w4 as a base register means "there's
1014               a SIB byte". *)
1015            val modRmAndOptionalSib =
1016                if rbC = 0w4  (* Code for esp and r12 *) orelse riC <> 0w4 orelse riX
1017                then
1018                let
1019                    val mdrm = modrm(offsetCode, rrC, 0w4 (* s-i-b *))
1020                    val sibByte = (scaleFactor <<- 0w6) orb8 (riC <<- 0w3) orb8 rbC
1021                in
1022                    [mdrm, sibByte]
1023                end
1024                else [modrm(offsetCode, rrC, rbC)]
1025    
1026            (* Generate the disp field (if any) *)
1027            val dispField =
1028                case (offsetCode, rb) of
1029                    (Based8, _)  => [Word8.fromLargeInt offset]
1030                |   (Based32, _) => int32Signed offset
1031                |   (_, NONE)    => (* 32 bit absolute used as base *) int32Signed offset
1032                |    _ => []
1033        in
1034            opcodePrefix opb @ rexByte(opb, rrX, rbX, riX) @ escapePrefix opb @
1035                opToInt opb :: modRmAndOptionalSib @ dispField
1036        end
1037    in
1038        fun opEA(opb, offset, rb, r) = opIndexedGen(opb, offset, SOME rb, NoIndex, getReg r)
1039        (* Generate a opcode plus a second modrm byte but where the "register" field in
1040           the modrm byte is actually a code.  *)
1041        and opPlus2(opb, offset, rb, op2) = opIndexedGen(opb, offset, SOME rb, NoIndex, (op2, false))
1042        
1043        and opIndexedPlus2(opb, offset, rb, ri, op2) = opIndexedGen(opb, offset, SOME rb, ri, (op2, false))
1044        
1045        fun opIndexed (opb, offset, rb, ri, rd) =
1046            opIndexedGen(opb, offset, rb, ri, getReg rd) 
1047        
1048        fun opAddress(opb, offset, rb, ri, rd) = opIndexedGen (opb, offset, SOME rb, ri, getReg rd)
1049        and mMXAddress(opb, offset, rb, ri, SSE2Reg rrC) = opIndexedGen(opb, offset, SOME rb, ri, (rrC, false))
1050        and opAddressPlus2(opb, offset, rb, ri, op2) =
1051            opIndexedGen(opb, offset, SOME rb, ri, (op2, false))
1052    end
1053
1054    (* An operation with an operand that needs to go in the constant area, or in the case of
1055       native 32-bit, where the constant is stored in an object and the address of the
1056       object is inline.  This just puts in the instruction and the address.  The details
1057       of the constant are dealt with in putConst. *)
1058    fun opConstantOperand(opb, (*dest*)GeneralReg(rrC, rrX)) =
1059    let
1060        val pref = opcodePrefix opb (* Any opsize or lock prefix. *)
1061        val rex = rexByte(opb, rrX, false, false)
1062        val esc = escapePrefix opb (* Generate the ESCAPE code if needed. *)
1063        val opc = opToInt opb
1064        val mdrm = modrm(Based0, rrC, 0w5 (* PC-relative or absolute *))
1065    in
1066        pref @ rex @ esc @ [opc, mdrm] @ int32Signed(tag 0)
1067    end
1068
1069    fun immediateOperand (opn: arithOp, rd: genReg, imm: LargeInt.int, opSize) =
1070    if is8BitL imm
1071    then (* Can use one byte immediate *)
1072        opRegPlus2(case opSize of OpSize64 => Group1_8_A64 | OpSize32 => Group1_8_A32, rd, arithOpToWord opn) @ [Word8.fromLargeInt imm]
1073    else if is32bit imm
1074    then (* Need 32 bit immediate. *)
1075        opRegPlus2(case opSize of OpSize64 => Group1_32_A64 | OpSize32 => Group1_32_A32, rd, arithOpToWord opn) @ int32Signed imm
1076    else (* It won't fit in the immediate; put it in the non-address area. *)
1077    let
1078        val opc = case opSize of OpSize64 => Arith64 | OpSize32 => Arith32
1079    in
1080        opConstantOperand(opc(opn, 0w3 (* r/m to reg *)), rd)
1081    end
1082    
1083    fun arithOpReg(opn: arithOp, rd: genReg, rs: genReg, opIs64) =
1084        opReg ((if opIs64 then Arith64 else Arith32) (opn, 0w3 (* r/m to reg *)), rd, rs)
1085
1086    type handlerLab = addrs ref
1087
1088    fun floatingPtOp{escape, md, nnn, rm} =
1089        opCodeBytes(FPESC escape, NONE) @ [(md <<- 0w6) orb8 (nnn <<- 0w3) orb8 rm]
1090
1091    datatype trapEntries =
1092        StackOverflowCall
1093    |   StackOverflowCallEx
1094    |   HeapOverflowCall
1095
1096    (* RTS call.  We need to save any registers that may contain addresses to the stack.
1097       All the registers are preserved but not seen by the GC. *)
1098    fun rtsCall(rtsEntry, regSet) =
1099        let
1100            val entry =
1101                case rtsEntry of
1102                    StackOverflowCall   => memRegStackOverflowCall
1103                |   StackOverflowCallEx => memRegStackOverflowCallEx
1104                |   HeapOverflowCall    => memRegHeapOverflowCall
1105            val regSet = List.foldl(fn (r, a) => (0w1 << Word.fromInt(nReg(GenReg r))) orb a) 0w0 regSet
1106            val callInstr =
1107                opPlus2(Group5, LargeInt.fromInt entry, ebp, 0w2 (* call *))
1108            val regSetInstr =
1109                if regSet >= 0w256
1110                then [0wxca, (* This is actually a FAR RETURN *)
1111                        wordToWord8 regSet, (* Low byte*) wordToWord8 (regSet >> 0w8) (* High byte*)]
1112                else if regSet <> 0w0
1113                then [0wxcd, (* This is actually INT n *) wordToWord8 regSet]
1114                else []
1115        in
1116            callInstr @ regSetInstr
1117        end
1118
1119    (* Operations. *)
1120    type cases = word * label
1121
1122    type memoryAddress = { base: genReg, offset: int, index: indexType }
1123
1124    datatype 'reg regOrMemoryArg =
1125        RegisterArg of 'reg
1126    |   MemoryArg of memoryAddress
1127    |   NonAddressConstArg of LargeInt.int
1128    |   AddressConstArg of machineWord
1129
1130    datatype moveSize =
1131        Move64 | Move32 | Move8 | Move16 | Move32X64 | Move8X32 | Move8X64 | Move16X32 | Move16X64
1132    and fpSize = SinglePrecision | DoublePrecision
1133
1134    datatype operation =
1135        Move of { source: genReg regOrMemoryArg, destination: genReg regOrMemoryArg, moveSize: moveSize }
1136    |   PushToStack of genReg regOrMemoryArg
1137    |   PopR of genReg
1138    |   ArithToGenReg of { opc: arithOp, output: genReg, source: genReg regOrMemoryArg, opSize: opSize }
1139    |   ArithMemConst of { opc: arithOp, address: memoryAddress, source: LargeInt.int, opSize: opSize }
1140    |   ArithMemLongConst of { opc: arithOp, address: memoryAddress, source: machineWord }
1141    |   ArithByteMemConst of { opc: arithOp, address: memoryAddress, source: Word8.word }
1142    |   ShiftConstant of { shiftType: shiftType, output: genReg, shift: Word8.word, opSize: opSize }
1143    |   ShiftVariable of { shiftType: shiftType, output: genReg, opSize: opSize } (* Shift amount is in ecx *)
1144    |   ConditionalBranch of { test: branchOps, label: label }
1145    |   SetCondition of { output: genReg, test: branchOps }
1146    |   LoadAddress of { output: genReg, offset: int, base: genReg option, index: indexType, opSize: opSize }
1147    |   TestByteBits of { arg: genReg regOrMemoryArg, bits: Word8.word }
1148    |   CallRTS of {rtsEntry: trapEntries, saveRegs: genReg list }
1149    |   AllocStore of { size: int, output: genReg, saveRegs: genReg list }
1150    |   AllocStoreVariable of { size: genReg, output: genReg, saveRegs: genReg list }
1151    |   StoreInitialised
1152    |   CallAddress of genReg regOrMemoryArg
1153    |   JumpAddress of genReg regOrMemoryArg
1154    |   ReturnFromFunction of int
1155    |   RaiseException of { workReg: genReg }
1156    |   UncondBranch of label
1157    |   ResetStack of { numWords: int, preserveCC: bool }
1158    |   JumpLabel of label
1159    |   LoadLabelAddress of { label: label, output: genReg }
1160    |   RepeatOperation of repOps
1161    |   DivideAccR of {arg: genReg, isSigned: bool, opSize: opSize }
1162    |   DivideAccM of {base: genReg, offset: int, isSigned: bool, opSize: opSize }
1163    |   AtomicXAdd of {address: memoryAddress, output: genReg, opSize: opSize }
1164    |   FPLoadFromMemory of { address: memoryAddress, precision: fpSize }
1165    |   FPLoadFromFPReg of { source: fpReg, lastRef: bool }
1166    |   FPLoadFromConst of { constant: machineWord, precision: fpSize }
1167    |   FPStoreToFPReg of { output: fpReg, andPop: bool }
1168    |   FPStoreToMemory of { address: memoryAddress, precision: fpSize, andPop: bool }
1169    |   FPArithR of { opc: fpOps, source: fpReg }
1170    |   FPArithConst of { opc: fpOps, source: machineWord, precision: fpSize }
1171    |   FPArithMemory of { opc: fpOps, base: genReg, offset: int, precision: fpSize }
1172    |   FPUnary of fpUnaryOps
1173    |   FPStatusToEAX
1174    |   FPLoadInt of { base: genReg, offset: int, opSize: opSize }
1175    |   FPFree of fpReg
1176    |   MultiplyR of { source: genReg regOrMemoryArg, output: genReg, opSize: opSize }
1177    |   XMMArith of { opc: sse2Operations, source: xmmReg regOrMemoryArg, output: xmmReg }
1178    |   XMMStoreToMemory of { toStore: xmmReg, address: memoryAddress, precision: fpSize }
1179    |   XMMConvertFromInt of { source: genReg, output: xmmReg, opSize: opSize }
1180    |   SignExtendForDivide of opSize
1181    |   XChng of { reg: genReg, arg: genReg regOrMemoryArg, opSize: opSize }
1182    |   Negative of { output: genReg, opSize: opSize }
1183    |   JumpTable of { cases: label list, jumpSize: jumpSize ref }
1184    |   IndexedJumpCalc of { addrReg: genReg, indexReg: genReg, jumpSize: jumpSize ref }
1185    |   MoveXMMRegToGenReg of { source: xmmReg, output: genReg }
1186    |   MoveGenRegToXMMReg of { source: genReg, output: xmmReg }
1187    |   XMMShiftRight of { output: xmmReg, shift: Word8.word }
1188    |   FPLoadCtrlWord of memoryAddress (* Load FP control word. *)
1189    |   FPStoreCtrlWord of memoryAddress (* Store FP control word. *)
1190    |   XMMLoadCSR of memoryAddress (* Load combined control/status word. *)
1191    |   XMMStoreCSR of memoryAddress (* Store combined control/status word. *)
1192    |   FPStoreInt of memoryAddress
1193    |   XMMStoreInt of { source: xmmReg regOrMemoryArg, output: genReg, precision: fpSize, isTruncate: bool }
1194    |   CondMove of { test: branchOps, output: genReg, source: genReg regOrMemoryArg, opSize: opSize }
1195    |   LoadAbsolute of { destination: genReg, value: machineWord }
1196
1197    and jumpSize = JumpSize2 | JumpSize8
1198
1199    type operations = operation list
1200
1201    fun printOperation(operation, stream) =
1202    let
1203        fun printGReg r = stream(genRegRepr(r, sz32_64))
1204        val printFPReg = stream o fpRegRepr
1205        and printXMMReg = stream o xmmRegRepr
1206        fun printBaseOffset(b, x, i) =
1207        (
1208            stream(Int.toString i); stream "("; printGReg b; stream ")";
1209            case x of
1210                NoIndex => ()
1211            |   Index1 x => (stream "["; printGReg x; stream "]")
1212            |   Index2 x => (stream "["; printGReg x; stream "*2]")
1213            |   Index4 x => (stream "["; printGReg x; stream "*4]")
1214            |   Index8 x => (stream "["; printGReg x; stream "*8]")
1215        )
1216        fun printMemAddress({ base, offset, index }) = printBaseOffset(base, index, offset)
1217        
1218        fun printRegOrMemoryArg printReg (RegisterArg r) = printReg r
1219        |   printRegOrMemoryArg _ (MemoryArg{ base, offset, index }) = printBaseOffset(base, index, offset)
1220        |   printRegOrMemoryArg _ (NonAddressConstArg c) = stream(LargeInt.toString c)
1221        |   printRegOrMemoryArg _ (AddressConstArg c) = stream(Address.stringOfWord c)
1222        
1223        fun printOpSize OpSize32 = "32"
1224        |   printOpSize OpSize64 = "64"
1225     in
1226        case operation of
1227            Move { source, destination, moveSize } =>
1228            (
1229                case moveSize of
1230                    Move64  => stream "Move64 "
1231                |   Move32  => stream "Move32 "
1232                |   Move8   => stream "Move8 "
1233                |   Move16  => stream "Move16 "
1234                |   Move32X64 => stream "Move32X64 "
1235                |   Move8X32  => stream "Move8X32 "
1236                |   Move8X64  => stream "Move8X64 "
1237                |   Move16X32 => stream "Move16X32 "
1238                |   Move16X64 => stream "Move16X64 ";
1239                printRegOrMemoryArg printGReg destination; stream " <= "; printRegOrMemoryArg printGReg source
1240            )
1241
1242        |   ArithToGenReg { opc, output, source, opSize } =>
1243                (stream (arithOpRepr opc); stream "RR"; stream(printOpSize opSize); stream " "; printGReg output; stream " <= "; printRegOrMemoryArg printGReg source )
1244
1245        |   ArithMemConst { opc, address, source, opSize } =>
1246            (
1247                stream (arithOpRepr opc); stream "MC"; stream(printOpSize opSize); stream " ";
1248                printMemAddress address;
1249                stream " "; stream(LargeInt.toString source)
1250            )
1251
1252        |   ArithMemLongConst { opc, address, source } =>
1253            (
1254                stream (arithOpRepr opc ^ "MC "); printMemAddress address;
1255                stream " <= "; stream(Address.stringOfWord source)
1256            )
1257
1258        |   ArithByteMemConst { opc, address, source } =>
1259            (
1260                stream (arithOpRepr opc); stream "MC8"; stream " ";
1261                printMemAddress address; stream " "; stream(Word8.toString source)
1262            )
1263
1264        |   ShiftConstant { shiftType, output, shift, opSize } =>
1265            (
1266                stream(shiftTypeRepr shiftType); stream(printOpSize opSize); stream " "; printGReg output;
1267                stream " by "; stream(Word8.toString shift)
1268            )
1269
1270        |   ShiftVariable { shiftType, output, opSize } => (* Shift amount is in ecx *)
1271            (
1272                stream(shiftTypeRepr shiftType); stream(printOpSize opSize); stream " "; printGReg output; stream " by ECX"
1273            )
1274
1275        |   ConditionalBranch { test, label=Label{labelNo, ...} } =>
1276            (
1277                stream "Jump"; stream(branchOpRepr test); stream " L"; stream(Int.toString labelNo)
1278            )
1279
1280        |   SetCondition { output, test } =>
1281            (
1282                stream "SetCC"; stream(branchOpRepr test); stream " => "; printGReg output
1283            )
1284
1285        |   PushToStack source => (stream "Push "; printRegOrMemoryArg printGReg source)
1286
1287        |   PopR dest => (stream "PopR "; printGReg dest)
1288
1289        |   LoadAddress{ output, offset, base, index, opSize } =>
1290            (
1291                stream "LoadAddress"; stream(printOpSize opSize); stream " "; 
1292                case base of NONE => () | SOME r => (printGReg r; stream " + ");
1293                stream(Int.toString offset);
1294                case index of
1295                    NoIndex => ()
1296                |   Index1 x => (stream " + "; printGReg x)
1297                |   Index2 x => (stream " + "; printGReg x; stream "*2 ")
1298                |   Index4 x => (stream " + "; printGReg x; stream "*4 ")
1299                |   Index8 x => (stream " + "; printGReg x; stream "*8 ");
1300                stream " => "; printGReg output
1301            )
1302
1303        |   TestByteBits { arg, bits } =>
1304                ( stream "TestByteBits "; printRegOrMemoryArg printGReg arg; stream " 0x"; stream(Word8.toString bits) )
1305
1306        |   CallRTS {rtsEntry, ...} =>
1307            (
1308                stream "CallRTS ";
1309                case rtsEntry of
1310                    StackOverflowCall => stream "StackOverflowCall"
1311                |   HeapOverflowCall => stream "HeapOverflow"
1312                |   StackOverflowCallEx => stream "StackOverflowCallEx"
1313            )
1314
1315        |   AllocStore { size, output, ... } =>
1316                (stream "AllocStore "; stream(Int.toString size); stream " => "; printGReg output )
1317
1318        |   AllocStoreVariable { output, size, ...} =>
1319                (stream "AllocStoreVariable "; printGReg size; stream " => "; printGReg output )
1320        
1321        |   StoreInitialised => stream "StoreInitialised"
1322
1323        |   CallAddress source => (stream "CallAddress "; printRegOrMemoryArg printGReg source)
1324        |   JumpAddress source => (stream "JumpAddress "; printRegOrMemoryArg printGReg source)
1325
1326        |   ReturnFromFunction argsToRemove =>
1327                (stream "ReturnFromFunction "; stream(Int.toString argsToRemove))
1328
1329        |   RaiseException { workReg } => (stream "RaiseException "; printGReg workReg)
1330        |   UncondBranch(Label{labelNo, ...})=>
1331                (stream "UncondBranch L"; stream(Int.toString labelNo))
1332        |   ResetStack{numWords, preserveCC} =>
1333                (stream "ResetStack "; stream(Int.toString numWords); if preserveCC then stream " preserve CC" else ())
1334        |   JumpLabel(Label{labelNo, ...}) =>
1335                (stream "L"; stream(Int.toString labelNo); stream ":")
1336        |   LoadLabelAddress{ label=Label{labelNo, ...}, output } =>
1337                (stream "LoadLabelAddress L"; stream(Int.toString labelNo); stream "=>"; printGReg output)
1338        |   RepeatOperation repOp => (stream "Repeat "; stream(repOpsRepr repOp))
1339        |   DivideAccR{arg, isSigned, opSize} =>
1340                ( stream(if isSigned then "DivideSigned" else "DivideUnsigned"); stream(printOpSize opSize); stream " "; printGReg arg)
1341        |   DivideAccM{base, offset, isSigned, opSize} =>
1342                ( stream(if isSigned then "DivideSigned" else "DivideUnsigned"); stream(printOpSize opSize); stream " "; printBaseOffset(base, NoIndex, offset))
1343        |   AtomicXAdd{address, output, opSize} =>
1344                (stream "LockedXAdd"; stream(printOpSize opSize); printMemAddress address; stream " <=> "; printGReg output)
1345        |   FPLoadFromMemory{address, precision=DoublePrecision} => (stream "FPLoadDouble "; printMemAddress address)
1346        |   FPLoadFromMemory{address, precision=SinglePrecision} => (stream "FPLoadSingle "; printMemAddress address)
1347        |   FPLoadFromFPReg {source, lastRef} =>
1348                (stream "FPLoad "; printFPReg source; if lastRef then stream " (LAST)" else())
1349        |   FPLoadFromConst{constant, precision} =>
1350            (
1351                case precision of DoublePrecision => stream "FPLoadD " |  SinglePrecision => stream "FPLoadS";
1352                stream(Address.stringOfWord constant)
1353            )
1354        |   FPStoreToFPReg{ output, andPop } =>
1355                (if andPop then stream "FPStoreAndPop => " else stream "FPStore => "; printFPReg output)
1356        |   FPStoreToMemory{ address, precision=DoublePrecision, andPop: bool } =>
1357            (
1358                if andPop then stream "FPStoreDoubleAndPop => " else stream "FPStoreDouble => ";
1359                printMemAddress address
1360            )
1361        |   FPStoreToMemory{ address, precision=SinglePrecision, andPop: bool } =>
1362            (
1363                if andPop then stream "FPStoreSingleAndPop => " else stream "FPStoreSingle => ";
1364                printMemAddress address
1365            )
1366        |   FPArithR{ opc, source } => (stream(fpOpRepr opc); stream " "; printFPReg source)
1367        |   FPArithConst{ opc, source, precision } =>
1368                (stream(fpOpRepr opc); case precision of DoublePrecision => stream "D " |  SinglePrecision => stream "S "; stream(Address.stringOfWord source))
1369        |   FPArithMemory{ opc, base, offset, precision } =>
1370                (stream(fpOpRepr opc); case precision of DoublePrecision => stream "D " |  SinglePrecision => stream "S "; printBaseOffset(base, NoIndex, offset))
1371        |   FPUnary opc => stream(fpUnaryRepr opc)
1372        |   FPStatusToEAX => (stream "FPStatus "; printGReg eax)
1373        |   FPLoadInt { base, offset, opSize} =>
1374                (stream "FPLoadInt"; stream(printOpSize opSize); stream " "; printBaseOffset(base, NoIndex, offset))
1375        |   FPFree reg => (stream "FPFree "; printFPReg reg)
1376        |   MultiplyR {source, output, opSize } =>
1377                (stream "MultiplyR"; stream(printOpSize opSize); stream " "; printRegOrMemoryArg printGReg source; stream " *=>"; printGReg output)
1378        |   XMMArith { opc, source, output } =>
1379            (
1380                stream (sse2OpRepr opc ^ "RM "); printXMMReg output; stream " <= "; printRegOrMemoryArg printXMMReg source
1381            )
1382        |   XMMStoreToMemory { toStore, address, precision=DoublePrecision } =>
1383            (
1384                stream "MoveDouble "; printXMMReg toStore; stream " => "; printMemAddress address
1385            )
1386        |   XMMStoreToMemory { toStore, address, precision=SinglePrecision } =>
1387            (
1388                stream "MoveSingle "; printXMMReg toStore; stream " => "; printMemAddress address
1389            )
1390        |   XMMConvertFromInt { source, output, opSize } =>
1391            (
1392                stream "ConvertFromInt "; stream(printOpSize opSize); stream " "; printGReg source; stream " => "; printXMMReg output
1393            )
1394        |   SignExtendForDivide opSize => ( stream "SignExtendForDivide"; stream(printOpSize opSize) )
1395        |   XChng { reg, arg, opSize } =>
1396                (stream "XChng"; stream(printOpSize opSize); stream " "; printGReg reg; stream " <=> "; printRegOrMemoryArg printGReg arg)
1397        |   Negative { output, opSize } =>
1398                (stream "Negative"; stream(printOpSize opSize); stream " "; printGReg output)
1399        |   JumpTable{cases, ...} =>
1400                List.app(fn(Label{labelNo, ...}) => (stream "UncondBranch L"; stream(Int.toString labelNo); stream "\n")) cases
1401        |   IndexedJumpCalc { addrReg, indexReg, jumpSize=ref jumpSize } =>
1402            (
1403                stream "IndexedJumpCalc "; printGReg addrReg; stream " += "; printGReg indexReg;
1404                stream (case jumpSize of JumpSize2 => " * 2" | JumpSize8 => " * 8 ")
1405            )
1406        |   MoveXMMRegToGenReg { source, output } =>
1407            (
1408                stream "MoveXMMRegToGenReg "; printXMMReg source; stream " => "; printGReg output
1409            )
1410        |   MoveGenRegToXMMReg { source, output } =>
1411            (
1412                stream "MoveGenRegToXMMReg "; printGReg source; stream " => "; printXMMReg output
1413            )
1414        |   XMMShiftRight { output, shift } =>
1415            (
1416                stream "XMMShiftRight "; printXMMReg output; stream " by "; stream(Word8.toString shift)
1417            )
1418        |   FPLoadCtrlWord address =>
1419            (
1420                stream "FPLoadCtrlWord "; stream " => "; printMemAddress address
1421            )
1422        |   FPStoreCtrlWord address =>
1423            (
1424                stream "FPStoreCtrlWord "; stream " <= "; printMemAddress address
1425            )
1426        |   XMMLoadCSR address =>
1427            (
1428                stream "XMMLoadCSR "; stream " => "; printMemAddress address
1429            )
1430        |   XMMStoreCSR address =>
1431            (
1432                stream "XMMStoreCSR "; stream " <= "; printMemAddress address
1433            )
1434        |   FPStoreInt address =>
1435            (
1436                stream "FPStoreInt "; stream " <= "; printMemAddress address
1437            )
1438        |   XMMStoreInt{ source, output, precision, isTruncate } =>
1439            (
1440                stream "XMMStoreInt";
1441                case precision of SinglePrecision => stream "Single" | DoublePrecision => stream "Double";
1442                if isTruncate then stream "Truncate " else stream " ";
1443                printGReg output; stream " <= "; printRegOrMemoryArg printXMMReg source
1444            )
1445        |   CondMove { test, output, source, opSize } =>
1446            (
1447                stream "CondMove"; stream(branchOpRepr test); stream(printOpSize opSize);
1448                printGReg output; stream " <= "; printRegOrMemoryArg printGReg source 
1449            )
1450        |   LoadAbsolute { destination, value } =>
1451                ( stream "LoadAbsolute "; printGReg destination; stream " <= "; stream(Address.stringOfWord value) )
1452        ;
1453 
1454        stream "\n"
1455    end
1456
1457    datatype implement = ImplementGeneral | ImplementLiteral of machineWord
1458
1459    fun printLowLevelCode(ops, Code{printAssemblyCode, printStream, procName, ...}) =
1460        if printAssemblyCode
1461        then
1462        (
1463            if procName = "" (* No name *) then printStream "?" else printStream procName;
1464            printStream ":\n";
1465            List.app(fn i => printOperation(i, printStream)) ops;
1466            printStream "\n"
1467        )
1468        else ()
1469    
1470(*    val opLen = if isX64 then OpSize64 else OpSize32 *)
1471
1472    (* Code generate a list of operations.  The list is in reverse order i.e. last instruction first. *)
1473    fun codeGenerate ops =
1474    let
1475        fun cgOp(Move{source=RegisterArg source, destination=RegisterArg output, moveSize=Move64 }) =
1476                (* Move from one general register to another.  N.B. Because we're using the
1477                   "store" version of the Move the source and output are reversed. *)
1478                opReg(MOVL_R_A64, source, output)
1479
1480        |   cgOp(Move{source=RegisterArg source, destination=RegisterArg output, moveSize=Move32 }) =
1481                opReg(MOVL_R_A32, source, output)
1482
1483        |   cgOp(Move{ source=NonAddressConstArg source, destination=RegisterArg output, moveSize=Move64}) =
1484            if targetArch <> Native32Bit
1485            then
1486            (
1487                (* N.B. There is related code in getConstant that deals with PC-relative values and
1488                   also checks the range of constants that need to be in the constant area. *)
1489                if source >= 0 andalso source < 0x100000000
1490                then (* Unsigned 32 bits.  We can use a 32-bit instruction to set the
1491                        value because it will zero extend to 64-bits.
1492                        This may also allow us to save a rex byte. *)
1493                let
1494                    val (rc, rx) = getReg output
1495                    val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE)
1496                in
1497                    opb @ word32Unsigned(LargeWord.fromLargeInt source)
1498                end
1499                else if source >= ~0x80000000 andalso source < 0
1500                then (* Signed 32-bits. *)
1501                    (* This is not scanned in 64-bit mode because 32-bit values aren't
1502                       big enough to contain addresses. *)
1503                    opRegPlus2(MOVL_32_A64, output, 0w0) @ int32Signed source
1504                else (* Too big for 32-bits; put it in the non-word area. *)
1505                    opConstantOperand(MOVL_A_R64, output)
1506            )
1507            else (* 32-bit mode. *)
1508            (
1509                (* The RTS scans for possible addresses in MOV instructions so we
1510                   can only use MOV if this is a tagged value.  If it isn't we have
1511                   to use something else such as XOR/ADD.  In particular this is used
1512                   before LOCK XADD for atomic inc/dec.
1513                   We expect Move to preserve the CC so shouldn't use anything that
1514                   affects it.  There was a previous comment that said that using
1515                   LEA wasn't a good idea.  Perhaps because it takes 6 bytes. *)
1516                if source mod 2 = 0
1517                then opIndexed(LEAL32, source, NONE, NoIndex, output)
1518                else
1519                let
1520                    val (rc, rx) = getReg output
1521                    val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE)
1522                in
1523                    opb @ int32Signed source
1524                end
1525            )
1526
1527        |   cgOp(Move{ source=NonAddressConstArg source, destination=RegisterArg output, moveSize=Move32}) =
1528            if targetArch <> Native32Bit
1529            then
1530            (
1531                (* N.B. There is related code in getConstant that deals with PC-relative values and
1532                   also checks the range of constants that need to be in the constant area. *)
1533                if source >= 0 andalso source < 0x100000000
1534                then (* Unsigned 32 bits.  We can use a 32-bit instruction to set the
1535                        value because it will zero extend to 64-bits.
1536                        This may also allow us to save a rex byte. *)
1537                let
1538                    val (rc, rx) = getReg output
1539                    val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE)
1540                in
1541                    opb @ word32Unsigned(LargeWord.fromLargeInt source)
1542                end
1543                else if source >= ~0x80000000 andalso source < 0
1544                then (* Signed 32-bits. *)
1545                    (* This is not scanned in 64-bit mode because 32-bit values aren't
1546                       big enough to contain addresses. *)
1547                    opRegPlus2(MOVL_32_A64, output, 0w0) @ int32Signed source
1548                else (* Too big for 32-bits; put it in the non-word area. *)
1549                    opConstantOperand(MOVL_A_R64, output)
1550            )
1551            else (* 32-bit mode. *)
1552            (
1553                (* The RTS scans for possible addresses in MOV instructions so we
1554                   can only use MOV if this is a tagged value.  If it isn't we have
1555                   to use something else such as XOR/ADD.  In particular this is used
1556                   before LOCK XADD for atomic inc/dec.
1557                   We expect Move to preserve the CC so shouldn't use anything that
1558                   affects it.  There was a previous comment that said that using
1559                   LEA wasn't a good idea.  Perhaps because it takes 6 bytes. *)
1560                if source mod 2 = 0
1561                then opIndexed(LEAL32, source, NONE, NoIndex, output)
1562                else
1563                let
1564                    val (rc, rx) = getReg output
1565                    val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE)
1566                in
1567                    opb @ int32Signed source
1568                end
1569            )
1570
1571        |   cgOp(Move{ source=AddressConstArg _, destination=RegisterArg output, moveSize=Move64 }) =
1572            (
1573                (* The constant area is currently PolyWords.  That means we MUST use
1574                   a 32-bit load in 32-in-64. *)
1575                targetArch = Native64Bit orelse raise InternalError "Move64 in 32-bit";
1576                (* Put address constants in the constant area. *)
1577                opConstantOperand(MOVL_A_R64, output)
1578            )
1579 
1580        |   cgOp(Move{ source=AddressConstArg _, destination=RegisterArg output, moveSize=Move32 }) =
1581            (
1582                case targetArch of
1583                    Native64Bit => raise InternalError "Move32 - AddressConstArg"
1584                
1585                |   ObjectId32Bit =>
1586                        (* Put address constants in the constant area. *)
1587                        (* The constant area is currently PolyWords.  That means we MUST use
1588                           a 32-bit load in 32-in-64. *)
1589                        opConstantOperand(MOVL_A_R32, output)
1590
1591                |   Native32Bit =>
1592                        (* Immediate constant *)
1593                    let
1594                        val (rc, _) = getReg output
1595                    in
1596                        opCodeBytes(MOVL_32_R rc, NONE) @ int32Signed(tag 0)
1597                    end
1598            )
1599
1600        |   cgOp(LoadAbsolute{ destination, ... }) =
1601            (
1602                (* Immediate address constant.  This is currently only used the special case of loading
1603                   the address of PolyX86GetThreadData in a callback when we don't have rbx in 32-in-64. *)
1604                case targetArch of
1605                    Native32Bit =>
1606                    let
1607                        val (rc, _) = getReg destination
1608                    in
1609                        opCodeBytes(MOVL_32_R rc, NONE) @ int32Signed(tag 0)
1610                    end
1611                
1612                |   Native64Bit => opConstantOperand(MOVL_A_R64, destination)
1613                
1614                |   ObjectId32Bit =>
1615                    let
1616                        val (rc, rx) = getReg destination
1617                    in
1618                        opCodeBytes(MOVL_64_R rc, SOME{w=true, r=false, b=rx, x=false}) @ largeWordToBytes(LargeWord.fromLargeInt(tag 0), 8)
1619                    end
1620                    
1621            )
1622
1623        |   cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move32 }) =
1624                opAddress(MOVL_A_R32, LargeInt.fromInt offset, base, index, output)
1625
1626        |   cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move64 }) =
1627                opAddress(MOVL_A_R64, LargeInt.fromInt offset, base, index, output)
1628
1629        |   cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move8 }) =
1630                (* We don't need a REX.W bit here because the top 32-bits of a
1631                   64-bit register will always be zeroed. *)
1632                opAddress(MOVZB, LargeInt.fromInt offset, base, index, output)
1633
1634        |   cgOp(Move{source=RegisterArg source, destination=RegisterArg output, moveSize=Move8 }) =
1635            let
1636                (* Zero extend an 8-bit value in a register to 32/64 bits. *)
1637                val (rrC, rrX) = getReg output
1638                val (rbC, rbX) = getReg source
1639                (* We don't need a REX.W bit here because the top 32-bits of a
1640                   64-bit register will always be zeroed but we may need a REX byte
1641                   if we're using esi or edi. *)
1642                val rexByte =
1643                    if rrC < 0w4 andalso not rrX andalso not rbX
1644                    then NONE
1645                    else if hostIsX64
1646                    then SOME {w=false, r=rrX, b=rbX, x=false}
1647                    else raise InternalError "Move8 with esi/edi"
1648            in
1649                opCodeBytes(MOVZB, rexByte) @ [modrm(Register, rrC, rbC)]
1650            end
1651
1652        |   cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move8X32 }) =
1653                opAddress(MOVSXB32, LargeInt.fromInt offset, base, index, output)
1654
1655        |   cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move8X64 }) =
1656                (* But we will need a Rex.W here. *)
1657                opAddress(MOVSXB64, LargeInt.fromInt offset, base, index, output)
1658
1659        |   cgOp(Move{moveSize=Move16, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) =
1660                (* No need for Rex.W *)
1661                opAddress(MOVZW, LargeInt.fromInt offset, base, index, output)
1662
1663        |   cgOp(Move{moveSize=Move16X32, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) =
1664                opAddress(MOVSXW32, LargeInt.fromInt offset, base, index, output)
1665
1666        |   cgOp(Move{moveSize=Move16X64, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) =
1667                (* But we do need Rex.W here *)
1668                opAddress(MOVSXW64, LargeInt.fromInt offset, base, index, output)
1669
1670        |   cgOp(Move{moveSize=Move32X64, source=RegisterArg source, destination=RegisterArg output }) =
1671                (* We should have a REX.W bit here. *)
1672                opReg(MOVSXD, output, source)
1673
1674        |   cgOp(Move{moveSize=Move32X64, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) =
1675                (* We should have a REX.W bit here. *)
1676                opAddress(MOVSXD, LargeInt.fromInt offset, base, index, output)
1677
1678        |   cgOp(Move{moveSize=Move32X64, ...}) = raise InternalError "cgOp: LoadNonWord Size32Bit"
1679
1680        |   cgOp(LoadAddress{ offset, base, index, output, opSize }) =
1681                (* This provides a mixture of addition and multiplication in a single
1682                   instruction. *)
1683                opIndexed(case opSize of OpSize64 => LEAL64 | OpSize32 => LEAL32, LargeInt.fromInt offset, base, index, output)
1684
1685        |   cgOp(ArithToGenReg{ opc, output, source=RegisterArg source, opSize }) =
1686                arithOpReg (opc, output, source, opSize=OpSize64)
1687
1688        |   cgOp(ArithToGenReg{ opc, output, source=NonAddressConstArg source, opSize }) =
1689            let
1690                (* On the X86/32 we use CMP with literal sources to compare with an
1691                   address and the RTS searches for them in the code.  Any
1692                   non-address constant must be tagged.  Most will be but we
1693                   might want to use this to compare with the contents of a
1694                   LargeWord value. *)
1695                val _ =
1696                    if hostIsX64 orelse is8BitL source orelse opc <> CMP orelse IntInf.andb(source, 1) = 1
1697                    then ()
1698                    else raise InternalError "CMP with constant that looks like an address"
1699            in
1700                immediateOperand(opc, output, source, opSize)
1701            end
1702
1703        |   cgOp(ArithToGenReg{ opc, output, source=AddressConstArg _, opSize }) =
1704            (* This is only used for opc=CMP to compare addresses for equality. *)
1705            if hostIsX64
1706            then (* We use this in 32-in-64 as well as native 64-bit. *)
1707                opConstantOperand(
1708                    (case opSize of OpSize64 => Arith64 | OpSize32 => Arith32) (opc, 0w3), output)
1709            else
1710            let
1711                val (rc, _) = getReg output
1712                val opb = opCodeBytes(Group1_32_A32 (* group1, 32 bit immediate *), NONE)
1713                val mdrm = modrm(Register, arithOpToWord opc, rc)
1714            in
1715                opb @ [mdrm] @ int32Signed(tag 0)
1716            end
1717
1718        |   cgOp(ArithToGenReg{ opc, output, source=MemoryArg{offset, base, index}, opSize }) =
1719                opAddress((case opSize of OpSize64 => Arith64 | OpSize32 => Arith32) (opc, 0w3),
1720                    LargeInt.fromInt offset, base, index, output)
1721
1722        |   cgOp(ArithByteMemConst{ opc, address={offset, base, index}, source }) =
1723                opIndexedPlus2(Group1_8_a (* group1, 8 bit immediate *),
1724                               LargeInt.fromInt offset, base, index, arithOpToWord opc) @ [source]
1725
1726        |   cgOp(ArithMemConst{ opc, address={offset, base, index}, source, opSize }) =
1727                if is8BitL source
1728                then (* Can use one byte immediate *) 
1729                    opIndexedPlus2(case opSize of OpSize64 => Group1_8_A64 | OpSize32 => Group1_8_A32 (* group1, 8 bit immediate *),
1730                               LargeInt.fromInt offset, base, index, arithOpToWord opc) @ [Word8.fromLargeInt source]
1731                else (* Need 32 bit immediate. *)
1732                    opIndexedPlus2(case opSize of OpSize64 => Group1_32_A64 | OpSize32 => Group1_32_A32(* group1, 32 bit immediate *), 
1733                               LargeInt.fromInt offset, base, index, arithOpToWord opc) @ int32Signed source
1734
1735        |   cgOp(ArithMemLongConst{ opc, address={offset, base, index}, ... }) =
1736                (* Currently this is always a comparison.  It is only valid in 32-bit mode because
1737                   the constant is only 32-bits. *)
1738                if hostIsX64
1739                then raise InternalError "ArithMemLongConst in 64-bit mode"
1740                else 
1741                let
1742                    val opb = opIndexedPlus2 (Group1_32_A32, LargeInt.fromInt offset, base, index, arithOpToWord opc)
1743                in
1744                    opb @ int32Signed(tag 0)
1745                end
1746
1747        |   cgOp(ShiftConstant { shiftType, output, shift, opSize }) =
1748                if shift = 0w1
1749                then opRegPlus2(case opSize of OpSize64 => Group2_1_A64 | OpSize32 => Group2_1_A32, output, shiftTypeToWord shiftType)
1750                else opRegPlus2(case opSize of OpSize64 => Group2_8_A64 | OpSize32 => Group2_8_A32, output, shiftTypeToWord shiftType) @ [shift]
1751
1752        |   cgOp(ShiftVariable { shiftType, output, opSize }) =
1753                opRegPlus2(case opSize of OpSize64 => Group2_CL_A64 | OpSize32 => Group2_CL_A32, output, shiftTypeToWord shiftType)
1754
1755        |   cgOp(TestByteBits{arg=RegisterArg reg, bits}) =
1756            let
1757            (* Test the bottom bit and jump depending on its value.  This is used
1758               for tag tests in arbitrary precision operations and also for testing
1759               for short/long values. *)
1760                val (regNum, rx) = getReg reg
1761            in
1762                if reg = eax
1763                then (* Special instruction for testing accumulator.  Can use an 8-bit test. *)
1764                    opCodeBytes(TEST_ACC8, NONE) @ [bits]
1765                else if hostIsX64
1766                then 
1767                let
1768                    (* We can use a REX code to force it to always use the low order byte. *)
1769                    val opb = opCodeBytes(Group3_a,
1770                        if rx orelse regNum >= 0w4 then SOME{w=false, r=false, b=rx, x=false} else NONE)
1771                    val mdrm = modrm (Register, 0w0 (* test *), regNum)
1772                in
1773                    opb @ [mdrm, bits]
1774                end
1775                else if reg = ebx orelse reg = ecx orelse reg = edx (* can we use an 8-bit test? *)
1776                then (* Yes. The register value refers to low-order byte. *)
1777                let
1778                    val opb = opCodeBytes(Group3_a, NONE)
1779                    val mdrm = modrm(Register, 0w0 (* test *), regNum)
1780                in
1781                    opb @ [mdrm, bits]
1782                end
1783                else
1784                let
1785                    val opb = opCodeBytes(Group3_A32, NONE)
1786                    val mdrm = modrm (Register, 0w0 (* test *), regNum)
1787                in
1788                    opb @ mdrm :: word32Unsigned(Word8.toLarge bits)
1789                end
1790            end
1791
1792        |   cgOp(TestByteBits{arg=MemoryArg{base, offset, index}, bits}) =
1793                (* Test the tag bit and set the condition code. *)
1794                opIndexedPlus2(Group3_a, LargeInt.fromInt offset, base, index, 0w0 (* test *)) @ [ bits]
1795
1796        |   cgOp(TestByteBits _) = raise InternalError "cgOp: TestByteBits"                
1797
1798        |   cgOp(ConditionalBranch{ test=opc, ... }) = opCodeBytes(CondJump32 opc, NONE) @ word32Unsigned 0w0
1799        
1800        |   cgOp(SetCondition{ output, test}) =
1801            let
1802                val (rrC, rx) = getReg output
1803                (* In 64-bit mode we can specify the low-order byte of RSI/RDI but we
1804                   must use a REX prefix.  This isn't possible in 32-bit mode. *)
1805            in
1806                if hostIsX64 orelse rrC < 0w4
1807                then
1808                let
1809                    val opb = opCodeBytes(SetCC test,
1810                        if rx orelse rrC >= 0w4 then SOME{w=false, r=false, b=rx, x=false} else NONE)
1811                    val mdrm = modrm (Register, 0w0, rrC)
1812                in
1813                    opb @ [mdrm]
1814                end
1815                else raise InternalError "High byte register"
1816            end
1817
1818        |   cgOp(CallRTS{rtsEntry, saveRegs}) = rtsCall(rtsEntry, saveRegs)
1819
1820        |   cgOp(RepeatOperation repOp) =
1821            let
1822                (* We don't explicitly clear the direction flag.  Should that be done? *)
1823                val opb = opCodeBytes(REP, NONE)
1824                (* Put in a rex prefix to force 64-bit mode. *)
1825                val optRex =
1826                    if case repOp of STOS64 => true | MOVS64 => true | _ => false
1827                    then [rex{w=true, r=false, b=false, x=false}]
1828                    else []
1829                val repOp = repOpsToWord repOp
1830            in
1831                opb @ optRex @ [repOp]
1832            end
1833
1834        |   cgOp(DivideAccR{arg, isSigned, opSize}) =
1835                opRegPlus2(case opSize of OpSize64 => Group3_A64 | OpSize32 => Group3_A32, arg, if isSigned then 0w7 else 0w6)
1836
1837        |   cgOp(DivideAccM{base, offset, isSigned, opSize}) =
1838                opPlus2(case opSize of OpSize64 => Group3_A64 | OpSize32 => Group3_A32, LargeInt.fromInt offset, base, if isSigned then 0w7 else 0w6)
1839
1840        |   cgOp(AtomicXAdd{address={offset, base, index}, output, opSize}) =
1841                (* Locked exchange-and-add.  We need the lock prefix before the REX prefix. *)
1842                opAddress(case opSize of OpSize64 => LOCK_XADD64 | OpSize32 => LOCK_XADD32, LargeInt.fromInt offset, base, index, output)
1843
1844        |   cgOp(PushToStack(RegisterArg reg)) =
1845            let
1846                val (rc, rx) = getReg reg
1847            in
1848                (* Always 64-bit but a REX prefix may be needed for the register. *)
1849                opCodeBytes(PUSH_R rc, if rx then SOME{w=false, b = true, x=false, r = false } else NONE)
1850            end
1851
1852        |   cgOp(PushToStack(MemoryArg{base, offset, index})) =
1853                opAddressPlus2(Group5, LargeInt.fromInt offset, base, index, 0w6 (* push *))
1854
1855        |   cgOp(PushToStack(NonAddressConstArg constnt)) = 
1856                if is8BitL constnt
1857                then opCodeBytes(PUSH_8, NONE) @ [Word8.fromLargeInt constnt]
1858                else if is32bit constnt
1859                then opCodeBytes(PUSH_32, NONE) @ int32Signed constnt
1860                else (* It won't fit in the immediate; put it in the non-address area. *)
1861                let
1862                    val opb = opCodeBytes(Group5, NONE)
1863                    val mdrm = modrm(Based0, 0w6 (* push *), 0w5 (* PC rel *))
1864                in
1865                    opb @ [mdrm] @ int32Signed(tag 0)
1866                end
1867
1868        |   cgOp(PushToStack(AddressConstArg _)) =
1869            (
1870                case targetArch of
1871                    Native64Bit => (* Put it in the constant area. *)
1872    		        let
1873                        val opb = opCodeBytes(Group5, NONE)
1874                        val mdrm = modrm(Based0, 0w6 (* push *), 0w5 (* PC rel *));
1875                    in
1876                        opb @ [mdrm] @ int32Signed(tag 0)
1877                    end
1878                |   Native32Bit => opCodeBytes(PUSH_32, NONE) @ int32Signed(tag 0)
1879                |   ObjectId32Bit =>
1880                    (* We can't do this.  The constant area contains 32-bit quantities
1881                       and 32-bit literals are sign-extended rather than zero-extended. *)
1882                        raise InternalError "PushToStack:AddressConstArg"
1883           )
1884
1885        |   cgOp(PopR reg ) =
1886                let
1887                    val (rc, rx) = getReg reg
1888                in
1889                    (* Always 64-bit but a REX prefix may be needed for the register.
1890                       Because the register is encoded in the instruction the rex bit for
1891                       the register is b not r. *)
1892                    opCodeBytes(POP_R rc, if rx then SOME{w=false, b = true, x=false, r = false } else NONE)
1893                end
1894
1895        |   cgOp(Move{source=RegisterArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move64}) =
1896                opAddress(MOVL_R_A64, LargeInt.fromInt offset, base, index, toStore)
1897
1898        |   cgOp(Move{source=RegisterArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move32}) =
1899                opAddress(MOVL_R_A32, LargeInt.fromInt offset, base, index, toStore)
1900
1901        |   cgOp(Move{source=NonAddressConstArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move64 }) =
1902            (
1903                (* Short constant.  In 32-bit mode this is scanned as a possible address.  That means
1904                   we can't have an untagged constant in it.  That's not a problem in 64-bit mode.
1905                   There's a special check for using this to set the length word on newly allocated
1906                   memory. *)
1907                targetArch <> Native32Bit orelse toStore = 0 orelse toStore mod 2 = 1 orelse offset = ~ (Word.toInt wordSize)
1908                    orelse raise InternalError "cgOp: StoreConstToMemory not tagged";
1909                opAddressPlus2(MOVL_32_A64, LargeInt.fromInt offset, base, index, 0w0) @ int32Signed toStore
1910            )
1911
1912        |   cgOp(Move{source=NonAddressConstArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move32 }) =
1913            (
1914                (* Short constant.  In 32-bit mode this is scanned as a possible address.  That means
1915                   we can't have an untagged constant in it.  That's not a problem in 64-bit mode.
1916                   There's a special check for using this to set the length word on newly allocated
1917                   memory. *)
1918                targetArch <> Native32Bit orelse toStore = 0 orelse toStore mod 2 = 1 orelse offset = ~ (Word.toInt wordSize)
1919                    orelse raise InternalError "cgOp: StoreConstToMemory not tagged";
1920                opAddressPlus2(MOVL_32_A32, LargeInt.fromInt offset, base, index, 0w0) @ int32Signed toStore
1921            )
1922
1923        |   cgOp(Move{source=AddressConstArg _, destination=MemoryArg{offset, base, index}, moveSize=Move32}) =
1924                (* This is not used for addresses even in 32-in-64.  We don't scan for addresses after MOVL_32_A. *)
1925                if targetArch <> Native32Bit
1926                then raise InternalError "StoreLongConstToMemory in 64-bit mode"
1927                else opAddressPlus2(MOVL_32_A32, LargeInt.fromInt offset, base, index, 0w0) @ int32Signed (tag 0)
1928
1929        |   cgOp(Move{source=AddressConstArg _, destination=MemoryArg _, ...}) =
1930                raise InternalError "cgOp: Move - AddressConstArg => MemoryArg"
1931
1932        |   cgOp(Move{ moveSize = Move8, source=RegisterArg toStore, destination=MemoryArg{offset, base, index} }) =
1933            let
1934                val (rrC, _) = getReg toStore
1935                (* In 64-bit mode we can specify the low-order byte of RSI/RDI but we
1936                   must use a REX prefix.  This isn't possible in 32-bit mode. *)
1937                val opcode =
1938                    if hostIsX64 then MOVB_R_A64{forceRex= rrC >= 0w4}
1939                    else if rrC < 0w4 then MOVB_R_A32
1940                    else raise InternalError "High byte register"
1941            in
1942                opAddress(opcode, LargeInt.fromInt offset, base, index, toStore)
1943            end
1944
1945        |   cgOp(Move{ moveSize = Move16, source=RegisterArg toStore, destination=MemoryArg{offset, base, index}}) =
1946                opAddress(MOVL_R_A16, LargeInt.fromInt offset, base, index, toStore)
1947
1948        |   cgOp(Move{ moveSize = Move8, source=NonAddressConstArg toStore, destination=MemoryArg{offset, base, index}}) =
1949                opAddressPlus2(MOVB_8_A, LargeInt.fromInt offset, base, index, 0w0) @
1950                    [Word8.fromLargeInt toStore]
1951
1952        |   cgOp(Move _) = raise InternalError "Move: Unimplemented arguments"
1953
1954            (* Allocation is dealt with by expanding the code. *)
1955        |   cgOp(AllocStore _) = raise InternalError "cgOp: AllocStore"
1956
1957        |   cgOp(AllocStoreVariable _) = raise InternalError "cgOp: AllocStoreVariable"
1958
1959        |   cgOp StoreInitialised = raise InternalError "cgOp: StoreInitialised"
1960
1961        |   cgOp(CallAddress(NonAddressConstArg _)) = (* Call to the start of the code.  Offset is patched in later. *)
1962                opCodeBytes (CALL_32, NONE) @ int32Signed 0
1963
1964        |   cgOp(CallAddress(AddressConstArg _)) =
1965            if targetArch = Native64Bit
1966            then
1967            let
1968                val opc = opCodeBytes(Group5, NONE)
1969                val mdrm = modrm(Based0, 0w2 (* call *), 0w5 (* PC rel *))
1970            in
1971                opc @ [mdrm] @ int32Signed(tag 0)
1972            end
1973            (* Because this is a relative branch we need to point this at itself.
1974               Until it is set to the relative offset of the destination it
1975               needs to contain an address within the code and this could
1976               be the last instruction. *)
1977            else opCodeBytes (CALL_32, NONE) @ int32Signed ~5
1978            
1979        |   cgOp(CallAddress(RegisterArg reg)) = opRegPlus2(Group5, reg, 0w2 (* call *))
1980     
1981        |   cgOp(CallAddress(MemoryArg{base, offset, index})) =
1982                opAddressPlus2(Group5, LargeInt.fromInt offset, base, index, 0w2 (* call *))
1983
1984        |   cgOp(JumpAddress(NonAddressConstArg _)) =
1985                (* Jump to the start of the current function.  Offset is patched in later. *)
1986                opCodeBytes (JMP_32, NONE) @ int32Signed 0
1987
1988        |   cgOp(JumpAddress (AddressConstArg _)) =
1989            if targetArch = Native64Bit
1990            then
1991            let
1992                val opb = opCodeBytes (Group5, NONE)
1993                val mdrm = modrm(Based0, 0w4 (* jmp *), 0w5 (* PC rel *))
1994            in
1995                opb @ [mdrm] @ int32Signed(tag 0)
1996            end
1997            else opCodeBytes (JMP_32, NONE) @ int32Signed ~5 (* As with Call. *)
1998            
1999        |   cgOp(JumpAddress (RegisterArg reg)) =
2000            (* Used as part of indexed case - not for entering a function. *)
2001                opRegPlus2(Group5, reg, 0w4 (* jmp *))
2002
2003        |   cgOp(JumpAddress(MemoryArg{base, offset, index})) =
2004                opAddressPlus2(Group5, LargeInt.fromInt offset, base, index, 0w4 (* jmp *))
2005    
2006        |   cgOp(ReturnFromFunction args) =
2007            if args = 0
2008            then opCodeBytes(RET, NONE)
2009            else
2010            let
2011                val offset = Word.fromInt args * nativeWordSize
2012            in
2013                opCodeBytes(RET_16, NONE) @ [wordToWord8 offset, wordToWord8(offset >> 0w8)]
2014            end
2015
2016        |   cgOp (RaiseException { workReg }) =
2017                opEA(if hostIsX64 then MOVL_A_R64 else MOVL_A_R32, LargeInt.fromInt memRegHandlerRegister, ebp, workReg) @
2018                    opAddressPlus2(Group5, 0, workReg, NoIndex, 0w4 (* jmp *))
2019
2020        |   cgOp(UncondBranch _) = opToInt JMP_32 :: word32Unsigned 0w0
2021
2022        |   cgOp(ResetStack{numWords, preserveCC}) =
2023            let
2024                val bytes = Word.toLargeInt(Word.fromInt numWords * nativeWordSize)
2025            in
2026                (* If we don't need to preserve the CC across the reset we use ADD since
2027                   it's shorter. *)
2028                if preserveCC
2029                then opEA(if hostIsX64 then LEAL64 else LEAL32, bytes, esp, esp)
2030                else immediateOperand(ADD, esp, bytes, if hostIsX64 then OpSize64 else OpSize32)
2031            end
2032
2033        |   cgOp(JumpLabel _) = [] (* No code. *)
2034
2035        |   cgOp(LoadLabelAddress{ output, ... }) =
2036            (* Load the address of a label.  Used when setting up an exception handler or
2037               in indexed cases. *)
2038            (* On X86/64 we can use pc-relative addressing to set the start of the handler.
2039               On X86/32 we have to load the address of the start of the code and add an offset. *)
2040            if hostIsX64
2041            then opConstantOperand(LEAL64, output)
2042            else
2043            let
2044                val (rc, _) = getReg output
2045            in
2046                opCodeBytes(MOVL_32_R rc , NONE) @ int32Signed(tag 0) @
2047                        opRegPlus2(Group1_32_A32, output, arithOpToWord ADD) @ int32Signed 0
2048            end
2049
2050        |   cgOp (FPLoadFromMemory {address={ base, offset, index }, precision}) =
2051            let
2052                val loadInstr =
2053                    case precision of
2054                        DoublePrecision => FPESC 0w5
2055                    |   SinglePrecision => FPESC 0w1
2056            in
2057                opAddressPlus2(loadInstr, LargeInt.fromInt offset, base, index, 0wx0)
2058            end
2059
2060        |   cgOp (FPLoadFromFPReg{source=FloatingPtReg fp, ...}) =
2061                (* Assume there's nothing currently on the stack. *)
2062                floatingPtOp({escape=0w1, md=0w3, nnn=0w0, rm= fp + 0w0}) (* FLD ST(r1) *)
2063
2064        |   cgOp (FPLoadFromConst {precision, ...} ) =
2065            (* The real constant here is actually the address of a memory
2066               object.  FLD takes the address as the argument and in 32-bit mode
2067               we use an absolute address.  In 64-bit mode we need to put the
2068               constant at the end of the code segment and use PC-relative
2069               addressing which happens to be encoded in the same way.
2070               There are special cases for zero and one but it's probably too
2071               much work to detect them. *)
2072            let
2073                val esc = case precision of SinglePrecision => 0w1 | DoublePrecision => 0w5
2074                val opb = opCodeBytes(FPESC esc, NONE) (* FLD [Constant] *)
2075                val mdrm = modrm (Based0, 0w0, 0w5 (* constant address/PC-relative *))
2076            in
2077                opb @ [mdrm] @ int32Signed(tag 0)
2078            end
2079
2080        |   cgOp (FPStoreToFPReg{ output=FloatingPtReg dest, andPop }) =
2081                (* Assume there's one item on the stack. *)
2082                floatingPtOp({escape=0w5, md=0w3, nnn=if andPop then 0wx3 else 0wx2,
2083                               rm = dest+0w1(* One item *)}) (* FSTP ST(n+1) *)
2084
2085        |   cgOp (FPStoreToMemory{address={ base, offset, index}, precision, andPop }) =
2086            let
2087                val storeInstr =
2088                    case precision of
2089                        DoublePrecision => FPESC 0w5
2090                    |   SinglePrecision => FPESC 0w1
2091                val subInstr = if andPop then 0wx3 else 0wx2
2092            in
2093                opAddressPlus2(storeInstr, LargeInt.fromInt offset, base, index, subInstr)
2094            end
2095
2096        |   cgOp (FPArithR{ opc, source = FloatingPtReg src}) =
2097                floatingPtOp({escape=0w0, md=0w3, nnn=fpOpToWord opc,
2098                        rm=src + 0w1 (* One item already there *)})
2099
2100        |   cgOp (FPArithConst{ opc, precision, ... }) =
2101                (* See comment on FPLoadFromConst *)
2102            let
2103                val fpesc = case precision of DoublePrecision => 0w4 | SinglePrecision => 0w0
2104                val opb = opCodeBytes(FPESC fpesc, NONE) (* FADD etc [constnt] *)
2105                val mdrm = modrm (Based0, fpOpToWord opc, 0w5 (* constant address *))
2106            in
2107                opb @ [mdrm] @ int32Signed(tag 0)
2108            end
2109
2110        |   cgOp (FPArithMemory{ opc, base, offset, precision }) =
2111            let
2112                val fpesc = case precision of DoublePrecision => 0w4 | SinglePrecision => 0w0
2113            in
2114                opPlus2(FPESC fpesc, LargeInt.fromInt offset, base, fpOpToWord opc) (* FADD/FMUL etc [r2] *)
2115            end
2116
2117        |   cgOp (FPUnary opc ) =
2118            let
2119                val {rm, nnn} = fpUnaryToWords opc
2120            in
2121                floatingPtOp({escape=0w1, md=0w3, nnn=nnn, rm=rm}) (* FCHS etc *)
2122            end
2123
2124        |   cgOp (FPStatusToEAX ) =
2125                opCodeBytes(FPESC 0w7, NONE) @ [0wxe0] (* FNSTSW AX *)
2126
2127        |   cgOp (FPFree(FloatingPtReg reg)) =
2128                floatingPtOp({escape=0w5, md=0w3, nnn=0w0, rm=reg}) (* FFREE FP(n) *)
2129
2130        |   cgOp (FPLoadInt{base, offset, opSize=OpSize64}) =
2131                (* fildl (esp) in 32-bit mode or fildq (esp) in 64-bit mode. *)
2132                opPlus2(FPESC 0w7, LargeInt.fromInt offset, base, 0w5)
2133
2134        |   cgOp (FPLoadInt{base, offset, opSize=OpSize32}) =
2135                (* fildl (esp) in 32-bit mode or fildq (esp) in 64-bit mode. *)
2136                opPlus2(FPESC 0w3, LargeInt.fromInt offset, base, 0w0)
2137
2138        |   cgOp (MultiplyR {source=RegisterArg srcReg, output, opSize}) =
2139                (* We use the 0F AF form of IMUL rather than the Group3 MUL or IMUL
2140                   because the former allows us to specify the destination register.
2141                   The Group3 forms produce double length results in RAX:RDX/EAX:EDX
2142                   but we only ever want the low-order half. *)
2143                opReg(case opSize of OpSize64 => IMUL64 | OpSize32 => IMUL32 (* 2 byte opcode *), output, srcReg)
2144
2145        |   cgOp (MultiplyR {source=MemoryArg{base, offset, index}, output, opSize}) =
2146                (* This may be used for large-word multiplication. *)
2147                opAddress(case opSize of OpSize64 => IMUL64 | OpSize32 => IMUL32 (* 2 byte opcode *), LargeInt.fromInt offset, base, index, output)
2148        
2149        |   cgOp(MultiplyR {source=NonAddressConstArg constnt, output, opSize}) =
2150                (* If the constant is an 8-bit or 32-bit value we are actually using a
2151                   three-operand instruction where the argument can be a register or memory
2152                   and the destination register does not need to be the same as the source. *)
2153                if is8BitL constnt
2154                then opReg(case opSize of OpSize64 => IMUL_C8_64 | OpSize32 => IMUL_C8_32, output, output) @ [Word8.fromLargeInt constnt]
2155                else if is32bit constnt
2156                then opReg(case opSize of OpSize64 => IMUL_C32_64 | OpSize32 => IMUL_C32_32, output, output) @ int32Signed constnt
2157                else opConstantOperand(case opSize of OpSize64 => IMUL64 | OpSize32 => IMUL32, output)
2158
2159        |   cgOp(MultiplyR {source=AddressConstArg _, ...}) =
2160                raise InternalError "Multiply - address constant"
2161
2162        |   cgOp (XMMArith { opc, source=MemoryArg{base, offset, index}, output }) =
2163                mMXAddress(SSE2Ops opc, LargeInt.fromInt offset, base, index, output)
2164
2165        |   cgOp (XMMArith { opc, source=AddressConstArg _, output=SSE2Reg rrC }) =
2166            let
2167                (* The real constant here is actually the address of an 8-byte memory
2168                   object.  In 32-bit mode we put this address into the code and retain
2169                   this memory object.  In 64-bit mode we copy the real value out of the
2170                   memory object into the non-address constant area and use
2171                   PC-relative addressing.  These happen to be encoded the same
2172                   way. *)
2173                val opb = opCodeBytes(SSE2Ops opc, NONE)
2174                val mdrm = modrm (Based0, rrC, 0w5 (* constant address/PC-relative *))
2175            in
2176                opb @ [mdrm] @ int32Signed(tag 0)
2177            end
2178
2179        |   cgOp (XMMArith { opc, source=RegisterArg(SSE2Reg rrS), output=SSE2Reg rrC }) =
2180            let
2181                val oper = SSE2Ops opc
2182                val pref = opcodePrefix oper
2183                val esc = escapePrefix oper
2184                val opc = opToInt oper
2185                val mdrm = modrm(Register, rrC, rrS)
2186            in
2187                pref @ esc @ [opc, mdrm]
2188            end
2189
2190        |   cgOp (XMMArith { opc, source=NonAddressConstArg _, output=SSE2Reg rrC }) =
2191            let
2192                val _ = hostIsX64 orelse raise InternalError "XMMArith-NonAddressConstArg in 32-bit mode"
2193                (* This is currently used for 32-bit float arguments but can equally be
2194                   used for 64-bit values since the actual argument will always be put
2195                   in the 64-bit constant area. *)
2196                val opb = opCodeBytes(SSE2Ops opc, NONE)
2197                val mdrm = modrm (Based0, rrC, 0w5 (* constant address/PC-relative *))
2198            in
2199                opb @ [mdrm] @ int32Signed(tag 0)
2200            end
2201
2202        |   cgOp (XMMStoreToMemory { toStore, address={base, offset, index}, precision }) =
2203            let
2204                val oper =
2205                    case precision of
2206                        DoublePrecision => SSE2StoreDouble
2207                    |   SinglePrecision => SSE2StoreSingle
2208            in
2209                mMXAddress(oper, LargeInt.fromInt offset, base, index, toStore)
2210            end
2211
2212        |   cgOp (XMMConvertFromInt { source, output=SSE2Reg rrC, opSize }) =
2213            let
2214                (* The source is a general register and the output a XMM register. *)
2215                (* TODO: The source can be a memory location. *)
2216                val (rbC, rbX) = getReg source
2217                val oper = case opSize of OpSize64 => CVTSI2SD64 | OpSize32 => CVTSI2SD32
2218            in
2219                (* This is a special case with both an XMM and general register. *)
2220                opcodePrefix oper @ rexByte(oper, false, rbX, false) @
2221                    escapePrefix oper @ [opToInt oper, modrm(Register, rrC, rbC)]
2222            end
2223
2224        |   cgOp (SignExtendForDivide OpSize64) =
2225                opCodeBytes(CQO_CDQ64, SOME {w=true, r=false, b=false, x=false})
2226
2227        |   cgOp (SignExtendForDivide OpSize32) =
2228                opCodeBytes(CQO_CDQ32, NONE)
2229            
2230        |   cgOp (XChng { reg, arg=RegisterArg regY, opSize }) =
2231                opReg(case opSize of OpSize64 => XCHNG64 | OpSize32 => XCHNG32, reg, regY)
2232            
2233        |   cgOp (XChng { reg, arg=MemoryArg{offset, base, index}, opSize }) =
2234                opAddress(case opSize of OpSize64 => XCHNG64 | OpSize32 => XCHNG32, LargeInt.fromInt offset, base, index, reg)
2235                
2236        |   cgOp (XChng _) = raise InternalError "cgOp: XChng"
2237
2238        |   cgOp (Negative {output, opSize}) =
2239                opRegPlus2(case opSize of OpSize64 => Group3_A64 | OpSize32 => Group3_A32, output, 0w3 (* neg *))
2240
2241        |   cgOp (JumpTable{cases, jumpSize=ref jumpSize}) =
2242            let
2243                val _ = jumpSize = JumpSize8 orelse raise InternalError "cgOp: JumpTable"
2244                (* Make one jump for each case and pad it 8 bytes with Nops. *)
2245                fun makeJump (_, l) = opToInt JMP_32 :: word32Unsigned 0w0 @ [opToInt NOP, opToInt NOP, opToInt NOP] @ l
2246            in
2247                List.foldl makeJump [] cases
2248            end
2249
2250        |   cgOp(IndexedJumpCalc{ addrReg, indexReg, jumpSize=ref jumpSize }) =
2251            (
2252                jumpSize = JumpSize8 orelse raise InternalError "cgOp: IndexedJumpCalc";
2253                (* Should currently be JumpSize8 which requires a multiplier of 4 and
2254                   4 to be subtracted to remove the shifted tag. *)
2255                opAddress(if hostIsX64 then LEAL64 else LEAL32, ~4, addrReg, Index4 indexReg, addrReg)
2256            )
2257
2258        |   cgOp(MoveXMMRegToGenReg { source=SSE2Reg rrC, output }) =
2259            let
2260                (* The source is a XMM register and the output a general register. *)
2261                val (rbC, rbX) = getReg output
2262                val oper = MOVDFromXMM
2263            in
2264                (* This is a special case with both an XMM and general register. *)
2265                opcodePrefix oper @ rexByte(oper, false, rbX, false) @
2266                    escapePrefix oper @ [opToInt oper, modrm(Register, rrC, rbC)]
2267            end
2268
2269        |   cgOp(MoveGenRegToXMMReg { source, output=SSE2Reg rrC }) =
2270            let
2271                (* The source is a general register and the output a XMM register. *)
2272                val (rbC, rbX) = getReg source
2273                val oper = MOVQToXMM
2274            in
2275                (* This is a special case with both an XMM and general register. *)
2276                (* This needs to move the whole 64-bit value.  TODO: This is inconsistent
2277                   with MoveXMMRegToGenReg *)
2278                opcodePrefix oper @ rexByte(oper, false, rbX, false) @
2279                    escapePrefix oper @ [opToInt oper, modrm(Register, rrC, rbC)]
2280            end
2281
2282        |   cgOp(XMMShiftRight { output=SSE2Reg rrC, shift }) =
2283            let
2284                val oper = PSRLDQ
2285            in
2286                opcodePrefix oper @ escapePrefix oper @ [opToInt oper, modrm(Register, 0w3, rrC), shift]
2287            end
2288
2289        |   cgOp(FPLoadCtrlWord {base, offset, index}) =
2290                opIndexedPlus2(FPESC 0w1, LargeInt.fromInt offset, base, index, 0w5)
2291
2292        |   cgOp(FPStoreCtrlWord  {base, offset, index}) =
2293                opIndexedPlus2(FPESC 0w1, LargeInt.fromInt offset, base, index, 0w7)
2294
2295        |   cgOp(XMMLoadCSR  {base, offset, index}) =
2296                opIndexedPlus2(LDSTMXCSR, LargeInt.fromInt offset, base, index, 0w2)
2297
2298        |   cgOp(XMMStoreCSR  {base, offset, index}) =
2299                opIndexedPlus2(LDSTMXCSR, LargeInt.fromInt offset, base, index, 0w3)
2300
2301        |   cgOp(FPStoreInt {base, offset, index}) =
2302                (* fistp dword ptr [esp] in 32-bit mode or fistp qword ptr [rsp] in 64-bit mode. *)
2303                if hostIsX64
2304                then opIndexedPlus2(FPESC 0w7, LargeInt.fromInt offset, base, index, 0w7)
2305                else opIndexedPlus2(FPESC 0w3, LargeInt.fromInt offset, base, index, 0w3)
2306
2307        |   cgOp(XMMStoreInt {source, output, precision, isTruncate}) =
2308            let
2309                (* The destination is a general register.  The source is an XMM register or memory. *)
2310                val (rbC, rbX) = getReg output
2311                val oper =
2312                    case (hostIsX64, precision, isTruncate) of
2313                        (false, DoublePrecision, false) => CVTSD2SI32
2314                    |   (true, DoublePrecision, false)  => CVTSD2SI64
2315                    |   (false, SinglePrecision, false) => CVTSS2SI32
2316                    |   (true, SinglePrecision, false)  => CVTSS2SI64
2317                    |   (false, DoublePrecision, true)  => CVTTSD2SI32
2318                    |   (true, DoublePrecision, true)   => CVTTSD2SI64
2319                    |   (false, SinglePrecision, true)  => CVTTSS2SI32
2320                    |   (true, SinglePrecision, true)   => CVTTSS2SI64
2321            in
2322                case source of
2323                    MemoryArg{base, offset, index} =>
2324                        opAddress(oper, LargeInt.fromInt offset, base, index, output)
2325                |   RegisterArg(SSE2Reg rrS) =>
2326                        opcodePrefix oper @ rexByte(oper, rbX, false, false) @
2327                            escapePrefix oper @ [opToInt oper, modrm(Register, rbC, rrS)]
2328                |   _ => raise InternalError "XMMStoreInt: Not register or memory"
2329            end
2330
2331        |   cgOp(CondMove { test, output, source=RegisterArg source, opSize=OpSize32 }) =
2332                opReg(CMOV32 test, output, source)
2333
2334        |   cgOp(CondMove { test, output, source=RegisterArg source, opSize=OpSize64 }) =
2335                opReg(CMOV64 test, output, source)
2336
2337        |   cgOp(CondMove { test, output, source=NonAddressConstArg _, opSize }) =
2338            (
2339                (* We currently support only native-64 bit and put the constant in the
2340                   non-address constant area.  These are 64-bit values both in native
2341                   64-bit and in 32-in-64.  To support it in 32-bit mode we'd have to
2342                   put the constant in a single-word object and put its absolute
2343                   address into the code. *)
2344                targetArch <> Native32Bit orelse
2345                            raise InternalError "CondMove: constant in 32-bit mode";
2346                opConstantOperand((case opSize of OpSize32 => CMOV32 | OpSize64 => CMOV64) test, output)
2347            )
2348
2349        |   cgOp(CondMove { test, output, source=AddressConstArg _, opSize=OpSize64 }) =
2350                (* An address constant.  The opSize must match the size of a polyWord since
2351                   the value it going into the constant area. *)
2352            (
2353                targetArch = Native64Bit orelse raise InternalError "CondMove: AddressConstArg";
2354                opConstantOperand(CMOV64 test, output)
2355            )
2356
2357        |   cgOp(CondMove { test, output, source=AddressConstArg _, opSize=OpSize32 }) =
2358            (
2359                (* We only support address constants in 32-in-64. *)
2360                targetArch = ObjectId32Bit orelse raise InternalError "CondMove: AddressConstArg";
2361                opConstantOperand(CMOV32 test, output)
2362            )
2363
2364        |   cgOp(CondMove { test, output, source=MemoryArg{base, offset, index}, opSize=OpSize32 }) =
2365                opAddress(CMOV32 test, LargeInt.fromInt offset, base, index, output)
2366
2367        |   cgOp(CondMove { test, output, source=MemoryArg{base, offset, index}, opSize=OpSize64 }) =
2368                opAddress(CMOV64 test, LargeInt.fromInt offset, base, index, output)
2369
2370    in
2371        List.rev(List.foldl (fn (c, list) => Word8Vector.fromList(cgOp c) :: list) [] ops)
2372    end
2373    
2374    (* General function to process the code.  ic is the byte counter within the original code. *)
2375    fun foldCode foldFn n (ops, byteList) =
2376    let
2377        fun doFold(oper :: operList, bytes :: byteList, ic, acc) =
2378            doFold(operList, byteList, ic + Word.fromInt(Word8Vector.length bytes),
2379                foldFn(oper, bytes, ic, acc))
2380        |   doFold(_, _, _, n) = n
2381    in
2382        doFold(ops, byteList, 0w0, n)
2383    end
2384
2385    (* Go through the code and update branch and similar instructions with the destinations
2386       of the branches.  Long branches are converted to short where possible and the code
2387       is reprocessed.  That might repeat if the effect of shorting one branch allows
2388       another to be shortened. *)
2389    fun fixupLabels(ops, bytesList, labelCount) =
2390    let
2391        (* Label array - initialise to 0wxff... .  Every label should be defined
2392           but just in case, this is more likely to be detected in int32Signed. *)
2393        val labelArray = Array.array(labelCount, ~ 0w1)
2394
2395        (* First pass - Set the addresses of labels. *)
2396        fun setLabelAddresses(oper :: operList, bytes :: byteList, ic) =
2397            (
2398                case oper of
2399                    JumpLabel(Label{labelNo, ...}) => Array.update(labelArray, labelNo, ic)
2400                |   _ => ();
2401                setLabelAddresses(operList, byteList, ic + Word.fromInt(Word8Vector.length bytes))
2402            )
2403        |   setLabelAddresses(_, _, ic) = ic (* Return the length of the code. *)
2404
2405        fun fixup32(destination, bytes, ic) =
2406        let
2407            val brLength = Word8Vector.length bytes
2408            (* The offset is relative to the end of the branch instruction. *)
2409            val diff = Word.toInt destination - Word.toInt ic - brLength
2410        in
2411            Word8VectorSlice.concat[
2412                Word8VectorSlice.slice(bytes, 0, SOME(brLength-4)), (* The original opcode. *)
2413                Word8VectorSlice.full(Word8Vector.fromList(int32Signed(LargeInt.fromInt diff)))
2414            ]
2415        end
2416        
2417        fun fixupAddress(UncondBranch(Label{labelNo, ...}), bytes, ic, list) =
2418            let
2419                val destination = Array.sub(labelArray, labelNo)
2420                val brLength = Word8Vector.length bytes
2421                (* The offset is relative to the end of the branch instruction. *)
2422                val diff = Word.toInt destination - Word.toInt ic - brLength
2423            in
2424                if brLength = 2
2425                then (* It's a short branch.  Take the original operand and set the relative offset. *)
2426                    Word8Vector.fromList [opToInt JMP_8, byteSigned diff] :: list
2427                else if brLength <> 5
2428                then raise InternalError "fixupAddress"
2429                else (* 32-bit offset.  If it will fit in a byte we can use a short branch.
2430                        If this is a reverse branch we can actually use values up to -131
2431                        here because we've calculated using the end of the long branch. *)
2432                    if diff <= 127 andalso diff >= ~(128 + 3)
2433                then Word8Vector.fromList [opToInt JMP_8, 0w0 (* Fixed on next pass *)] :: list
2434                else Word8Vector.fromList(opToInt JMP_32 :: int32Signed(LargeInt.fromInt diff)) :: list
2435            end
2436
2437        |   fixupAddress(ConditionalBranch{label=Label{labelNo, ...}, test, ...}, bytes, ic, list) =
2438            let
2439                val destination = Array.sub(labelArray, labelNo)
2440                val brLength = Word8Vector.length bytes
2441                (* The offset is relative to the end of the branch instruction. *)
2442                val diff = Word.toInt destination - Word.toInt ic - brLength
2443            in
2444                if brLength = 2
2445                then (* It's a short branch.  Take the original operand and set the relative offset. *)
2446                    Word8Vector.fromList [opToInt(CondJump test), byteSigned diff]  :: list
2447                else if brLength <> 6
2448                then raise InternalError "fixupAddress"
2449                else if diff <= 127 andalso diff >= ~(128+4)
2450                then Word8Vector.fromList[opToInt(CondJump test), 0w0 (* Fixed on next pass *)] :: list
2451                else Word8Vector.fromList(opCodeBytes(CondJump32 test, NONE) @ int32Signed(LargeInt.fromInt diff))  :: list
2452            end
2453
2454        |   fixupAddress(LoadLabelAddress{ label=Label{labelNo, ...}, ... }, brCode, ic, list) =
2455            let
2456                val destination = Array.sub(labelArray, labelNo)
2457            in
2458                if hostIsX64
2459                then (* This is a relative offset on the X86/64. *)
2460                    fixup32(destination, brCode, ic) :: list
2461                else (* On X86/32 the address is relative to the start of the code so we simply put in
2462                        the destination address. *)
2463                    Word8VectorSlice.concat[
2464                        Word8VectorSlice.slice(brCode, 0, SOME(Word8Vector.length brCode-4)),
2465                        Word8VectorSlice.full(Word8Vector.fromList(int32Signed(Word.toLargeInt destination)))] :: list
2466            end
2467
2468        |   fixupAddress(JumpTable{cases, jumpSize as ref JumpSize8}, brCode: Word8Vector.vector, ic, list) =
2469            let
2470                (* Each branch is a 32-bit jump padded up to 8 bytes. *)
2471                fun processCase(Label{labelNo, ...} :: cases, offset, ic) =
2472                    fixup32(Array.sub(labelArray, labelNo),
2473                        Word8VectorSlice.vector(Word8VectorSlice.slice(brCode, offset, SOME 5)), ic) ::
2474                    Word8VectorSlice.vector(Word8VectorSlice.slice(brCode, offset+5, SOME 3)) ::
2475                    processCase(cases, offset+8, ic+0w8)
2476                |   processCase _ = []
2477                (* Could we use short branches?  If all of the branches were short the
2478                   table would be smaller so the offsets we use would be less.
2479                   Ignore backwards branches - could only occur if we have linked labels
2480                   in a loop. *)
2481                val newStartOfCode = ic + Word.fromInt(List.length cases * 6)
2482                fun tryShort(Label{labelNo, ...} :: cases, ic) =
2483                    let
2484                        val destination = Array.sub(labelArray, labelNo)
2485                    in
2486                        if destination > ic + 0w2 andalso destination - ic - 0w2 < 0w127
2487                        then tryShort(cases, ic+0w2)
2488                        else false
2489                    end
2490                |   tryShort _ = true
2491
2492                val newCases =
2493                    if tryShort(cases, newStartOfCode)
2494                    then
2495                    (
2496                        jumpSize := JumpSize2;
2497                        (* Generate a short branch table. *)
2498                        List.map(fn _ => Word8Vector.fromList [opToInt JMP_8, 0w0 (* Fixed on next pass *)]) cases
2499                    )
2500                    else processCase(cases, 0, ic)
2501            in
2502                Word8Vector.concat newCases :: list
2503            end
2504
2505        |   fixupAddress(JumpTable{cases, jumpSize=ref JumpSize2}, _, ic, list) =
2506            let
2507                (* Each branch is a short jump. *)
2508                fun processCase(Label{labelNo, ...} :: cases, offset, ic) =
2509                    let
2510                        val destination = Array.sub(labelArray, labelNo)
2511                        val brLength = 2
2512                        val diff = Word.toInt destination - Word.toInt ic - brLength
2513                    in
2514                        Word8Vector.fromList[opToInt JMP_8, byteSigned diff] :: processCase(cases, offset+2, ic+0w2)
2515                    end
2516                |   processCase _ = []
2517            in
2518                Word8Vector.concat(processCase(cases, 0, ic)) :: list
2519            end
2520
2521            (* If we've shortened a jump table we have to change the indexing. *)
2522        |   fixupAddress(IndexedJumpCalc{ addrReg, indexReg, jumpSize=ref JumpSize2 }, _, _, list) =
2523                (* On x86/32 it might be shorter to use DEC addrReg; ADD addrReg, indexReg. *)
2524                Word8Vector.fromList(opAddress(if hostIsX64 then LEAL64 else LEAL32, ~1, addrReg, Index1 indexReg, addrReg)) :: list
2525
2526        |   fixupAddress(CallAddress(NonAddressConstArg _), brCode, ic, list) =
2527            let
2528                val brLen = Word8Vector.length brCode
2529            in
2530                (* Call to the start of the code.  Offset is -(bytes to start). *)
2531                Word8VectorSlice.concat[
2532                    Word8VectorSlice.slice(brCode, 0, SOME(brLen-4)), (* The original opcode. *)
2533                    Word8VectorSlice.full(Word8Vector.fromList(int32Signed(LargeInt.fromInt(~(Word.toInt ic+brLen)))))
2534                ] :: list
2535            end
2536
2537        |   fixupAddress(JumpAddress(NonAddressConstArg _), brCode, ic, list) =
2538            let
2539                val brLen = Word8Vector.length brCode
2540            in
2541                (* Call to the start of the code.  Offset is -(bytes to start). *)
2542                Word8VectorSlice.concat[
2543                    Word8VectorSlice.slice(brCode, 0, SOME(brLen-4)), (* The original opcode. *)
2544                    Word8VectorSlice.full(Word8Vector.fromList(int32Signed(LargeInt.fromInt(~(Word.toInt ic+brLen)))))
2545                ] :: list
2546            end
2547
2548        |   fixupAddress(_, bytes, _, list) = bytes :: list
2549        
2550        fun reprocess(bytesList, lastCodeSize) =
2551        let
2552            val fixedList = List.rev(foldCode fixupAddress [] (ops, bytesList))
2553            val newCodeSize = setLabelAddresses(ops, fixedList, 0w0)
2554        in
2555            if newCodeSize = lastCodeSize
2556            then (fixedList, lastCodeSize)
2557            else if newCodeSize > lastCodeSize
2558            then raise InternalError "reprocess - size increased"
2559            else reprocess(fixedList, newCodeSize)
2560        end
2561    in
2562        reprocess(bytesList, setLabelAddresses(ops, bytesList, 0w0))
2563    end
2564 
2565    (* The handling of constants generally differs between 32- and 64-bits.  In 32-bits we put all constants
2566       inline and the GC processes the code to find the addresss.  For real values the "constant" is actually
2567       the address of the boxed real value.
2568       In 64-bit mode inline constants were used with the MOV instruction but this has now been removed.
2569       All constants are stored in one of two areas at the end of the
2570       code segment.  Non-addresses, including the actual values of reals, are stored in the non-address area
2571       and addresses go in the address area.  Only the latter is scanned by the GC.
2572       The address area is also used in 32-bit mode but only has the address of the function name and the
2573       address of the profile ref in it. *)
2574    datatype inline32constants =
2575        SelfAddress                             (* The address of the start of the code - inline absolute address 32-bit only *)
2576    |   InlineAbsoluteAddress of machineWord    (* An address in the code: 32-bit only *)
2577    |   InlineRelativeAddress of machineWord    (* A relative address: 32-bit only. *)
2578
2579    local
2580        (* Turn an integer constant into an 8-byte vector. *)
2581        fun intConst ival = LargeWord.fromLargeInt ival
2582
2583        (* Copy a real constant from memory into an 8-byte vector. *)
2584        fun realConst c =
2585        let
2586            val cAsAddr = toAddress c
2587            (* This may be a boxed real or, in 32-in-64 mode, a boxed float. *)
2588            val cLength = length cAsAddr * wordSize
2589            val _ = ((cLength = 0w8 orelse cLength = 0w4) andalso flags cAsAddr = F_bytes) orelse
2590                        raise InternalError "realConst: Not a real number"
2591            fun getBytes(i, a) =
2592                if i = 0w0 then a
2593                else getBytes(i-0w1, a*0w256 + Word8.toLargeWord(loadByte(cAsAddr, i-0w1)))
2594        in
2595            getBytes(cLength, 0w0)
2596        end
2597
2598        fun getConstant(Move{ source=NonAddressConstArg source, moveSize=Move32, ...}, bytes, ic, (inl, addr, na)) =
2599            if targetArch <> Native32Bit
2600            then
2601            (
2602                if source >= ~0x80000000 andalso source < 0x100000000
2603                then (* Signed or unsigned 32-bits. *) (inl, addr, na)
2604                else (* Too big for 32-bits. *)
2605                    (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na)
2606            )
2607            else (inl, addr, na) (* 32-bit mode.  The constant will always be inline even if we've had to use LEA r,c *)
2608
2609        |   getConstant(Move{ source=NonAddressConstArg source, moveSize=Move64, ...}, bytes, ic, (inl, addr, na)) =
2610            if targetArch <> Native32Bit
2611            then
2612            (
2613                if source >= ~0x80000000 andalso source < 0x100000000
2614                then (* Signed or unsigned 32-bits. *) (inl, addr, na)
2615                else (* Too big for 32-bits. *)
2616                    (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na)
2617            )
2618            else (inl, addr, na) (* 32-bit mode.  The constant will always be inline even if we've had to use XOR r,r; ADD r,c *)
2619
2620        |   getConstant(Move{ source=AddressConstArg source, ... }, bytes, ic, (inl, addr, na)) =
2621            if targetArch <> Native32Bit
2622            then (* Address constants go in the constant area. *)
2623                (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, source) :: addr, na)
2624            else ((ic + Word.fromInt(Word8Vector.length bytes) - wordSize, InlineAbsoluteAddress source) :: inl, addr, na)
2625
2626        |   getConstant(LoadAbsolute{value, ...}, bytes, ic, (inl, addr, na)) =
2627            if targetArch = Native64Bit
2628            then (* Address constants go in the constant area. *)
2629                (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, value) :: addr, na)
2630                (* This is the only case of an inline constant in 32-in-64 *)
2631            else ((ic + Word.fromInt(Word8Vector.length bytes) - nativeWordSize, InlineAbsoluteAddress value) :: inl, addr, na)
2632
2633        |   getConstant(ArithToGenReg{ source=NonAddressConstArg source, ... }, bytes, ic, (inl, addr, na)) =
2634                if is32bit source
2635                then (inl, addr, na)
2636                else (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na)
2637
2638        |   getConstant(ArithToGenReg{ source=AddressConstArg source, ... }, bytes, ic, (inl, addr, na)) =
2639                if hostIsX64
2640                then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, source) :: addr, na)
2641                else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress source) :: inl, addr, na)
2642
2643        |   getConstant(ArithMemLongConst{ source, ... }, bytes, ic, (inl, addr, na)) = (* 32-bit only. *)
2644                ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress source) :: inl, addr, na)
2645
2646        |   getConstant(PushToStack(NonAddressConstArg constnt), bytes, ic, (inl, addr, na)) =
2647                if is32bit constnt then (inl, addr, na)
2648                else (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst constnt) ::  na)
2649
2650        |   getConstant(PushToStack(AddressConstArg constnt), bytes, ic, (inl, addr, na)) =
2651                if targetArch = Native64Bit
2652                then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, constnt) :: addr, na)
2653                else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress constnt) :: inl, addr, na)
2654
2655        |   getConstant(CallAddress(AddressConstArg w), bytes, ic, (inl, addr, na)) =
2656                if targetArch = Native64Bit
2657                then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, w) :: addr, na)
2658                else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineRelativeAddress w) :: inl, addr, na)
2659
2660        |   getConstant(JumpAddress(AddressConstArg w), bytes, ic, (inl, addr, na)) =
2661                if targetArch = Native64Bit
2662                then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, w) :: addr, na)
2663                else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineRelativeAddress w) :: inl, addr, na)
2664
2665        |   getConstant(LoadLabelAddress _, _, ic, (inl, addr, na)) =
2666                (* We need the address of the code itself but it's in the first of a pair of instructions. *)
2667                if hostIsX64 then (inl, addr, na) else ((ic + 0w1, SelfAddress) :: inl, addr, na)
2668
2669        |   getConstant(FPLoadFromConst{constant, ...}, bytes, ic, (inl, addr, na)) =
2670               if hostIsX64
2671               then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, realConst constant) :: na)
2672               else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress constant) :: inl, addr, na)
2673
2674        |   getConstant(FPArithConst{ source, ... }, bytes, ic, (inl, addr, na)) =
2675                if hostIsX64
2676                then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, realConst source) :: na)
2677                else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress source) :: inl, addr, na)
2678
2679        |   getConstant(XMMArith { source=AddressConstArg constVal, ... }, bytes, ic, (inl, addr, na)) =
2680                (* Real.real constant or, with 32-bit words, a Real32.real constant. *)
2681                if hostIsX64
2682                then (inl, addr,  (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, realConst constVal) :: na)
2683                else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress constVal) :: inl, addr, na)
2684
2685        |   getConstant(XMMArith { source=NonAddressConstArg constVal, ... }, bytes, ic, (inl, addr, na)) =
2686                (* Real32.real constant in native 64-bit. *)
2687                (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst constVal) :: na)
2688
2689        |   getConstant(MultiplyR{ source=NonAddressConstArg source, ... }, bytes, ic, (inl, addr, na)) =
2690                if is32bit source
2691                then (inl, addr, na)
2692                else (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na)
2693
2694        |   getConstant(CondMove{ source=NonAddressConstArg source, ... }, bytes, ic, (inl, addr, na)) =
2695            if targetArch <> Native32Bit
2696            then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na)
2697            else (inl, addr, na) (* 32-bit mode.  The constant will always be inline. *)
2698
2699        |   getConstant(CondMove{ source=AddressConstArg source, ... }, bytes, ic, (inl, addr, na)) =
2700            if targetArch <> Native32Bit
2701            then (* Address constants go in the constant area. *)
2702                (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, source) :: addr, na)
2703            else ((ic + Word.fromInt(Word8Vector.length bytes) - wordSize, InlineAbsoluteAddress source) :: inl, addr, na)
2704
2705        |   getConstant(_, _, _, l) = l        
2706    in
2707        val getConstants = foldCode getConstant ([], [], [])
2708    end
2709
2710    (* It is convenient to have AllocStore and AllocStoreVariable as primitives at the higher
2711       level but at this point it's better to expand them into their basic instructions. *)
2712    fun expandComplexOperations(instrs, oldLabelCount) =
2713    let
2714        val labelCount = ref oldLabelCount
2715        fun mkLabel() = Label{labelNo= !labelCount} before labelCount := !labelCount + 1
2716
2717         (* On X86/64 the local pointer is in r15.  On X86/32 it's in memRegs. *)
2718        val localPointer =
2719            if hostIsX64 then RegisterArg r15 else MemoryArg{base=ebp, offset=memRegLocalMPointer, index=NoIndex}
2720        
2721        val nativeWordOpSize = if hostIsX64 then OpSize64 else OpSize32
2722
2723        fun allocStoreCommonCode (resultReg, isVarAlloc, regSaveSet: genReg list) =
2724        let
2725            val compare =
2726                ArithToGenReg{opc=CMP, output=resultReg,
2727                    source=MemoryArg{base=ebp, offset=memRegLocalMbottom, index=NoIndex}, opSize=nativeWordOpSize}
2728            (* Normally we won't have run out of store so we want the default
2729               branch prediction to skip the test here. However doing that
2730               involves adding an extra branch which lengthens the code so
2731               it's probably not worth while. *)
2732            (* Just checking against the lower limit can fail
2733               in the situation where the heap pointer is at the low end of
2734               the address range and the store required is so large that the
2735               subtraction results in a negative number.  In that case it
2736               will be > (unsigned) lower_limit so in addition we have
2737               to check that the result is < (unsigned) heap_pointer.
2738               This actually happened on Windows with X86-64.
2739               In theory this can happen with fixed-size allocations as
2740               well as variable allocations but in practice fixed-size
2741               allocations are going to be small enough that it's not a
2742               problem.  *)
2743            val destLabel = mkLabel()
2744            val branches =
2745                if isVarAlloc
2746                then
2747                let
2748                    val extraLabel = mkLabel()
2749                in
2750                    [ConditionalBranch{test=JB, label=extraLabel},
2751                     ArithToGenReg{opc=CMP, output=resultReg, source=localPointer, opSize=nativeWordOpSize},
2752                     ConditionalBranch{test=JB, label=destLabel},
2753                     JumpLabel extraLabel]
2754                end
2755                else [ConditionalBranch{test=JNB, label=destLabel}]
2756            val callRts = CallRTS{rtsEntry=HeapOverflowCall, saveRegs=regSaveSet}
2757            val fixup = JumpLabel destLabel
2758            (* Update the heap pointer now we have the store.  This is also
2759               used by the RTS in the event of a trap to work out how much
2760               store was being allocated. *)
2761            val update =
2762                if hostIsX64 then Move{source=RegisterArg resultReg, destination=RegisterArg r15, moveSize=Move64}
2763                else Move{source=RegisterArg resultReg,
2764                        destination=MemoryArg{base=ebp, offset=memRegLocalMPointer, index=NoIndex}, moveSize=Move32}
2765        in
2766            compare :: branches @ [callRts, fixup, update]
2767        end
2768        
2769        fun doExpansion([], code, _) = code
2770
2771        |   doExpansion(AllocStore {size, output, saveRegs} :: instrs, code, inAllocation) =
2772            let
2773                val _ = inAllocation andalso raise InternalError "doExpansion: Allocation started but not complete"
2774                val () = if List.exists (fn r => r = output) saveRegs then raise InternalError "AllocStore: in set" else ()
2775
2776                val startCode =
2777                    case targetArch of
2778                        Native64Bit =>
2779                        let
2780                            val bytes = (size + 1) * Word.toInt wordSize
2781                        in
2782                            [LoadAddress{output=output, offset = ~ bytes, base=SOME r15, index=NoIndex, opSize=OpSize64}]
2783                             (* TODO: What if it's too big to fit? *)
2784                        end
2785                    |   Native32Bit =>
2786                        let
2787                            val bytes = (size + 1) * Word.toInt wordSize
2788                        in
2789                            [Move{source=MemoryArg{base=ebp, offset=memRegLocalMPointer, index=NoIndex},
2790                                destination=RegisterArg output, moveSize=Move32},
2791                             LoadAddress{output=output, offset = ~ bytes, base=SOME output, index=NoIndex, opSize=OpSize32}]
2792                        end
2793                    |   ObjectId32Bit =>
2794                        let
2795                            (* We must allocate an even number of words. *)
2796                            val heapWords = if Int.rem(size, 2) = 1 then size+1 else size+2
2797                            val bytes = heapWords * Word.toInt wordSize
2798                        in
2799                            [LoadAddress{output=output, offset = ~ bytes, base=SOME r15, index=NoIndex, opSize=OpSize64}]
2800                        end
2801                            
2802                val resultCode = startCode @ allocStoreCommonCode(output, false, saveRegs)
2803            in
2804                doExpansion(instrs, (List.rev resultCode) @ code, true)
2805            end
2806
2807        |   doExpansion(AllocStoreVariable {size, output, saveRegs} :: instrs, code, inAllocation) =
2808            let
2809                (* Allocates memory.  The "size" register contains the number of words as a tagged int. *)
2810                val _ = inAllocation andalso raise InternalError "doExpansion: Allocation started but not complete"
2811                val () = if List.exists (fn r => r = output) saveRegs then raise InternalError "AllocStore: in set" else ()
2812                (* Negate the length and add it to the current heap pointer. *)
2813                (* Compute the number of bytes into dReg. The length in sReg is the number
2814                   of words as a tagged value so we need to multiply it, add wordSize to
2815                   include one word for the header then subtract the, multiplied, tag.
2816                   We use LEA here but want to avoid having an empty base register. *)
2817                val _ = size = output andalso raise InternalError "AllocStoreVariable : same register for size and output"
2818                val startCode =
2819                    if wordSize = 0w8 (* 8-byte words *)
2820                    then
2821                    [
2822                        ArithToGenReg{opc=XOR, output=output, source=RegisterArg output, opSize=OpSize32 (* Rest is zeroed *)},
2823                        ArithToGenReg{opc=SUB, output=output, source=RegisterArg size, opSize=OpSize64},
2824                        LoadAddress{output=output, base=SOME r15, offset= ~(Word.toInt wordSize-4), index=Index4 output, opSize=OpSize64 }
2825                    ]
2826                    else (* 4 byte words *)
2827                    [
2828                        LoadAddress{output=output, base=SOME size, offset=Word.toInt wordSize-2,
2829                            index=Index1 size, opSize=nativeWordOpSize },
2830                        Negative{output=output, opSize=nativeWordOpSize},
2831                        ArithToGenReg{opc=ADD, output=output, source=localPointer, opSize=nativeWordOpSize}
2832                    ]
2833                (* If this is 32-in-64 we need to round down to the next 8-byte boundary. *)
2834                val roundCode =
2835                    if targetArch = ObjectId32Bit
2836                    then [ArithToGenReg{opc=AND, output=output, source=NonAddressConstArg ~8, opSize=OpSize64 }]
2837                    else []
2838                val resultCode = startCode @ roundCode @ allocStoreCommonCode(output, true, saveRegs)
2839            in
2840                doExpansion(instrs, (List.rev resultCode) @ code, true)
2841            end
2842
2843        |   doExpansion(StoreInitialised :: instrs, code, _) = doExpansion(instrs, code, false)
2844
2845        |   doExpansion(instr :: instrs, code, inAlloc) = doExpansion(instrs, instr::code, inAlloc)
2846        
2847        val expanded = List.rev(doExpansion(instrs, [], false))
2848    in
2849        (expanded, !labelCount)
2850    end
2851
2852
2853    fun printCode (Code{procName, printStream, ...}, seg) =
2854    let
2855        val print = printStream
2856        val ptr = ref 0w0;
2857        (* prints a string representation of a number *)
2858        fun printValue v =
2859            if v < 0 then (print "-"; print(LargeInt.toString  (~ v))) else print(LargeInt.toString v)
2860
2861        infix 3 +:= ;
2862        fun (x +:= y) = (x := !x + (y:word));
2863
2864        fun get16s (a, seg) : int =
2865        let
2866            val b0  = Word8.toInt (codeVecGet (seg, a));
2867            val b1  = Word8.toInt (codeVecGet (seg, a + 0w1));
2868            val b1' = if b1 >= 0x80 then b1 - 0x100 else b1;
2869        in
2870            (b1' * 0x100) + b0
2871        end
2872        
2873        fun get16u(a, seg) : int =
2874            Word8.toInt (codeVecGet (seg, a + 0w1)) * 0x100 + Word8.toInt (codeVecGet (seg, a))
2875        
2876        (* Get 1 unsigned byte from the given offset in the segment. *)
2877        fun get8u (a, seg) : Word8.word = codeVecGet (seg, a);
2878
2879        (* Get 1 signed byte from the given offset in the segment. *)
2880        fun get8s (a, seg) : int = Word8.toIntX (codeVecGet (seg, a));
2881     
2882        (* Get 1 signed 32 bit word from the given offset in the segment. *)
2883        fun get32s (a, seg) : LargeInt.int =
2884        let
2885            val b0  = Word8.toLargeInt (codeVecGet (seg, a));
2886            val b1  = Word8.toLargeInt (codeVecGet (seg, a + 0w1));
2887            val b2  = Word8.toLargeInt (codeVecGet (seg, a + 0w2));
2888            val b3  = Word8.toLargeInt (codeVecGet (seg, a + 0w3));
2889            val b3' = if b3 >= 0x80 then b3 - 0x100 else b3;
2890            val topHw    = (b3' * 0x100) + b2;
2891            val bottomHw = (b1 * 0x100) + b0;
2892        in
2893            (topHw * exp2_16) + bottomHw
2894        end
2895 
2896        fun get64s (a, seg) : LargeInt.int =
2897        let
2898            val b0  = Word8.toLargeInt (codeVecGet (seg, a));
2899            val b1  = Word8.toLargeInt (codeVecGet (seg, a + 0w1));
2900            val b2  = Word8.toLargeInt (codeVecGet (seg, a + 0w2));
2901            val b3  = Word8.toLargeInt (codeVecGet (seg, a + 0w3));
2902            val b4  = Word8.toLargeInt (codeVecGet (seg, a + 0w4));
2903            val b5  = Word8.toLargeInt (codeVecGet (seg, a + 0w5));
2904            val b6  = Word8.toLargeInt (codeVecGet (seg, a + 0w6));
2905            val b7  = Word8.toLargeInt (codeVecGet (seg, a + 0w7));
2906            val b7' = if b7 >= 0x80 then b7 - 0x100 else b7;
2907        in
2908            ((((((((b7' * 0x100 + b6) * 0x100 + b5) * 0x100 + b4) * 0x100 + b3)
2909                 * 0x100 + b2) * 0x100) + b1) * 0x100) + b0
2910        end
2911 
2912        fun print32 () = printValue (get32s (!ptr, seg)) before (ptr +:= 0w4)
2913        and print64 () = printValue (get64s (!ptr, seg)) before (ptr +:= 0w8)
2914        and print16 () = printValue (LargeInt.fromInt(get16s (!ptr, seg)) before (ptr +:= 0w2))
2915        and print8 () = printValue (LargeInt.fromInt(get8s (!ptr, seg)) before (ptr +:= 0w1))
2916 
2917        fun printJmp () =
2918        let
2919            val valu = get8s (!ptr, seg)  before ptr +:= 0w1
2920        in
2921            print (Word.fmt StringCvt.HEX (Word.fromInt valu + !ptr))
2922        end
2923 
2924        (* Print an effective address.  The register field may designate a general register
2925           or an xmm register depending on the instruction. *)
2926        fun printEAGeneral printRegister (rex, sz) =
2927        let
2928            val modrm = codeVecGet (seg, !ptr)
2929            val () = ptr +:= 0w1
2930            (* Decode the Rex prefix if present. *)
2931            val rexX = (rex andb8 0wx2) <> 0w0
2932            val rexB = (rex andb8 0wx1) <> 0w0
2933            val prefix =
2934                case sz of
2935                    SZByte  => "byte ptr "
2936                |   SZWord  => "word ptr "
2937                |   SZDWord => "dword ptr "
2938                |   SZQWord => "qword ptr "
2939        in
2940            case (modrm >>- 0w6, modrm andb8 0w7, hostIsX64) of
2941                (0w3, rm, _) => printRegister(rm, rexB, sz)
2942      
2943            |   (md, 0w4, _) =>
2944                let (* s-i-b present. *)
2945                    val sib = codeVecGet (seg, !ptr)
2946                    val () = ptr +:= 0w1
2947                    val ss    = sib >>- 0w6
2948                    val index = (sib  >>- 0w3) andb8 0w7
2949                    val base   = sib andb8 0w7
2950                in
2951                    print prefix;
2952
2953                    case (md, base, hostIsX64) of
2954                        (0w1, _, _) => print8 ()
2955                    |   (0w2, _, _) => print32 ()
2956                    |   (0w0, 0w5, _) => print32 () (* Absolute in 32-bit mode.  PC-relative in 64-bit ?? *)
2957                    |   _ => ();
2958          
2959                    print "[";
2960        
2961                    if md <> 0w0 orelse base <> 0w5
2962                    then
2963                    (
2964                        print (genRegRepr (mkReg (base, rexB), sz32_64));
2965                        if index = 0w4 then () else print ","
2966                    )
2967                    else ();
2968        
2969                    if index = 0w4 andalso not rexX (* No index. *)
2970                    then ()
2971                    else print (genRegRepr (mkReg(index, rexX), sz32_64) ^ 
2972                            (if ss = 0w0 then "*1"
2973                            else if ss = 0w1 then "*2"
2974                            else if ss = 0w2 then "*4"
2975                            else "*8"));
2976        
2977                    print "]"
2978                end
2979      
2980            |   (0w0, 0w5, false) => (* Absolute address.*) (print prefix; print32 ())
2981
2982            |   (0w0, 0w5, _) => (* PC-relative in 64-bit  *)
2983                        (print prefix; print ".+"; print32 ())
2984            
2985            |   (md, rm, _) => (* register plus offset. *)
2986                (
2987                    print prefix;
2988                    if md = 0w1 then print8 ()
2989                    else if md = 0w2 then print32 ()
2990                    else ();
2991         
2992                    print ("[" ^ genRegRepr (mkReg(rm, rexB), sz32_64) ^ "]")
2993                )
2994        end
2995        
2996        (* For most instructions we want to print a general register. *)
2997        val printEA =
2998            printEAGeneral (fn (rm, rexB, sz) => print (genRegRepr (mkReg(rm, rexB), sz)))
2999        and printEAxmm =
3000            printEAGeneral (fn (rm, _, _) => print (xmmRegRepr(SSE2Reg rm)))
3001 
3002        fun printArith opc =
3003            print
3004               (case opc of
3005                  0 => "add "
3006                | 1 => "or  "
3007                | 2 => "adc "
3008                | 3 => "sbb "
3009                | 4 => "and "
3010                | 5 => "sub "
3011                | 6 => "xor "
3012                | _ => "cmp "
3013               )
3014
3015        fun printGvEv (opByte, rex, rexR, sz) =
3016        let
3017            (* Register is in next byte. *)
3018            val nb = codeVecGet (seg, !ptr)
3019            val reg = (nb >>- 0w3) andb8 0w7
3020        in
3021            printArith(Word8.toInt((opByte div 0w8) mod 0w8));
3022            print "\t";
3023            print (genRegRepr (mkReg(reg, rexR), sz));
3024            print ",";
3025            printEA(rex, sz)
3026        end
3027        
3028        fun printMovCToR (opByte, sz, rexB) =
3029        (
3030            print "mov \t";
3031            print(genRegRepr (mkReg (opByte mod 0w8, rexB), sz));
3032            print ",";
3033            case sz of SZDWord => print32 () | SZQWord => print64 () | _ => print "???"
3034        )
3035        
3036        fun printShift (opByte, rex, sz) =
3037        let
3038            (* Opcode is determined by next byte. *)
3039            val nb = Word8.toInt (codeVecGet (seg, !ptr))
3040            val opc = (nb div 8) mod 8
3041        in
3042            print
3043               (case opc of
3044                  4 => "shl "
3045                | 5 => "shr "
3046                | 7 => "sar "
3047                | _ => "???"
3048               );
3049            print "\t";
3050            printEA(rex, sz);
3051            print ",";
3052            if opByte = opToInt Group2_1_A32 then print "1"
3053            else if opByte = opToInt Group2_CL_A32 then print "cl"
3054            else print8 ()
3055        end
3056        
3057        fun printFloat (opByte, rex) =
3058        let
3059            (* Opcode is in next byte. *)
3060            val opByte2  = codeVecGet (seg, !ptr)
3061            val nnn = (opByte2 >>- 0w3) andb8 0w7
3062            val escNo = opByte andb8 0wx7
3063        in
3064            if (opByte2 andb8 0wxC0) = 0wxC0
3065            then (* mod = 11 *)
3066            (
3067                case (escNo, nnn, opByte2 andb8 0wx7 (* modrm *)) of
3068                    (0w1, 0w4, 0w0) => print "fchs"
3069                |   (0w1, 0w4, 0w1) => print "fabs"
3070                |   (0w1, 0w5, 0w6) => print "fldz"
3071                |   (0w1, 0w5, 0w1) => print "flf1"
3072                |   (0w7, 0w4, 0w0) => print "fnstsw\tax"
3073                |   (0w1, 0w5, 0w0) => print "fld1"
3074                |   (0w1, 0w6, 0w3) => print "fpatan"
3075                |   (0w1, 0w7, 0w2) => print "fsqrt"
3076                |   (0w1, 0w7, 0w6) => print "fsin"
3077                |   (0w1, 0w7, 0w7) => print "fcos"
3078                |   (0w1, 0w6, 0w7) => print "fincstp"
3079                |   (0w1, 0w6, 0w6) => print "fdecstp"
3080                |   (0w3, 0w4, 0w2) => print "fnclex"
3081                |   (0w5, 0w2, rno) => print ("fst \tst(" ^ Word8.toString rno ^ ")")
3082                |   (0w5, 0w3, rno) => print ("fstp\tst(" ^ Word8.toString rno ^ ")")
3083                |   (0w1, 0w0, rno) => print ("fld \tst(" ^ Word8.toString rno ^ ")")
3084                |   (0w1, 0w1, rno) => print ("fxch\tst(" ^ Word8.toString rno ^ ")")
3085                |   (0w0, 0w3, rno) => print ("fcomp\tst(" ^ Word8.toString rno ^ ")")
3086                |   (0w0, 0w0, rno) => print ("fadd\tst,st(" ^ Word8.toString rno ^ ")")
3087                |   (0w0, 0w1, rno) => print ("fmul\tst,st(" ^ Word8.toString rno ^ ")")
3088                |   (0w0, 0w4, rno) => print ("fsub\tst,st(" ^ Word8.toString rno ^ ")")
3089                |   (0w0, 0w5, rno) => print ("fsubr\tst,st(" ^ Word8.toString rno ^ ")")
3090                |   (0w0, 0w6, rno) => print ("fdiv\tst,st(" ^ Word8.toString rno ^ ")")
3091                |   (0w0, 0w7, rno) => print ("fdivr\tst,st(" ^ Word8.toString rno ^ ")")
3092                |   (0w5, 0w0, rno) => print ("ffree\tst(" ^ Word8.toString rno ^ ")")
3093                |   _ => (printValue(Word8.toLargeInt opByte); printValue(Word8.toLargeInt opByte2));
3094                ptr +:= 0w1
3095            )
3096            else (* mod = 00, 01, 10 *)
3097            (
3098                case (escNo, nnn) of
3099                    (0w0, 0w0) => (print "fadd\t"; printEA(rex, SZDWord)) (* Single precision. *)
3100                |   (0w0, 0w1) => (print "fmul\t"; printEA(rex, SZDWord))
3101                |   (0w0, 0w3) => (print "fcomp\t"; printEA(rex, SZDWord))
3102                |   (0w0, 0w4) => (print "fsub\t"; printEA(rex, SZDWord))
3103                |   (0w0, 0w5) => (print "fsubr\t"; printEA(rex, SZDWord))
3104                |   (0w0, 0w6) => (print "fdiv\t"; printEA(rex, SZDWord))
3105                |   (0w0, 0w7) => (print "fdivr\t"; printEA(rex, SZDWord))
3106                |   (0w1, 0w0) => (print "fld \t"; printEA(rex, SZDWord))
3107                |   (0w1, 0w2) => (print "fst\t"; printEA(rex, SZDWord))
3108                |   (0w1, 0w3) => (print "fstp\t"; printEA(rex, SZDWord))
3109                |   (0w1, 0w5) => (print "fldcw\t"; printEA(rex, SZWord)) (* Control word is 16 bits *)
3110                |   (0w1, 0w7) => (print "fstcw\t"; printEA(rex, SZWord)) (* Control word is 16 bits *)
3111                |   (0w3, 0w0) => (print "fild\t"; printEA(rex, SZDWord)) (* 32-bit int. *)
3112                |   (0w7, 0w5) => (print "fild\t"; printEA(rex, SZQWord)) (* 64-bit int. *)
3113                |   (0w3, 0w3) => (print "fistp\t"; printEA(rex, SZDWord)) (* 32-bit int. *)
3114                |   (0w7, 0w7) => (print "fistp\t"; printEA(rex, SZQWord)) (* 64-bit int. *)
3115                |   (0w4, 0w0) => (print "fadd\t"; printEA(rex, SZQWord)) (* Double precision. *)
3116                |   (0w4, 0w1) => (print "fmul\t"; printEA(rex, SZQWord))
3117                |   (0w4, 0w3) => (print "fcomp\t"; printEA(rex, SZQWord))
3118                |   (0w4, 0w4) => (print "fsub\t"; printEA(rex, SZQWord))
3119                |   (0w4, 0w5) => (print "fsubr\t"; printEA(rex, SZQWord))
3120                |   (0w4, 0w6) => (print "fdiv\t"; printEA(rex, SZQWord))
3121                |   (0w4, 0w7) => (print "fdivr\t"; printEA(rex, SZQWord))
3122                |   (0w5, 0w0) => (print "fld \t"; printEA(rex, SZQWord))
3123                |   (0w5, 0w2) => (print "fst\t"; printEA(rex, SZQWord))
3124                |   (0w5, 0w3) => (print "fstp\t"; printEA(rex, SZQWord))
3125                |   _ => (printValue(Word8.toLargeInt opByte); printValue(Word8.toLargeInt opByte2))
3126            )
3127        end
3128        
3129        fun printJmp32 oper =
3130        let
3131            val valu = get32s (!ptr, seg) before (ptr +:= 0w4)
3132        in
3133            print oper; print "\t";
3134            print (Word.fmt StringCvt.HEX (!ptr + Word.fromLargeInt valu))
3135        end
3136
3137        fun printMask mask =
3138        let
3139            val wordMask = Word.fromInt mask
3140            fun printAReg n =
3141                if n = regs then ()
3142                else
3143                (
3144                    if (wordMask andb (0w1 << Word.fromInt n)) <> 0w0
3145                    then (print(regRepr(regN n)); print " ")
3146                    else ();
3147                    printAReg(n+1)
3148                )
3149        in
3150            printAReg 0
3151        end
3152
3153    in
3154
3155        if procName = "" (* No name *) then print "?" else print procName;
3156        print ":\n";
3157 
3158        while get8u (!ptr, seg) <> 0wxf4 (* HLT. *) do
3159        let
3160            val () = print (Word.fmt StringCvt.HEX (!ptr)) (* The address in hex. *)
3161            val () = print "\t"
3162
3163            (* See if we have a lock prefix. *)
3164            val () =
3165                if get8u (!ptr, seg) = 0wxF0
3166                then (print "lock "; ptr := !ptr + 0w1)
3167                else ()
3168                
3169            val legacyPrefix =
3170                let
3171                    val p = get8u (!ptr, seg)
3172                in
3173                    if p = 0wxF2 orelse p = 0wxF3 orelse p = 0wx66
3174                    then (ptr := !ptr + 0w1; p)
3175                    else 0wx0
3176                end
3177
3178            (* See if we have a REX byte. *)
3179            val rex =
3180            let
3181               val b = get8u (!ptr, seg);
3182            in
3183               if b >= 0wx40 andalso b <= 0wx4f
3184               then (ptr := !ptr + 0w1; b)
3185               else 0w0
3186            end
3187        
3188            val rexW = (rex andb8 0wx8) <> 0w0
3189            val rexR = (rex andb8 0wx4) <> 0w0
3190            val rexB = (rex andb8 0wx1) <> 0w0
3191
3192            val opByte = get8u (!ptr, seg) before ptr +:= 0w1
3193            
3194            val sizeFromRexW = if rexW then SZQWord else SZDWord
3195        in
3196            case opByte of
3197                0wx03 => printGvEv (opByte, rex, rexR, sizeFromRexW)
3198
3199            |   0wx0b => printGvEv (opByte, rex, rexR, sizeFromRexW)
3200
3201            |   0wx0f => (* ESCAPE *)
3202                let
3203                    (* Opcode is in next byte. *)
3204                    val opByte2  = codeVecGet (seg, !ptr)
3205                    val () = (ptr +:= 0w1)
3206                    
3207                    fun printcmov movop =
3208                    let
3209                        val nb = codeVecGet (seg, !ptr)
3210                        val reg = (nb >>- 0w3) andb8 0w7
3211                    in
3212                        print movop;
3213                        print "\t";
3214                        print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3215                        print ",";
3216                        printEA(rex, sizeFromRexW)
3217                    end
3218                in
3219                    case legacyPrefix of
3220                        0w0 =>
3221                        (
3222                            case opByte2 of
3223                                0wx2e =>
3224                                let (* ucomiss doesn't have a prefix. *)
3225                                    val nb = codeVecGet (seg, !ptr)
3226                                    val reg = SSE2Reg((nb >>- 0w3) andb8 0w7)
3227                                in
3228                                    print "ucomiss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord)
3229                                end
3230
3231                            |   0wx40 => printcmov "cmovo"
3232                            |   0wx41 => printcmov "cmovno"
3233                            |   0wx42 => printcmov "cmovb"
3234                            |   0wx43 => printcmov "cmovnb"
3235                            |   0wx44 => printcmov "cmove"
3236                            |   0wx45 => printcmov "cmovne"
3237                            |   0wx46 => printcmov "cmovna"
3238                            |   0wx47 => printcmov "cmova"
3239                            |   0wx48 => printcmov "cmovs"
3240                            |   0wx49 => printcmov "cmovns"
3241                            |   0wx4a => printcmov "cmovp"
3242                            |   0wx4b => printcmov "cmovnp" 
3243                            |   0wx4c => printcmov "cmovl"
3244                            |   0wx4d => printcmov "cmovge"
3245                            |   0wx4e => printcmov "cmovle"
3246                            |   0wx4f => printcmov "cmovg"
3247
3248                            |   0wxC1 =>
3249                                let
3250                                    val nb = codeVecGet (seg, !ptr);
3251                                    val reg = (nb >>- 0w3) andb8 0w7
3252                                in
3253                                    (* The address argument comes first in the assembly code. *)
3254                                    print "xadd\t";
3255                                    printEA (rex, sizeFromRexW);
3256                                    print ",";
3257                                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW))
3258                                end
3259
3260                            |   0wxB6 =>
3261                                let
3262                                    val nb = codeVecGet (seg, !ptr);
3263                                    val reg = (nb >>- 0w3) andb8 0w7
3264                                in
3265                                    print "movzx\t";
3266                                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3267                                    print ",";
3268                                    printEA (rex, SZByte)
3269                                end
3270
3271                            |   0wxB7 =>
3272                                let
3273                                    val nb = codeVecGet (seg, !ptr);
3274                                    val reg = (nb >>- 0w3) andb8 0w7
3275                                in
3276                                    print "movzx\t";
3277                                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3278                                    print ",";
3279                                    printEA (rex, SZWord)
3280                                end
3281
3282                            |   0wxBE =>
3283                                let
3284                                    val nb = codeVecGet (seg, !ptr);
3285                                    val reg = (nb >>- 0w3) andb8 0w7
3286                                in
3287                                    print "movsx\t";
3288                                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3289                                    print ",";
3290                                    printEA (rex, SZByte)
3291                                end
3292
3293                            |   0wxBF =>
3294                                let
3295                                    val nb = codeVecGet (seg, !ptr);
3296                                    val reg = (nb >>- 0w3) andb8 0w7
3297                                in
3298                                    print "movsx\t";
3299                                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3300                                    print ",";
3301                                    printEA (rex, SZWord)
3302                                end
3303
3304                            |   0wxAE =>
3305                                let
3306                                    (* Opcode is determined by the next byte. *)
3307                                    val opByte2 = codeVecGet (seg, !ptr);
3308                                    val nnn = (opByte2 >>- 0w3) andb8 0w7
3309                                in
3310                                    case nnn of
3311                                        0wx2 => (print "ldmxcsr\t"; printEA(rex, SZDWord))
3312                                    |   0wx3 => (print "stmxcsr\t"; printEA(rex, SZDWord))
3313                                    |   _ => (printValue(Word8.toLargeInt opByte); printValue(Word8.toLargeInt opByte2))
3314                                end
3315
3316                            |   0wxAF =>
3317                                let
3318                                    val nb = codeVecGet (seg, !ptr);
3319                                    val reg = (nb >>- 0w3) andb8 0w7
3320                                in
3321                                    print "imul\t";
3322                                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3323                                    print ",";
3324                                    printEA (rex, sizeFromRexW)
3325                                end
3326
3327                            |   0wx80 => printJmp32 "jo  "
3328                            |   0wx81 => printJmp32 "jno "
3329                            |   0wx82 => printJmp32 "jb  "
3330                            |   0wx83 => printJmp32 "jnb "
3331                            |   0wx84 => printJmp32 "je  "
3332                            |   0wx85 => printJmp32 "jne "
3333                            |   0wx86 => printJmp32 "jna "
3334                            |   0wx87 => printJmp32 "ja  "
3335                            |   0wx88 => printJmp32 "js  "
3336                            |   0wx89 => printJmp32 "jns "
3337                            |   0wx8a => printJmp32 "jp  "
3338                            |   0wx8b => printJmp32 "jnp " 
3339                            |   0wx8c => printJmp32 "jl  "
3340                            |   0wx8d => printJmp32 "jge "
3341                            |   0wx8e => printJmp32 "jle "
3342                            |   0wx8f => printJmp32 "jg  "
3343
3344                            |   0wx90 => (print "seto\t";  printEA (rex, SZByte))
3345                            |   0wx91 => (print "setno\t"; printEA (rex, SZByte))
3346                            |   0wx92 => (print "setb\t";  printEA (rex, SZByte))
3347                            |   0wx93 => (print "setnb\t"; printEA (rex, SZByte))
3348                            |   0wx94 => (print "sete\t";  printEA (rex, SZByte))
3349                            |   0wx95 => (print "setne\t"; printEA (rex, SZByte))
3350                            |   0wx96 => (print "setna\t"; printEA (rex, SZByte))
3351                            |   0wx97 => (print "seta\t";  printEA (rex, SZByte))
3352                            |   0wx98 => (print "sets\t";  printEA (rex, SZByte))
3353                            |   0wx99 => (print "setns\t"; printEA (rex, SZByte))
3354                            |   0wx9a => (print "setp\t";  printEA (rex, SZByte))
3355                            |   0wx9b => (print "setnp\t"; printEA (rex, SZByte)) 
3356                            |   0wx9c => (print "setl\t";  printEA (rex, SZByte))
3357                            |   0wx9d => (print "setge\t"; printEA (rex, SZByte))
3358                            |   0wx9e => (print "setle\t"; printEA (rex, SZByte))
3359                            |   0wx9f => (print "setg\t";  printEA (rex, SZByte))
3360                            
3361                            |   _ => (print "esc\t"; printValue(Word8.toLargeInt opByte2))
3362                        )
3363                    
3364                    |   0wxf2 => (* SSE2 instruction *)
3365                        let
3366                            val nb = codeVecGet (seg, !ptr)
3367                            val rr = (nb >>- 0w3) andb8 0w7
3368                            val reg = SSE2Reg rr
3369                        in
3370                            case opByte2 of
3371                                0wx10 => ( print "movsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3372                            |   0wx11 => ( print "movsd\t"; printEAxmm(rex, SZQWord); print ","; print(xmmRegRepr reg)  )
3373                            |   0wx2a => ( print "cvtsi2sd\t"; print(xmmRegRepr reg); print ","; printEA(rex, sizeFromRexW)  )
3374                            |   0wx2c =>
3375                                    ( print "cvttsd2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW)  )
3376                            |   0wx2d =>
3377                                    ( print "cvtsd2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW)  )
3378                            |   0wx58 => ( print "addsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3379                            |   0wx59 => ( print "mulsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3380                            |   0wx5a => ( print "cvtsd2ss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3381                            |   0wx5c => ( print "subsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3382                            |   0wx5e => ( print "divsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3383                            |   b => (print "F2\n"; print "0F\n"; print(Word8.fmt StringCvt.HEX b))
3384                        end
3385
3386                    |   0wxf3 => (* SSE2 instruction. *)
3387                        let
3388                            val nb = codeVecGet (seg, !ptr)
3389                            val rr = (nb >>- 0w3) andb8 0w7
3390                            val reg = SSE2Reg rr
3391                        in
3392                            case opByte2 of
3393                                0wx10 => ( print "movss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) )
3394                            |   0wx11 => ( print "movss\t"; printEAxmm(rex, SZDWord); print ","; print(xmmRegRepr reg)  )
3395                            |   0wx2c =>
3396                                    ( print "cvttss2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW)  )
3397                            |   0wx2d =>
3398                                    ( print "cvtss2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW)  )
3399                            |   0wx5a => ( print "cvtss2sd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord)  )
3400                            |   0wx58 => ( print "addss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) )
3401                            |   0wx59 => ( print "mulss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) )
3402                            |   0wx5c => ( print "subss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) )
3403                            |   0wx5e => ( print "divss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) )
3404                            |   b => (print "F3\n"; print "0F\n"; print(Word8.fmt StringCvt.HEX b))
3405                        end
3406
3407                    |   0wx66 => (* SSE2 instruction *)
3408                        let
3409                            val nb = codeVecGet (seg, !ptr)
3410                            val reg = SSE2Reg((nb >>- 0w3) andb8 0w7)
3411                        in
3412                            case opByte2 of
3413                                0wx2e => ( print "ucomisd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3414                            |   0wx54 => ( print "andpd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3415                            |   0wx57 => ( print "xorpd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) )
3416                            |   0wx6e => ( print (if rexW then "movq\t" else "movd\t"); print(xmmRegRepr reg); print ","; printEA(rex, sizeFromRexW) )
3417                            |   0wx7e => ( print (if rexW then "movq\t" else "movd\t"); printEA(rex, sizeFromRexW); print ","; print(xmmRegRepr reg)  )
3418                            |   0wx73 => ( print "psrldq\t"; printEAxmm(rex, SZQWord); print ","; print8 ())
3419                            |   b => (print "66\n"; print "0F\n"; print(Word8.fmt StringCvt.HEX b))
3420                        end
3421
3422                    |   _ => (print "esc\t"; printValue(Word8.toLargeInt opByte2))
3423                end (* ESCAPE *)
3424
3425            |   0wx13 => printGvEv (opByte, rex, rexR, sizeFromRexW)
3426
3427            |   0wx1b => printGvEv (opByte, rex, rexR, sizeFromRexW)
3428
3429            |   0wx23 => printGvEv (opByte, rex, rexR, sizeFromRexW)
3430
3431            |   0wx2b => printGvEv (opByte, rex, rexR, sizeFromRexW)
3432
3433            |   0wx33 => printGvEv (opByte, rex, rexR, sizeFromRexW)
3434
3435            |   0wx3b => printGvEv (opByte, rex, rexR, sizeFromRexW)
3436
3437                (* Push and Pop.  These are 64-bit on X86/64 whether there is REX prefix or not. *)
3438            |   0wx50 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3439            |   0wx51 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3440            |   0wx52 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3441            |   0wx53 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3442            |   0wx54 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3443            |   0wx55 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3444            |   0wx56 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3445            |   0wx57 => print ("push\t" ^  genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3446
3447            |   0wx58 => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3448            |   0wx59 => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3449            |   0wx5a => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3450            |   0wx5b => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3451            |   0wx5c => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3452            |   0wx5d => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3453            |   0wx5e => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3454            |   0wx5f => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64))
3455            
3456            |   0wx63 => (* MOVSXD. This is ARPL in 32-bit mode but that's never used here. *)
3457                let
3458                    val nb = codeVecGet (seg, !ptr)
3459                    val reg = (nb >>- 0w3) andb8 0w7
3460                in
3461                    print "movsxd\t";
3462                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3463                    print ",";
3464                    printEA(rex, SZDWord)
3465                end
3466
3467            |   0wx68 => (print "push\t"; print32 ())
3468            |   0wx69 =>
3469                let
3470                    (* Register is in next byte. *)
3471                    val nb = codeVecGet (seg, !ptr)
3472                    val reg = (nb >>- 0w3) andb8 0w7
3473                in
3474                    print "imul\t"; print(genRegRepr (mkReg(reg, rexR), sizeFromRexW)); print ",";
3475                    printEA(rex, sizeFromRexW); print ","; print32 ()
3476                end
3477            |   0wx6a => (print "push\t"; print8 ())
3478            |   0wx6b =>
3479                let
3480                    (* Register is in next byte. *)
3481                    val nb = codeVecGet (seg, !ptr)
3482                    val reg = (nb >>- 0w3) andb8 0w7
3483                in
3484                    print "imul\t"; print(genRegRepr (mkReg(reg, rexR), sizeFromRexW)); print ",";
3485                    printEA(rex, sizeFromRexW); print ","; print8 ()
3486                end
3487
3488            |   0wx70 => (print "jo  \t"; printJmp())
3489            |   0wx71 => (print "jno \t"; printJmp())
3490            |   0wx72 => (print "jb  \t"; printJmp())
3491            |   0wx73 => (print "jnb \t"; printJmp())
3492            |   0wx74 => (print "je  \t"; printJmp())
3493            |   0wx75 => (print "jne \t"; printJmp())
3494            |   0wx76 => (print "jna \t"; printJmp())
3495            |   0wx77 => (print "ja  \t"; printJmp())
3496            |   0wx78 => (print "js  \t"; printJmp())
3497            |   0wx79 => (print "jns \t"; printJmp())
3498            |   0wx7a => (print "jp  \t"; printJmp())
3499            |   0wx7b => (print "jnp \t"; printJmp())
3500            |   0wx7c => (print "jl  \t"; printJmp())
3501            |   0wx7d => (print "jge \t"; printJmp())
3502            |   0wx7e => (print "jle \t"; printJmp())
3503            |   0wx7f => (print "jg  \t"; printJmp())
3504
3505            |   0wx80 => (* Group1_8_a *)
3506                let (* Memory, byte constant *)
3507                    (* Opcode is determined by next byte. *)
3508                    val nb = Word8.toInt (codeVecGet (seg, !ptr))
3509                in
3510                    printArith ((nb div 8) mod 8);
3511                    print "\t";
3512                    printEA(rex, SZByte);
3513                    print ",";
3514                    print8 ()
3515                end
3516
3517            |   0wx81 =>
3518                let (* Memory, 32-bit constant *)
3519                    (* Opcode is determined by next byte. *)
3520                    val nb = Word8.toInt (codeVecGet (seg, !ptr))
3521                in
3522                    printArith ((nb div 8) mod 8);
3523                    print "\t";
3524                    printEA(rex, sizeFromRexW);
3525                    print ",";
3526                    print32 ()
3527                end
3528
3529            |   0wx83 =>
3530                let (* Word memory, 8-bit constant *)
3531                    (* Opcode is determined by next byte. *)
3532                    val nb = Word8.toInt (codeVecGet (seg, !ptr))
3533                in
3534                    printArith ((nb div 8) mod 8);
3535                    print "\t";
3536                    printEA(rex, sizeFromRexW);
3537                    print ",";
3538                    print8 ()
3539                end
3540
3541            |   0wx87 =>
3542                let (* xchng *)
3543                    (* Register is in next byte. *)
3544                    val nb = codeVecGet (seg, !ptr)
3545                    val reg = (nb >>- 0w3) andb8 0w7
3546                in
3547                    print "xchng \t";
3548                    printEA(rex, sizeFromRexW);
3549                    print ",";
3550                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW))
3551                end
3552
3553            |   0wx88 =>
3554                let (* mov eb,gb i.e a store *)
3555                    (* Register is in next byte. *)
3556                    val nb = Word8.toInt (codeVecGet (seg, !ptr));
3557                    val reg = (nb div 8) mod 8;
3558                in
3559                    print "mov \t";
3560                    printEA(rex, SZByte);
3561                    print ",";
3562                    if rexR
3563                    then print ("r" ^ Int.toString(reg+8) ^ "B")
3564                    else case reg of
3565                        0 => print "al"
3566                    |   1 => print "cl"
3567                    |   2 => print "dl"
3568                    |   3 => print "bl"
3569                         (* If there is a REX byte these select the low byte of the registers. *)
3570                    |   4 => print (if rex = 0w0 then "ah" else "sil")
3571                    |   5 => print (if rex = 0w0 then "ch" else "dil")
3572                    |   6 => print (if rex = 0w0 then "dh" else "bpl")
3573                    |   7 => print (if rex = 0w0 then "bh" else "spl")
3574                    |   _ => print ("r" ^ Int.toString reg)
3575                end
3576
3577            |   0wx89 =>
3578                let (* mov ev,gv i.e. a store *)
3579                    (* Register is in next byte. *)
3580                    val nb = codeVecGet (seg, !ptr)
3581                    val reg = (nb >>- 0w3) andb8 0w7
3582                in
3583                    print "mov \t";
3584                    (* This may have an opcode prefix. *)
3585                    printEA(rex, if legacyPrefix = 0wx66 then SZWord else sizeFromRexW);
3586                    print ",";
3587                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW))
3588                end
3589         
3590            |   0wx8b =>
3591                let (* mov gv,ev i.e. a load *)
3592                    (* Register is in next byte. *)
3593                    val nb = codeVecGet (seg, !ptr)
3594                    val reg = (nb >>- 0w3) andb8 0w7
3595                in
3596                    print "mov \t";
3597                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3598                    print ",";
3599                    printEA(rex, sizeFromRexW)
3600                end
3601
3602            |   0wx8d =>
3603                let (* lea gv.M *)
3604                    (* Register is in next byte. *)
3605                    val nb = codeVecGet (seg, !ptr)
3606                    val reg = (nb >>- 0w3) andb8 0w7
3607                in
3608                    print "lea \t";
3609                    print (genRegRepr (mkReg(reg, rexR), sizeFromRexW));
3610                    print ",";
3611                    printEA(rex, sizeFromRexW)
3612                end
3613         
3614            |   0wx8f => (print "pop \t"; printEA(rex, sz32_64))
3615            |   0wx90 => print "nop"
3616            
3617            |   0wx99 => if rexW then print "cqo" else print "cdq"
3618            
3619            |   0wx9e => print "sahf\n"
3620
3621            |   0wxa4 => (if legacyPrefix = 0wxf3 then print "rep " else (); print "movsb")
3622            |   0wxa5 => (if legacyPrefix = 0wxf3 then print "rep " else (); print "movsl")
3623            |   0wxa6 => (if legacyPrefix = 0wxf3 then print "repe " else (); print "cmpsb")
3624
3625            |   0wxa8 => (print "test\tal,"; print8 ())
3626
3627            |   0wxaa => (if legacyPrefix = 0wxf3 then print "rep " else (); print "stosb")
3628            |   0wxab =>
3629                (
3630                    if legacyPrefix = 0wxf3 then print "rep " else ();
3631                    if rexW then print "stosq" else print "stosl"
3632                )
3633
3634            |   0wxb8 => printMovCToR (opByte, sizeFromRexW, rexB)
3635            |   0wxb9 => printMovCToR (opByte, sizeFromRexW, rexB)
3636            |   0wxba => printMovCToR (opByte, sizeFromRexW, rexB)
3637            |   0wxbb => printMovCToR (opByte, sizeFromRexW, rexB)
3638            |   0wxbc => printMovCToR (opByte, sizeFromRexW, rexB)
3639            |   0wxbd => printMovCToR (opByte, sizeFromRexW, rexB)
3640            |   0wxbe => printMovCToR (opByte, sizeFromRexW, rexB)
3641            |   0wxbf => printMovCToR (opByte, sizeFromRexW, rexB)
3642   
3643            |   0wxc1 => (* Group2_8_A *) printShift (opByte, rex, sizeFromRexW)
3644
3645            |   0wxc2 => (print "ret \t"; print16 ())
3646            |   0wxc3 => print "ret"
3647         
3648            |   0wxc6 => (* move 8-bit constant to memory *)
3649                (
3650                    print "mov \t";
3651                    printEA(rex, SZByte);
3652                    print ",";
3653                    print8 ()
3654                )
3655
3656            |   0wxc7 => (* move 32/64-bit constant to memory *)
3657                (
3658                    print "mov \t";
3659                    printEA(rex, sizeFromRexW);
3660                    print ",";
3661                    print32 ()
3662                )
3663            
3664            |   0wxca => (* Register mask *)
3665                let
3666                    val mask = get16u (!ptr, seg) before (ptr +:= 0w2)
3667                in
3668                    print "SAVE\t";
3669                    printMask mask
3670                end
3671
3672            |   0wxcd => (* Register mask *)
3673                let
3674                    val mask = get8u (!ptr, seg) before (ptr +:= 0w1)
3675                in
3676                    print "SAVE\t";
3677                    printMask(Word8.toInt mask)
3678                end
3679
3680            |   0wxd1 => (* Group2_1_A *) printShift (opByte, rex, sizeFromRexW)
3681
3682            |   0wxd3 => (* Group2_CL_A *) printShift (opByte, rex, sizeFromRexW)
3683           
3684            |   0wxd8 => printFloat (opByte, rex) (* Floating point escapes *)
3685            |   0wxd9 => printFloat (opByte, rex)
3686            |   0wxda => printFloat (opByte, rex)
3687            |   0wxdb => printFloat (opByte, rex)
3688            |   0wxdc => printFloat (opByte, rex)
3689            |   0wxdd => printFloat (opByte, rex)
3690            |   0wxde => printFloat (opByte, rex)
3691            |   0wxdf => printFloat (opByte, rex)
3692
3693            |   0wxe8 =>
3694                let (* 32-bit relative call. *)
3695                    val valu = get32s (!ptr, seg) before (ptr +:= 0w4)
3696                in
3697                    print "call\t";
3698                    print (Word.fmt StringCvt.HEX (!ptr + Word.fromLargeInt valu))
3699                end
3700
3701            |   0wxe9 =>
3702                let (* 32-bit relative jump. *)
3703                    val valu = get32s (!ptr, seg) before (ptr +:= 0w4)
3704                in
3705                    print "jmp \t";
3706                    print (Word.fmt StringCvt.HEX (!ptr + Word.fromLargeInt valu))
3707                end
3708
3709            |   0wxeb => (print "jmp \t"; printJmp())
3710            
3711            |   0wxf4 => print "hlt" (* Marker to indicate end-of-code. *)
3712        
3713            |   0wxf6 => (* Group3_a *)
3714                let
3715                    (* Opcode is determined by next byte. *)
3716                    val nb = Word8.toInt (codeVecGet (seg, !ptr))
3717                    val opc = (nb div 8) mod 8
3718                in
3719                    print
3720                      (case opc of
3721                         0 => "test"
3722                       | 3 => "neg"
3723                       | _ => "???"
3724                      );
3725                    print "\t";
3726                    printEA(rex, SZByte);
3727                    if opc = 0 then (print ","; print8 ()) else ()
3728                end
3729
3730            |   0wxf7 => (* Group3_A *)
3731                let
3732                    (* Opcode is determined by next byte. *)
3733                    val nb = Word8.toInt (codeVecGet (seg, !ptr))
3734                    val opc = (nb div 8) mod 8
3735                in
3736                    print
3737                      (case opc of
3738                         0 => "test"
3739                       | 3 => "neg "
3740                       | 4 => "mul "
3741                       | 5 => "imul"
3742                       | 6 => "div "
3743                       | 7 => "idiv"
3744                       | _ => "???"
3745                      );
3746                    print "\t";
3747                    printEA(rex, sizeFromRexW);
3748                    (* Test has an immediate operand.  It's 32-bits even in 64-bit mode. *)
3749                    if opc = 0 then (print ","; print32 ()) else ()
3750                end
3751         
3752            |   0wxff => (* Group5 *)
3753                let
3754                    (* Opcode is determined by next byte. *)
3755                    val nb = Word8.toInt (codeVecGet (seg, !ptr))
3756                    val opc = (nb div 8) mod 8
3757                in
3758                    print
3759                      (case opc of
3760                         2 => "call"
3761                       | 4 => "jmp "
3762                       | 6 => "push"
3763                       | _ => "???"
3764                      );
3765                    print "\t";
3766                    printEA(rex, sz32_64) (* None of the cases we use need a prefix. *)
3767                end
3768 
3769            |   _ => print(Word8.fmt StringCvt.HEX opByte);
3770      
3771            print "\n"
3772        end; (* end of while loop *)
3773
3774        print "\n"
3775
3776    end (* printCode *);
3777
3778    (* Although this is used locally it must be defined at the top level
3779       otherwise a new RTS function will be compiler every time the
3780       containing function is called *)
3781    val sortFunction: (machineWord * word) array -> bool =
3782        RunCall.rtsCallFast1 "PolySortArrayOfAddresses"
3783
3784    (* This actually does the final code-generation. *)
3785    fun generateCode
3786        {ops=operations,
3787         code=cvec as Code{procName, printAssemblyCode, printStream, profileObject, ...},
3788         labelCount, resultClosure} : unit =
3789    let
3790        val (expanded, newLabelCount) = expandComplexOperations (operations, labelCount)
3791
3792        val () = printLowLevelCode(expanded, cvec)
3793        local
3794            val initialBytesList = codeGenerate expanded
3795        in
3796            (* Fixup labels and shrink long branches to short. *)
3797            val (bytesList, codeSize) = fixupLabels(expanded, initialBytesList, newLabelCount)
3798        end
3799
3800        local
3801            (* Extract the constants and the location of the references from the code. *)
3802            val (inlineConstants, addressConstants, nonAddressConstants) = getConstants(expanded, bytesList)
3803            
3804            (* Sort the non-address constants to remove duplicates.  There don't seem to be
3805               many in practice.
3806               Since we're not actually interested in the order but only
3807               sorting to remove duplicates we can use a stripped-down Quicksort. *)
3808            fun sort([], out) = out
3809            |   sort((addr, median) :: tl, out) = partition(median, tl, [addr], [], [], out)
3810
3811            and partition(median, [], addrs, less, greater, out) =
3812                    sort(less, sort(greater, (addrs, median) :: out))
3813            |   partition(median, (entry as (addr, value)) :: tl, addrs, less, greater, out) =
3814                    if value = median
3815                    then partition(median, tl, addr::addrs, less, greater, out)
3816                    else if value < median
3817                    then partition(median, tl, addrs, entry :: less, greater, out)
3818                    else partition(median, tl, addrs, less, entry :: greater, out)
3819            
3820            (* Non-address constants.  We can't use any ordering on them because a GC could
3821               change the values half way through the sort.  Instead we use a simple search
3822               for a small number of constants and use an RTS call for larger numbers.  We
3823               want to avoid quadratic cost when there are large numbers. *)
3824
3825            val sortedConstants =
3826                if List.length addressConstants < 10
3827                then
3828                let
3829                    fun findDups([], out) = out
3830                    |   findDups((addr, value) :: tl, out) =
3831                        let
3832                            fun partition(e as (a, v), (eq, neq)) =
3833                                if PolyML.pointerEq(value, v)
3834                                then (a :: eq, neq)
3835                                else (eq, e :: neq)
3836                            val (eqAddr, neq) = List.foldl partition ([addr], []) tl
3837                        in
3838                            findDups(neq, (eqAddr, value) :: out)
3839                        end
3840                in
3841                    findDups(addressConstants, [])
3842                end
3843                else
3844                let
3845                    fun swap (a, b) = (b, a)
3846                    val arrayToSort: (machineWord * word) array =
3847                        Array.fromList (List.map swap addressConstants)
3848                    val _ = sortFunction arrayToSort
3849                    
3850                    fun makeList((v, a), []) = [([a], v)]
3851                    |   makeList((v, a), l as (aa, vv) :: tl) =
3852                        if PolyML.pointerEq(v, vv)
3853                        then (a :: aa, vv) :: tl
3854                        else ([a], v) :: l
3855                in
3856                    Array.foldl makeList [] arrayToSort
3857                end
3858        in
3859            val inlineConstants = inlineConstants
3860            and addressConstants = sortedConstants
3861            and nonAddressConstants = sort(nonAddressConstants, [])
3862        end
3863
3864        (* Get the number of constants that need to be added to the address area. *)
3865        val constsInConstArea = List.length addressConstants
3866
3867        local
3868            (* Add one byte for the HLT and round up to a number of words. *)
3869            val endOfCode = (codeSize+nativeWordSize) div nativeWordSize * (nativeWordSize div wordSize)
3870            val numOfNonAddrWords = Word.fromInt(List.length nonAddressConstants)
3871            (* Each entry in the non-address constant area is 8 bytes. *)
3872            val intSize = 0w8 div wordSize
3873        in
3874            val endOfByteArea = endOfCode + numOfNonAddrWords * intSize
3875            (* +4 for no of consts. function name, profile object and offset to start of consts. *)
3876            val segSize = endOfByteArea + Word.fromInt constsInConstArea + 0w4
3877        end
3878
3879        (* Create a byte vector and copy the data in.  This is a byte area and not scanned
3880           by the GC so cannot contain any addresses. *)
3881        val byteVec = byteVecMake segSize
3882        val ic = ref 0w0
3883        
3884        local
3885            fun genByte (ival: Word8.word) = set8u (ival, !ic, byteVec) before ic := !ic + 0w1
3886        in
3887            fun genBytes l = Word8Vector.app (fn i => genByte i) l
3888            val () = List.app (fn b => genBytes b) bytesList
3889            val () = genBytes(Word8Vector.fromList(opCodeBytes(HLT, NONE))) (* Marker - this is used by ScanConstants in the RTS. *)
3890        end
3891    
3892        (* Align ic onto a fullword boundary. *)
3893        val ()   = ic := ((!ic + nativeWordSize - 0w1) andb ~nativeWordSize)
3894        
3895        (* Copy the non-address constants.  These are only used in 64-bit mode and are
3896           either real constants or integers that are too large to fit in a 32-bit
3897           inline constants.  We don't use this for real constants in 32-bit mode because
3898           we don't have relative addressing.  Instead a real constant is the inline
3899           address of a boxed real number. *)
3900        local
3901            fun putNonAddrConst(addrs, constant) =
3902                let
3903                    val addrOfConst = ! ic
3904                    val () = genBytes(Word8Vector.fromList(largeWordToBytes(constant, 8)))
3905                    fun setAddr addr = set32s(Word.toLargeInt(addrOfConst - addr - 0w4), addr, byteVec)
3906                in
3907                    List.app setAddr addrs
3908                end
3909        in
3910            val () = List.app putNonAddrConst nonAddressConstants
3911        end
3912
3913        val _ = bytesToWords(! ic) = endOfByteArea orelse raise InternalError "mismatch"
3914        
3915        (* Put in the number of constants. This must go in before we actually put
3916           in any constants.  In 32-bit mode there are only two constants: the 
3917           function name and the profile object.
3918           All other constants are in the code. *)
3919        local
3920            val lastWord = wordsToBytes(endOfByteArea + 0w3 + Word.fromInt constsInConstArea)
3921
3922            fun setBytes(_, _, 0) = ()
3923            |   setBytes(ival, offset, count) =
3924                (
3925                    byteVecSet(byteVec, offset, Word8.fromLargeInt(ival mod 256));
3926                    setBytes(ival div 256, offset+0w1, count-1)
3927                )
3928        in
3929            val () = setBytes(LargeInt.fromInt(2 + constsInConstArea), wordsToBytes endOfByteArea, Word.toInt wordSize)
3930            (* Set the last word of the code to the (negative) byte offset of the start of the code area
3931               from the end of this word. *)
3932            val () = setBytes(Word.toLargeIntX(wordsToBytes endOfByteArea - lastWord), lastWord, Word.toInt wordSize)
3933        end;
3934
3935        (* We've put in all the byte data so it is safe to convert this to a mutable code
3936           cell that can contain addresses and will be scanned by the GC. *)
3937        val codeSeg = byteVecToCodeVec(byteVec, resultClosure)
3938
3939        (* Various RTS functions assume that the first constant is the function name.
3940           The profiler assumes that the second word is the address of the mutable that
3941           contains the profile count. *)
3942        val () = codeVecPutWord (codeSeg, endOfByteArea + 0w1, toMachineWord procName)
3943        (* Next the profile object. *)
3944        val () = codeVecPutWord (codeSeg, endOfByteArea + 0w2, profileObject)
3945    in
3946        let
3947            fun setBytes(_, _, 0w0) = ()
3948            |   setBytes(b, addr, count) =
3949                (
3950                    codeVecSet (codeSeg, addr, wordToWord8 b);
3951                    setBytes(b >> 0w8, addr+0w1, count-0w1)
3952                )
3953
3954            (*  Inline constants - native 32-bit only plus one special case in 32-in-64 *)
3955            fun putInlConst (addrs, SelfAddress) =
3956                    (* Self address goes inline. *)
3957                    codeVecPutConstant (codeSeg, addrs, toMachineWord(codeVecAddr codeSeg), ConstAbsolute)
3958            |   putInlConst (addrs, InlineAbsoluteAddress m) =
3959                    codeVecPutConstant (codeSeg, addrs, m, ConstAbsolute)
3960            |   putInlConst (addrs, InlineRelativeAddress m) =
3961                    codeVecPutConstant (codeSeg, addrs, m, ConstX86Relative)
3962            
3963            val _ = List.app putInlConst inlineConstants
3964
3965            (* Address constants - native 64-bit and 32-in-64. *)
3966            fun putAddrConst ((addrs, m), constAddr) =
3967            (*  Put the constant in the constant area and set the original address
3968                to be the relative offset to the constant itself. *)
3969            (
3970                codeVecPutWord (codeSeg, constAddr, m);
3971                (* Put in the 32-bit offset - always unsigned since the destination
3972                   is after the reference. *)
3973                List.app(fn addr => setBytes(constAddr * wordSize - addr - 0w4, addr, 0w4)) addrs;
3974                constAddr+0w1
3975            )
3976
3977            (* Put the constants.  Any values in the constant area start at +3 i.e. after the profile. *)
3978            val _ = List.foldl putAddrConst (endOfByteArea+0w3) addressConstants
3979
3980            val () = 
3981                if printAssemblyCode
3982                then (* print out the code *)
3983                (
3984                    printCode(cvec, codeSeg);
3985                    printStream "\n\n"
3986                )
3987            else ()
3988        in
3989            (* Finally lock the code. *)
3990            codeVecLock(codeSeg, resultClosure)
3991        end (* the result *)
3992    end (* generateCode *)
3993 
3994    structure Sharing =
3995    struct
3996        type code           = code
3997        and  reg            = reg
3998        and  genReg         = genReg
3999        and  fpReg          = fpReg
4000        and  addrs          = addrs
4001        and  operation      = operation
4002        and  regSet         = RegSet.regSet
4003        and  label          = label
4004        and  branchOps      = branchOps
4005        and  arithOp        = arithOp
4006        and  shiftType      = shiftType
4007        and  repOps         = repOps
4008        and  fpOps          = fpOps
4009        and  fpUnaryOps     = fpUnaryOps
4010        and  sse2Operations = sse2Operations
4011        and  opSize         = opSize
4012        and  closureRef     = closureRef
4013    end
4014
4015end (* struct *) (* CODECONS *);
4016