1(*
2    Copyright David C. J. Matthews 2016-20
3
4    This library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Lesser General Public
6    License version 2.1 as published by the Free Software Foundation.
7    
8    This library is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11    Lesser General Public License for more details.
12    
13    You should have received a copy of the GNU Lesser General Public
14    License along with this library; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
16*)
17
18functor X86ICodeToX86Code(
19
20    structure X86CODE: X86CODESIG
21
22    structure X86OPTIMISE:
23    sig
24        type operation
25        type code
26        type operations = operation list
27        type closureRef
28
29        val generateCode: {code: code, ops: operations, labelCount: int, resultClosure: closureRef } -> unit
30
31        structure Sharing:
32        sig
33            type operation = operation
34            type code = code
35            type closureRef = closureRef
36        end
37    end
38
39    structure DEBUG: DEBUG
40    
41    structure ICODE: ICodeSig
42    structure IDENTIFY: X86IDENTIFYREFSSIG
43    structure INTSET: INTSETSIG
44    structure PRETTY: PRETTYSIG
45
46    structure STRONGLY:
47        sig
48            val stronglyConnectedComponents: {nodeAddress: 'a -> int, arcs: 'a -> int list } -> 'a list -> 'a list list
49        end
50    
51    sharing X86CODE.Sharing = ICODE.Sharing = X86OPTIMISE.Sharing = IDENTIFY.Sharing = INTSET
52): X86ICODEGENERATESIG =
53struct
54    open IDENTIFY
55    open ICODE
56
57    open X86CODE
58
59    open Address
60    
61    exception InternalError = Misc.InternalError
62    
63    fun asGenReg(GenReg r) = r
64    |   asGenReg _ = raise InternalError "asGenReg"
65    
66    and asFPReg(FPReg r) = r
67    |   asFPReg _ = raise InternalError "asFPReg"
68   
69    and asXMMReg(XMMReg r) = r
70    |   asXMMReg _ = raise InternalError "asXMMReg"
71
72    (* tag a short constant *)
73    fun tag c = 2 * c + 1
74
    local
        (* General-purpose registers available for allocation on each target.
           Note that ObjectId32Bit omits ebx: in that mode ebx is used as the
           base register for object-id addressing (see codeExtArgument and
           codeCallKind below). *)
        val regs =
            case targetArch of
                Native32Bit     => [edi, esi, edx, ecx, ebx, eax]
            |   Native64Bit     => [r14, r13, r12, r11, r10, r9, r8, edi, esi, edx, ecx, ebx, eax]
            |   ObjectId32Bit   => [r14, r13, r12, r11, r10, r9, r8, edi, esi, edx, ecx, eax]
    in
        (* The allocatable general registers wrapped in the combined register type. *)
        val generalRegisters = List.map GenReg regs
    end
84
85    fun opSizeToMove OpSize32 = Move32 | opSizeToMove OpSize64 = Move64
86
87    fun icodeToX86Code{blocks, functionName, stackRequired, debugSwitches, allocatedRegisters, resultClosure, ...} =
88    let        
89        fun argAsGenReg(RegisterArg(GenReg r)) = r
90        |   argAsGenReg _ = raise InternalError "argAsGenReg"
91
92        fun sourceAsGenRegOrMem(RegisterArg(GenReg r)) = RegisterArg r
93        |   sourceAsGenRegOrMem(MemoryArg{offset, base=baseReg, index}) =
94                MemoryArg{base=baseReg, offset=offset, index=index}
95        |   sourceAsGenRegOrMem(NonAddressConstArg v) = NonAddressConstArg v
96        |   sourceAsGenRegOrMem(AddressConstArg v) = AddressConstArg v
97        |   sourceAsGenRegOrMem _ = raise InternalError "sourceAsGenRegOrMem"
98
99        and sourceAsXMMRegOrMem(RegisterArg(XMMReg r)) = RegisterArg r
100        |   sourceAsXMMRegOrMem(MemoryArg{offset, base=baseReg, index}) =
101                MemoryArg{base=baseReg, offset=offset, index=index}
102        |   sourceAsXMMRegOrMem(NonAddressConstArg v) = NonAddressConstArg v
103        |   sourceAsXMMRegOrMem(AddressConstArg v) = AddressConstArg v
104        |   sourceAsXMMRegOrMem _ = raise InternalError "sourceAsGenRegOrMem"
105
        (* Moves and loads.  Produce the low-level instruction(s) to load
           [source] into the destination register for the given move kind,
           prepending them to [code].  N.B. the instruction stream is built in
           reverse: new instructions are consed onto the front, so the head of
           the result executes last. *)
        fun llLoadArgument({ source, dest=GenReg destReg, kind=Move64Bit}, code) =
                Move { source=sourceAsGenRegOrMem source, destination=RegisterArg destReg, moveSize=Move64 } :: code

        |   llLoadArgument({ source=MemoryArg mLoc, dest=GenReg destReg, kind=MoveByte}, code) = (* Load from memory. *)
                Move{moveSize=Move8, source=MemoryArg mLoc, destination=RegisterArg destReg} :: code

        |   llLoadArgument({ source=MemoryArg mLoc, dest=GenReg destReg, kind=Move16Bit}, code) = (* Load from memory. *)
                Move{moveSize=Move16, source=MemoryArg mLoc, destination=RegisterArg destReg} :: code

        |   llLoadArgument({ source, dest=GenReg destReg, kind=Move32Bit}, code) = (* Load from memory. *)
                Move { source=sourceAsGenRegOrMem source, destination=RegisterArg destReg, moveSize=Move32 } :: code

                (* Load a floating point value onto the x87 stack and then, unless
                   the destination is fp0, store-and-pop it into the target register. *)
        |   llLoadArgument({source=MemoryArg{offset, base=baseReg, index}, dest=FPReg fpReg, kind=MoveDouble}, code) =
                moveToOutputFP(fpReg,
                   FPLoadFromMemory{ address={base=baseReg, offset=offset, index=index}, precision=DoublePrecision } :: code)

        |   llLoadArgument({source=AddressConstArg addrConst, dest=FPReg fpReg, kind=MoveDouble}, code) =
                moveToOutputFP(fpReg, FPLoadFromConst{ constant= addrConst, precision=DoublePrecision } :: code)
        
        |   llLoadArgument({source=RegisterArg(FPReg fpSrc), dest=FPReg fpDest, kind=MoveDouble}, code) =
                (* Moving from one FP reg to another.  Even if we are moving from FP0 we still do a load
                   because FPStoreToFPReg adds one to the register number to account for one value on the
                   stack. *)
                moveToOutputFP(fpDest, FPLoadFromFPReg{source=fpSrc, lastRef=false} :: code)

                (* Load or move from an XMM reg. *)
        |   llLoadArgument({source, dest=XMMReg xmmRegReg, kind=MoveDouble}, code) =
                XMMArith { opc= SSE2MoveDouble, source=sourceAsXMMRegOrMem source, output=xmmRegReg } :: code

                (* Load a single-precision floating point value via the x87 stack. *)
        |   llLoadArgument({source=MemoryArg{offset, base=baseReg, index}, dest=FPReg fpReg, kind=MoveFloat}, code) =
                moveToOutputFP(fpReg,
                    FPLoadFromMemory{ address={ base=baseReg, offset=offset, index=index }, precision=SinglePrecision } :: code)

        |   llLoadArgument({source=AddressConstArg addrConst, dest=FPReg fpReg, kind=MoveFloat}, code) =
                moveToOutputFP(fpReg, FPLoadFromConst{ constant= addrConst, precision=SinglePrecision } :: code)

                (* Load or move from an XMM reg. *)
        |   llLoadArgument({source, dest=XMMReg xmmRegReg, kind=MoveFloat}, code) =
                XMMArith { opc= SSE2MoveFloat, source=sourceAsXMMRegOrMem source, output=xmmRegReg } :: code

            (* Any other combinations are not allowed. *)
        |   llLoadArgument _ = raise InternalError "codeGenICode: LoadArgument"
        
        (* Unless the destination is FP0 we need to store and pop: the loaded
           value sits on top of the x87 stack (i.e. in fp0) after the load. *)
        and moveToOutputFP(fpDest, code) =
            if fpDest = fp0 then code
            else FPStoreToFPReg{output=fpDest, andPop=true} :: code
156
                (* Store to memory.  Unlike llLoadArgument this returns a single
                   instruction rather than prepending to an accumulator; callers
                   cons the result onto their code list themselves. *)
        fun llStoreArgument{ source=RegisterArg(GenReg sourceReg), base, offset, index, kind=Move64Bit} =
                Move{source=RegisterArg sourceReg, destination=MemoryArg {base=base, offset=offset, index=index}, moveSize=opSizeToMove OpSize64}

        |   llStoreArgument{ source=RegisterArg(GenReg sourceReg), base, offset, index, kind=MoveByte} =
                Move{moveSize=Move8, source=RegisterArg sourceReg, destination=MemoryArg {base=base, offset=offset, index=index}} 

        |   llStoreArgument{ source=RegisterArg(GenReg sourceReg), base, offset, index, kind=Move16Bit} =
                Move{moveSize=Move16, source=RegisterArg sourceReg, destination=MemoryArg {base=base, offset=offset, index=index}}

        |   llStoreArgument{ source=RegisterArg(GenReg sourceReg), base, offset, index, kind=Move32Bit} =
                Move{source=RegisterArg sourceReg, destination=MemoryArg {base=base, offset=offset, index=index}, moveSize=opSizeToMove OpSize32}

                (* Store a short constant to memory *)
        |   llStoreArgument{ source=NonAddressConstArg srcValue, base, offset, index, kind=Move64Bit} =
                Move{source=NonAddressConstArg srcValue, destination=MemoryArg {base=base, offset=offset, index=index}, moveSize=Move64}
                
        |   llStoreArgument{ source=NonAddressConstArg srcValue, base, offset, index, kind=Move32Bit} =
                Move{source=NonAddressConstArg srcValue, destination=MemoryArg {base=base, offset=offset, index=index}, moveSize=Move32}

        |   llStoreArgument{ source=NonAddressConstArg srcValue, base, offset, index, kind=MoveByte} =
                Move{moveSize=Move8, source=NonAddressConstArg srcValue, destination=MemoryArg{base=base, offset=offset, index=index}}

                (* Store a long (address) constant to memory.  Address constants
                   are always exactly one polyWord, so the requested kind must
                   agree with the target's polyWord size. *)
        |   llStoreArgument{ source=AddressConstArg srcValue, base, offset, index, kind} =
            (
                (* This Move must be of a polyWord size. *)
                case (kind, polyWordOpSize) of
                    (Move64Bit, OpSize64) => ()
                |   (Move32Bit, OpSize32) => ()
                |   _ => raise InternalError "Move of AddressConstArg";
                Move{moveSize=opSizeToMove polyWordOpSize, source=AddressConstArg srcValue, destination=MemoryArg {base=base, offset=offset, index=index}}
            )

                (* Store a floating point value.  An x87 source must already be
                   on top of the stack (fp0); the store pops it. *)
        |   llStoreArgument{source=RegisterArg(FPReg fpReg), offset, base=baseReg, index, kind=MoveDouble} =
            let
                val _ = fpReg = fp0 orelse raise InternalError "llStoreArgument: Store FPReg <> fp0"
            in
                 FPStoreToMemory{ address={ base=baseReg, offset=offset, index=index}, precision=DoublePrecision, andPop=true }
            end

        |   llStoreArgument{source=RegisterArg(XMMReg xmmRegReg), offset, base=baseReg, index, kind=MoveDouble} =
                 XMMStoreToMemory { toStore=xmmRegReg, address={base=baseReg, offset=offset, index=index}, precision=DoublePrecision }

                (* Store a single-precision floating point value. *)
        |   llStoreArgument{source=RegisterArg(FPReg fpReg), offset, base=baseReg, index, kind=MoveFloat} =
            let
                val _ = fpReg = fp0 orelse raise InternalError "llStoreArgument: Store FPReg <> fp0"
            in
                 FPStoreToMemory{address={ base=baseReg, offset=offset, index=index}, precision=SinglePrecision, andPop=true }
            end

        |   llStoreArgument{source=RegisterArg(XMMReg xmmRegReg), offset, base=baseReg, index, kind=MoveFloat} =
                 XMMStoreToMemory { toStore=xmmRegReg, address={base=baseReg, offset=offset, index=index}, precision=SinglePrecision }

            (* Any other combinations are not allowed. *)
        |   llStoreArgument _ = raise InternalError "llStoreArgument: StoreArgument"
214        
        (* Number of extended basic blocks in the input ICode. *)
        val numBlocks = Vector.length blocks

        (* Map a pseudo-register number to the real register chosen by the
           register allocator. *)
        fun getAllocatedReg r = Vector.sub(allocatedRegisters, r)
        
        (* As above, additionally projecting to the expected register class
           (raises InternalError on a class mismatch). *)
        val getAllocatedGenReg = asGenReg o getAllocatedReg
        and getAllocatedFPReg = asFPReg o getAllocatedReg
        and getAllocatedXMMReg = asXMMReg o getAllocatedReg
222
223        fun codeExtIndex NoMemIndex = NoIndex
224        |   codeExtIndex(MemIndex1(PReg r)) = Index1(getAllocatedGenReg r)
225        |   codeExtIndex(MemIndex2(PReg r)) = Index2(getAllocatedGenReg r)
226        |   codeExtIndex(MemIndex4(PReg r)) = Index4(getAllocatedGenReg r)
227        |   codeExtIndex(MemIndex8(PReg r)) = Index8(getAllocatedGenReg r)
228        |   codeExtIndex ObjectIndex = raise InternalError "codeExtIndex: ObjectIndex"
229
        local
            (* Translate an ICode argument into an x86 operand.  [getReg] maps a
               pseudo-register to the required register class; memory base and
               index registers are always general registers.  An ObjectIndex is
               translated to ebx-based scaled addressing (object ids are scaled
               by 4 from the heap base held in ebx).  Cached locations use the
               caching register instead of the memory/stack access. *)
            fun codeExtArgument getReg (RegisterArgument(PReg r)) = RegisterArg(getReg r)
            |   codeExtArgument _ (AddressConstant m) = AddressConstArg m
            |   codeExtArgument _ (IntegerConstant i) = NonAddressConstArg i
            |   codeExtArgument _ (MemoryLocation{base=PReg bReg, offset, index=ObjectIndex, cache=NONE}) =
                    MemoryArg{base=ebx, index=Index4(getAllocatedGenReg bReg), offset=offset}
            |   codeExtArgument _ (MemoryLocation{base=PReg bReg, offset, index, cache=NONE}) =
                    MemoryArg{base=getAllocatedGenReg bReg, offset=offset, index=codeExtIndex index}
            |   codeExtArgument getReg (MemoryLocation{cache=SOME(PReg r), ...}) = RegisterArg(getReg r)
            |   codeExtArgument _ (StackLocation{wordOffset, cache=NONE, ...}) =
                    MemoryArg{base=esp, offset=wordOffset*Word.toInt nativeWordSize, index=NoIndex}
            |   codeExtArgument getReg (StackLocation{cache=SOME(PReg r), ...}) = RegisterArg(getReg r)
            |   codeExtArgument _ (ContainerAddr _) = raise InternalError "codeExtArgument - ContainerAddr"
        in
            (* Instantiations for each register class.  These val/and bindings
               are simultaneous, so the right-hand sides all refer to the local
               polymorphic codeExtArgument above, not to each other. *)
            val codeExtArgument = codeExtArgument getAllocatedReg
            and codeExtArgumentAsGenReg = codeExtArgument getAllocatedGenReg
            and codeExtArgumentAsFPReg = codeExtArgument getAllocatedFPReg
            and codeExtArgumentAsXMMReg = codeExtArgument getAllocatedXMMReg
        end
249
250        fun codeCallKind Recursive = NonAddressConstArg 0 (* Jump to the start *)
251        |   codeCallKind (ConstantCode v) = AddressConstArg v
252        |   codeCallKind FullCall =
253            (
254                case targetArch of
255                    ObjectId32Bit => MemoryArg{base=ebx, index=Index4 edx, offset=0}
256                |   _ => MemoryArg{base=edx, index=NoIndex, offset=0}
257            )
258
259        (* Move unless the registers are the same. *)
260        fun moveIfNecessary({src, dst, kind}, code) =
261            if src = dst then code
262            else llLoadArgument({source=RegisterArg src, dest=dst, kind=kind}, code)
263        
264        fun opSizeToIMove OpSize64 = Move64Bit
265        |   opSizeToIMove OpSize32 = Move32Bit
266
        (* A source for a multiple-value move: either a word on the stack,
           identified by its word offset from the stack pointer, or any other
           operand (register, memory or constant). *)
        datatype llsource =
            StackSource of int
        |   OtherSource of reg regOrMemoryArg
270
271        fun sourceToX86Code(OtherSource r) = r
272        |   sourceToX86Code(StackSource wordOffset) = MemoryArg{base=esp, offset=wordOffset*Word.toInt nativeWordSize, index=NoIndex}
273
        (* Build the dependency graph for multiple-value moves: each node is a
           (source, destination) pair, a node's address is its destination and
           its arcs are the locations its source reads.  Strongly connected
           components of this graph order the moves and expose cycles. *)
        local
            fun indexRegister NoIndex = NONE
            |   indexRegister (Index1 r) = SOME r
            |   indexRegister (Index2 r) = SOME r
            |   indexRegister (Index4 r) = SOME r
            |   indexRegister (Index8 r) = SOME r
            (* The registers are numbered from 0.  Choose values that don't conflict with
               the stack addresses.  Stack word offsets are non-negative, so
               negative numbers unambiguously denote registers. *)
            fun regNo r = ~1 - nReg r
            type node = {src: llsource, dst: destinations }
            
            fun nodeAddress({dst=RegDest r, ...}: node) = regNo r
            |   nodeAddress({dst=StackDest a, ...}) = a
            
            (* The locations read by a node's source: a stack slot, a register,
               or the base and (optional) index registers of a memory operand.
               Constants read nothing. *)
            fun arcs({src=StackSource wordOffset, ...}: node) = [wordOffset]
            |   arcs{src=OtherSource(RegisterArg r), ...} = [regNo r]
            |   arcs{src=OtherSource(MemoryArg{base, index, ...}), ...} =
                    (case indexRegister index of NONE => [regNo(GenReg base)] | SOME r => [regNo(GenReg base), regNo(GenReg r)])
            |   arcs _ = []
        in
            val stronglyConnected = STRONGLY.stronglyConnectedComponents { nodeAddress=nodeAddress, arcs=arcs }
        end
296        
        (* This is a general function for moving values into registers or to the stack
           where it is possible that the source values might also be in use as
           destinations.  The stack is used for destinations only for tail recursive
           calls.  [workReg] is a scratch register, required whenever any
           destination is a stack slot.  As elsewhere, instructions are prepended
           to [code], so the emitted stream is built in reverse. *)
        fun moveMultipleValues(moves, workReg: reg option, code) =
        let
            (* Sanity check: stack destinations need a work register. *)
            val _ =
                if List.exists(fn {dst=StackDest _, ...} => true | _ => false) moves andalso not(isSome workReg) then raise InternalError "no work reg" else ()
 
            fun moveValues ([], code) = code (* We're done. *)

            |   moveValues (arguments, code) =
                let
                    (* stronglyConnectedComponents does two things.  It detects loops where
                       it's not possible to move items without breaking the loop but more
                       importantly it orders the dependencies so that if there are no loops we
                       can load the source and store it in the destination knowing that
                       we won't overwrite anything we might later need. *)
                    
                    val ordered = stronglyConnected arguments

                    fun isFPReg(GenReg _) = false
                    |   isFPReg(XMMReg _) = true
                    |   isFPReg(FPReg _) = true
                    
                    (* Process each component in dependency order.  A singleton
                       component is a straightforward move; a component with two
                       or more entries is a cycle that must be broken. *)
                    fun moveEachValue ([], code) = code

                    |   moveEachValue ([{dst=RegDest reg, src as OtherSource(RegisterArg r)}] :: rest, code) =
                            (* Source and dest are both regs - only move if they're different. *)
                            if r = reg
                            then moveEachValue(rest, code)
                            else moveEachValue(rest,
                                    llLoadArgument({source=sourceToX86Code src, dest=reg, kind=if isFPReg reg then MoveDouble else moveNativeWord}, code))

                    |   moveEachValue ([{dst=RegDest reg, src as StackSource _}] :: rest, code) =
                            (* If loading from the stack always use native word.  The value could be a stack address. *)
                            moveEachValue(rest, llLoadArgument({source=sourceToX86Code src, dest=reg, kind=moveNativeWord}, code))

                    |   moveEachValue ([{dst=RegDest reg, src}] :: rest, code) =
                            (* Load from store or a constant.  Have to use movePolyWord if it's an address constant. *)
                            moveEachValue(rest, llLoadArgument({source=sourceToX86Code src, dest=reg, kind=movePolyWord}, code))

                    |   moveEachValue ([{dst=StackDest _, src=OtherSource(MemoryArg _ )}] :: _, _) =
                            raise InternalError "moveEachValue - MemoryArgument"

                    |   moveEachValue ([{dst=StackDest addr, src as StackSource wordOffset}] :: rest, code) =
                            (* Copy a stack location - needs a load and store unless the address is the same. *)
                            if addr = wordOffset
                            then moveEachValue(rest, code)
                            else
                            let
                                val workReg = valOf workReg
                            in
                                (* Load into the work register, then store it back
                                   to the destination slot (remember: reversed order). *)
                                moveEachValue(rest,
                                    llStoreArgument{source=RegisterArg workReg, base=esp, index=NoIndex,
                                                offset = addr*Word.toInt nativeWordSize, kind=moveNativeWord} ::
                                       llLoadArgument({source=sourceToX86Code src, dest=workReg, kind=moveNativeWord}, code))
                            end

                    |   moveEachValue ([{dst=StackDest addr, src}] :: rest, code) =
                            (* Store from a register or a constant. *)
                            moveEachValue(rest,
                                llStoreArgument{
                                    source=sourceToX86Code src, base=esp, index=NoIndex, offset = addr*Word.toInt nativeWordSize, kind=moveNativeWord} :: code)

                    |   moveEachValue((cycle as first :: _ :: _) :: rest, code) =
                        (* We have a cycle. *)
                        let
                            (* We need to exchange some of the arguments.  Doing an exchange here will
                               set the destination with the correct source.  However we have to process
                               every subsequent entry with the swapped registers.  That may well mean that
                               one of those entries becomes trivial.  Using XCHG means that we can move
                               N registers in N-1 exchanges.
                               We also need to rerun stronglyConnectedComponents on at least the rest of
                               this cycle.  It's easiest to flatten the rest and do everything. *)
                            (* Try to find either a register-register move or a register-stack move.
                               If not use the first.  If there's a stack-register move there will
                               also be a register-stack so we don't need to look for both. *)
                            val {dst=selectDst, src=selectSrc} =
                                case List.find(fn {src=OtherSource(RegisterArg _), dst=RegDest _} => true | _ => false) cycle of
                                    SOME found => found
                                |   _ =>
                                    (
                                        case List.find(fn {dst=RegDest _, ...} => true | _ => false) cycle of
                                            SOME found => found
                                        |   NONE => first
                                    )
                            (* This includes this entry but after the swap we'll eliminate it. *)
                            val flattened = List.foldl(fn (a, b) => a @ b) [] (cycle :: rest)
                            val destAsSource =
                                case selectDst of
                                    RegDest reg => OtherSource(RegisterArg reg)
                                |   StackDest s => StackSource s

                            (* Source is not an equality type.  We can't currently handle the
                               situation where the source is a memory location. *)
                            fun match(OtherSource(RegisterArg r1), OtherSource(RegisterArg r2)) = r1 = r2
                            |   match(StackSource s1, StackSource s2) = s1 = s2
                            |   match(OtherSource(MemoryArg _), _) = raise InternalError "moveEachValue: cycle"
                            |   match _ = false
                            
                            (* Rewrite the remaining moves to account for the
                               exchange: the two swapped locations trade roles. *)
                            fun swapSources{src, dst} =
                                if match(src, selectSrc) then {src=destAsSource, dst=dst}
                                else if match(src, destAsSource) then {src=selectSrc, dst=dst}
                                else {src=src, dst=dst}
                            (* Try to use register to register exchange if we can.
                               A register-to-memory exchange involves a bus lock and we'd
                               like to avoid that. *)
                            val exchangeCode =
                                case (selectDst, selectSrc) of
                                    (RegDest(GenReg regA), OtherSource(RegisterArg(GenReg regB))) =>
                                        XChng { reg=regA, arg=RegisterArg regB, opSize=nativeWordOpSize } :: code

                                |   (RegDest(XMMReg regA), OtherSource(RegisterArg(XMMReg regB))) =>
                                        (* This is the only case where we can have a cycle with SSE2 regs.
                                           There are various ways of doing it but XORs are probably the easiest.
                                           Three XORs swap the two registers without a scratch register. *)
                                        XMMArith{opc=SSE2Xor, source=RegisterArg regA, output=regB} ::
                                        XMMArith{opc=SSE2Xor, source=RegisterArg regB, output=regA} ::
                                        XMMArith{opc=SSE2Xor, source=RegisterArg regA, output=regB} :: code

                                |   (RegDest _, OtherSource(RegisterArg _)) =>
                                        raise InternalError "moveEachValue: invalid register combination"
                                        
                                |   (RegDest regA, src as StackSource addr) =>
                                    let
                                        val workReg = valOf workReg
                                    in
                                        (* Load slot into work reg, XCHG with the
                                           destination reg, store work reg back
                                           to the slot (reversed order below). *)
                                        llStoreArgument{source=RegisterArg workReg, base=esp, index=NoIndex,
                                                offset = addr*Word.toInt nativeWordSize, kind=moveNativeWord} ::
                                            XChng { reg=asGenReg regA, arg=RegisterArg(asGenReg workReg), opSize=nativeWordOpSize } ::
                                                llLoadArgument({source=sourceToX86Code src, dest=workReg, kind=moveNativeWord}, code)
                                    end

                                |   (StackDest addr, OtherSource(RegisterArg regA)) =>
                                    let
                                        (* This doesn't actually occur because we always find the case above. *)
                                        val workReg = valOf workReg
                                    in
                                        llStoreArgument{source=RegisterArg workReg, base=esp, index=NoIndex,
                                                offset = addr*Word.toInt nativeWordSize, kind=moveNativeWord} ::
                                            XChng { reg=asGenReg regA, arg=RegisterArg (asGenReg workReg), opSize=nativeWordOpSize } ::
                                                llLoadArgument({
                                                    source=MemoryArg{base=esp, offset=addr*Word.toInt nativeWordSize, index=NoIndex}, dest=workReg, kind=moveNativeWord}, code)
                                    end

                                |   (StackDest addr1, StackSource addr2) =>
                                    let
                                        val workReg = valOf workReg
                                        (* This can still happen if we have argument registers that need to be
                                           loaded from stack locations and those argument registers happen to
                                           contain the values to be stored into those stack locations.
                                           e.g. ebx => S8; eax => S7; S8 => eax; S7 => eax.
                                           Eliminating the registers results in a cycle.
                                           It may be possible to avoid this by excluding the argument
                                           registers (eax; ebx; r8; r9; r10) from holding values in the
                                           area to be overwritten. *)
                                    in
                                        llStoreArgument{source=RegisterArg workReg, base=esp, index=NoIndex,
                                                offset = addr1*Word.toInt nativeWordSize, kind=moveNativeWord} ::
                                            XChng { reg=asGenReg workReg,
                                                arg=MemoryArg{base=esp, offset=addr2*Word.toInt nativeWordSize, index=NoIndex},
                                                opSize=nativeWordOpSize } ::
                                                llLoadArgument({
                                                    source=MemoryArg{base=esp, offset=addr1*Word.toInt nativeWordSize, index=NoIndex}, dest=workReg, kind=moveNativeWord}, code)
                                    end
                                
                                |   _ => raise InternalError "moveEachValue: cycle"
                                    
                        in
                            (* Restart with the rewritten move set; the exchange has
                               satisfied (at least) the selected entry. *)
                            moveValues(List.map swapSources flattened, exchangeCode)
                        end

                    |   moveEachValue(([]) :: _, _) = (* This should not happen - avoid warning. *)
                            raise InternalError "moveEachValue - empty set"
                in
                    moveEachValue(ordered, code)
                end
        in
            moveValues(moves, code)
        end
476
477        (* Where we have multiple specific registers as either source or
478           destination there is the potential that a destination register
479           if currently in use as a source. *) 
480        fun moveMultipleRegisters(regPairList, code) =
481        let
482            val regPairsAsDests =
483                List.map(fn {src, dst} => {src=OtherSource(RegisterArg src), dst=RegDest dst}) regPairList
484        in
485            moveMultipleValues(regPairsAsDests, NONE, code)
486        end
487
        (* Next output label number to allocate. *)
        val outputLabelCount = ref 0
        (* Label number assigned to each ICode block; ~1 means not yet assigned. *)
        val blockToLabelMap = Array.array(numBlocks, ~1)
490
491        fun makeLabel() = Label{labelNo = ! outputLabelCount} before outputLabelCount := !outputLabelCount + 1
492       
        (* Return the label for an ICode block, allocating one on first use and
           recording it so every later reference yields the same label. *)
        fun getBlockLabel blockNo =
            case Array.sub(blockToLabelMap, blockNo) of
                ~1 =>
                let
                    val label as Label{labelNo} = makeLabel()
                    val () = Array.update(blockToLabelMap, blockNo, labelNo)
                in label end
            |   n => Label{labelNo=n}
501
        (* The profile object is a single mutable with the F_bytes bit set. *)
        local
            (* Allocate one word of mutable byte data and zero it.
               NOTE(review): presumably the run-time updates this cell with
               profiling information for allocations attributed to this
               function - confirm against the RTS. *)
            val v = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes))))
            fun clear 0w0 = ()
            |   clear i = (assignByte(v, i-0w1, 0w0); clear (i-0w1))
            val () = clear wordSize
        in
            val profileObject = toMachineWord v
        end
        (* Switch to indicate if we want to trace where live data has been allocated. *)
        val addAllocatingFunction =
            DEBUG.getParameter DEBUG.profileAllocationTag debugSwitches = 1
514
        (* Emit a heap allocation of [size] words with the given flags byte,
           leaving the result in [dest], prepended to [code].  [saveRegs] are
           preserved across the allocation (which may trigger a GC).  As
           elsewhere the instruction lists are in reverse order. *)
        fun llAllocateMemoryOperation ({ size, flags, dest, saveRegs}, code) =
        let
            val toReg = asGenReg dest
            val preserve = saveRegs

            (* Allocate memory.  N.B. Instructions are in reverse order:
               AllocStore executes first, then the length word (and flags byte on
               Native64Bit) is written just below the allocated cell. *)
            fun allocStore{size, flags, output, preserve} =
            if targetArch = Native64Bit andalso flags <> 0w0
            then
                [Move{moveSize=Move8, source=NonAddressConstArg(Word8.toLargeInt flags), destination=MemoryArg {offset= ~1, base=output, index=NoIndex}},
                 Move{source=NonAddressConstArg(LargeInt.fromInt size),
                      destination=MemoryArg {offset= ~ (Word.toInt wordSize), base=output, index=NoIndex},
                      moveSize=opSizeToMove polyWordOpSize},
                 AllocStore{size=size, output=output, saveRegs=preserve}]
            else
            let
                (* On 32-bit targets the flags share the length word: the flags
                   byte occupies the top byte (shifted in above bit 24). *)
                val lengthWord = IntInf.orb(IntInf.fromInt size, IntInf.<<(Word8.toLargeInt flags, 0w24))
            in
                [Move{source=NonAddressConstArg lengthWord, destination=MemoryArg {offset= ~ (Word.toInt wordSize), base=output, index=NoIndex},
                      moveSize=opSizeToMove polyWordOpSize},
                 AllocStore{size=size, output=output, saveRegs=preserve}]
            end

            val allocCode =
                (* If we need to add the profile object we allocate one extra
                   word, set F_profile and store the profile object in the last
                   word of the cell. *)
                if addAllocatingFunction
                then
                    allocStore {size=size+1, flags=Word8.orb(flags, Address.F_profile), output=toReg, preserve=preserve} @
                        [Move{moveSize=opSizeToMove polyWordOpSize, source=AddressConstArg profileObject,
                              destination=MemoryArg {base=toReg, offset=size*Word.toInt wordSize, index=NoIndex}}]
                else allocStore {size=size, flags=flags, output=toReg, preserve=preserve}
            
            (* Convert to an object index if necessary: subtract the heap base
               (in ebx) and divide by 4.  Prepended, so this runs last. *)
            val convertToObjId =
                if targetArch = ObjectId32Bit
                then [ ShiftConstant{ shiftType=SHR, output=toReg, shift=0w2, opSize=OpSize64 },
                       ArithToGenReg{ opc=SUB, output=toReg, source=RegisterArg ebx, opSize=nativeWordOpSize } ]
                else []
        in
            convertToObjId @ allocCode @ code
        end
556
557        (* Check the stack limit "register".  This is used both at the start of a function for genuine
558           stack checking but also in a loop to check for an interrupt.  We need to save the registers
559           even across an interrupt because it can be used if another thread wants a GC. *)
560        fun testRegAndTrap(reg, entryPt, saveRegs) =
561        let
562            (* Normally we won't have a stack overflow so we will skip the check. *)
563            val skipCheckLab = makeLabel()
564        in
565            (* Need it in reverse order. *)
566            [
567                JumpLabel skipCheckLab,
568                CallRTS{rtsEntry=entryPt, saveRegs=saveRegs},
569                ConditionalBranch{test=JNB, label=skipCheckLab},
570                ArithToGenReg{ opc=CMP, output=reg, source=MemoryArg{offset=memRegStackLimit, base=ebp, index=NoIndex}, opSize=nativeWordOpSize }
571            ]
572        end
573        
        (* Compute, for every pseudo-register, whether its value is actually
           required.  Only "isUsed" is exported.  The results allow dead
           LoadArgument/TagValue/CopyToCache/UntagValue/BoxValue instructions
           to be dropped and restrict the registers saved across calls. *)
        local
            val numRegisters = Vector.length allocatedRegisters
            (* One flag per pseudo-register: true once a use has been seen. *)
            val uses = Array.array(numRegisters, false)
            fun used(PReg r) = Array.update(uses, r, true)
            fun isUsed(PReg r) = Array.sub(uses, r)

            (* Set the registers used by the sources.  This differs from getInstructionState in that we don't set
               the base register of a memory location to "used" if we can use the cache. *)
            fun argUses(RegisterArgument rarg) = used rarg
            |   argUses(MemoryLocation { cache=SOME cr, ...}) = used cr
            |   argUses(MemoryLocation { base, index, cache=NONE, ...}) = (used base; indexUses index)
            |   argUses(StackLocation { cache=SOME rarg, ...}) = used rarg
            |   argUses _ = ()
    
            (* Mark any register that forms part of a memory index. *)
            and indexUses NoMemIndex = ()
            |   indexUses(MemIndex1 arg) = used arg
            |   indexUses(MemIndex2 arg) = used arg
            |   indexUses(MemIndex4 arg) = used arg
            |   indexUses(MemIndex8 arg) = used arg
            |   indexUses ObjectIndex = ()

            (* LoadArgument, TagValue, CopyToCache, UntagValue and BoxValue are eliminated if their destination
               is not used.  In that case their source are not used either. *)
            fun instructionUses(LoadArgument { source, dest, ...}) = if isUsed dest then argUses source else ()
            |   instructionUses(StoreArgument{ source, base, index, ...}) = (argUses source; used base; indexUses index)
            |   instructionUses(LoadMemReg _) = ()
            |   instructionUses(StoreMemReg {source, ...}) = used source
            |   instructionUses(BeginFunction _) = ()
            |   instructionUses(FunctionCall{regArgs, stackArgs, ...}) = (List.app(argUses o #1) regArgs; List.app argUses stackArgs)
            |   instructionUses(TailRecursiveCall{regArgs, stackArgs, ...}) = (List.app(argUses o #1) regArgs; List.app(argUses o #src) stackArgs)
            |   instructionUses(AllocateMemoryOperation _) = ()
            |   instructionUses(AllocateMemoryVariable{size, ...}) = used size
            |   instructionUses(InitialiseMem{size, addr, init}) = (used size; used addr; used init)
            |   instructionUses(InitialisationComplete) = ()
            |   instructionUses(BeginLoop) = ()
            |   instructionUses(JumpLoop{regArgs, stackArgs, ...}) = (List.app(argUses o #1) regArgs; List.app(argUses o #1) stackArgs)
            |   instructionUses(RaiseExceptionPacket{packetReg}) = used packetReg
            |   instructionUses(ReserveContainer _) = ()
            |   instructionUses(IndexedCaseOperation{testReg, ...}) = used testReg
            |   instructionUses(LockMutable{addr}) = used addr
            |   instructionUses(WordComparison{arg1, arg2, ...}) = (used arg1; argUses arg2)
            |   instructionUses(CompareLiteral{arg1, ...}) = argUses arg1
            |   instructionUses(CompareByteMem{arg1={base, index, ...}, ...}) = (used base; indexUses index)
            |   instructionUses(PushExceptionHandler _) = ()
            |   instructionUses(PopExceptionHandler _) = ()
            |   instructionUses(BeginHandler _) = ()
            |   instructionUses(ReturnResultFromFunction{resultReg, ...}) = used resultReg
            |   instructionUses(ArithmeticFunction{operand1, operand2, ...}) = (used operand1; argUses operand2)
            |   instructionUses(TestTagBit{arg, ...}) = argUses arg
            |   instructionUses(PushValue {arg, ...}) = argUses arg
            |   instructionUses(CopyToCache{source, dest, ...}) = if isUsed dest then used source else ()
            |   instructionUses(ResetStackPtr _) = ()
            |   instructionUses(StoreToStack {source, ...}) = argUses source
            |   instructionUses(TagValue{source, dest, ...}) = if isUsed dest then used source else ()
            |   instructionUses(UntagValue{dest, cache=SOME cacheR, ...}) = if isUsed dest then used cacheR else ()
            |   instructionUses(UntagValue{source, dest, cache=NONE, ...}) = if isUsed dest then used source else ()
            |   instructionUses(LoadEffectiveAddress{base, index, ...}) = (case base of SOME bReg => used bReg | NONE => (); indexUses index)
            |   instructionUses(ShiftOperation{operand, shiftAmount, ...}) = (used operand; argUses shiftAmount)
            |   instructionUses(Multiplication{operand1, operand2, ...}) = (used operand1; argUses operand2)
            |   instructionUses(Division{dividend, divisor, ...}) = (used dividend; argUses divisor)
            |   instructionUses(AtomicExchangeAndAdd{base, source}) = (used base; used source)
            |   instructionUses(BoxValue{source, dest, ...}) = if isUsed dest then used source else ()
            |   instructionUses(CompareByteVectors{vec1Addr, vec2Addr, length, ...}) = (used vec1Addr; used vec2Addr; used length)
            |   instructionUses(BlockMove{srcAddr, destAddr, length, ...}) = (used srcAddr; used destAddr; used length)
            |   instructionUses(X87Compare{arg1, arg2, ...}) = (used arg1; argUses arg2)
            |   instructionUses(SSE2Compare{arg1, arg2, ...}) = (used arg1; argUses arg2)
            |   instructionUses(X87FPGetCondition _) = ()
            |   instructionUses(X87FPArith{arg1, arg2, ...}) = (used arg1; argUses arg2)
            |   instructionUses(X87FPUnaryOps{source, ...}) = used source
            |   instructionUses(X87Float{source, ...}) = argUses source
            |   instructionUses(SSE2Float{source, ...}) = argUses source
            |   instructionUses(SSE2FPUnary{source, ...}) = argUses source
            |   instructionUses(SSE2FPBinary{arg1, arg2, ...}) = (used arg1; argUses arg2)
            |   instructionUses(TagFloat{source, dest, ...}) = if isUsed dest then used source else ()
            |   instructionUses(UntagFloat{dest, cache=SOME cacheR, ...}) = if isUsed dest then used cacheR else ()
            |   instructionUses(UntagFloat{source, dest, cache=NONE, ...}) = if isUsed dest then argUses source else ()
            |   instructionUses(GetSSE2ControlReg _) = ()
            |   instructionUses(SetSSE2ControlReg{source}) = used source
            |   instructionUses(GetX87ControlReg _) = ()
            |   instructionUses(SetX87ControlReg{source}) = used source
            |   instructionUses(X87RealToInt{source, ...}) = used source
            |   instructionUses(SSE2RealToInt{source, ...}) = argUses source
            |   instructionUses(SignExtend32To64{source, dest}) = if isUsed dest then argUses source else ()
            |   instructionUses(TouchArgument{source}) = used source

            (* Depth-first scan. *)
            val visited = Array.array(numBlocks, false)

            (* Visit all the successor blocks first and then scan this block's
               instructions from last to first, so that whether a destination
               is used is known before the instruction that defines it is
               examined. *)
            fun processBlocks blockNo =
            if Array.sub(visited, blockNo)
            then ()  (* Done or currently being done. *)
            else
            let
                val () = Array.update(visited, blockNo, true)
                val ExtendedBasicBlock { flow, block,...} = Vector.sub(blocks, blockNo)
                val () =
                    (* Process the dependencies first. *)
                    case flow of
                        ExitCode => ()
                    |   Unconditional m => processBlocks m
                    |   Conditional {trueJump, falseJump, ...} =>
                            (processBlocks trueJump; processBlocks falseJump)
                    |   IndexedBr cases => List.app processBlocks cases
                    |   SetHandler{ handler, continue } =>
                            (processBlocks handler; processBlocks continue)
                    |   UnconditionalHandle _ => ()
                    |   ConditionalHandle { continue, ...} => processBlocks continue
                (* Now this block. *)
            in
                (* foldr evaluates the innermost application first, i.e. the
                   last instruction in the block is processed first. *)
                List.foldr(fn ({instr, ...}, ()) => instructionUses instr) () block
            end

        in
            (* Mark everything reachable from the entry block. *)
            val () = processBlocks 0
            val isUsed = isUsed
        end
690        
691        (* Return the register part of a cached item. *)
692        fun decache(StackLocation{cache=SOME r, ...}) = RegisterArgument r
693        |   decache(MemoryLocation{cache=SOME r, ...}) = RegisterArgument r
694        |   decache arg = arg
695        
696        (* Only get the registers that are actually used. *)
697        val getSaveRegs = List.mapPartial(fn (reg as PReg r) => if isUsed reg then SOME(getAllocatedGenReg r) else NONE)
698                
699        fun codeExtended _ ({instr=LoadArgument{source, dest as PReg dreg, kind}, ...}, code) =
700            if not (isUsed dest)
701            then code
702            else
703            let
704                val realDestReg = getAllocatedReg dreg
705            in
706                case source of
707                    RegisterArgument(PReg sreg) =>
708                    (* Register to register move.  Try to use the same register for the source as the destination
709                       to eliminate the instruction. *)
710                        (* If the source is the same as the destination we don't need to do anything. *)
711                        moveIfNecessary({src=getAllocatedReg sreg, dst=realDestReg, kind=kind}, code)
712
713                |   MemoryLocation{cache=SOME(PReg sreg), ...} =>
714                    (* This is also a register to register move but because the original load is from
715                       memory it could be a byte or short precision value. *)
716                    let
717                        val moveKind =
718                            case kind of
719                                Move64Bit => Move64Bit
720                            |   MoveByte => Move32Bit
721                            |   Move16Bit => Move32Bit
722                            |   Move32Bit => Move32Bit
723                            |   MoveFloat => MoveFloat
724                            |   MoveDouble => MoveDouble
725                    in
726                        moveIfNecessary({src=getAllocatedReg sreg, dst=realDestReg, kind=moveKind}, code)
727                    end
728                
729                    (* TODO: Isn't this covered by codeExtArgument?  It looks like it was added in the
730                       32-in-64 changes.  *)
731                |   StackLocation{cache=SOME(PReg sreg), ...} =>
732                        moveIfNecessary({src=getAllocatedReg sreg, dst=realDestReg, kind=kind}, code)
733
734                |   source as StackLocation _ => (* Always use native loads from the stack. *)
735                        llLoadArgument({source=codeExtArgument source, dest=realDestReg, kind=moveNativeWord}, code)
736
737                |   source => (* Loads of constants or from an address. *)
738                        llLoadArgument({source=codeExtArgument source, dest=realDestReg, kind=kind}, code)
739            end
740
741        |   codeExtended _ ({instr=StoreArgument{ source, base=PReg bReg, offset, index, kind, ... }, ...}, code) =
742            let
743                val (baseReg, indexVal) =
744                    case index of
745                        ObjectIndex => (ebx, Index4(getAllocatedGenReg bReg))
746                    |   _ => (getAllocatedGenReg bReg, codeExtIndex index)
747            in
748                case (decache source, kind) of
749                    (RegisterArgument(PReg sReg), MoveByte) =>
750                    if targetArch <> Native32Bit
751                    then
752                        llStoreArgument{
753                            source=codeExtArgument source, base=baseReg, offset=offset, index=indexVal, kind=MoveByte} :: code
754                    else
755                    (* This is complicated on X86/32.  We can't use edi or esi for the store registers.  Instead
756                       we reserve ecx (see special case in "identify") and use that if we have to. *)
757                    let
758                        val realStoreReg = getAllocatedReg sReg
759                        val (moveCode, storeReg) =
760                            if realStoreReg = GenReg edi orelse realStoreReg = GenReg esi
761                            then (moveIfNecessary({src=realStoreReg, dst=GenReg ecx, kind=moveNativeWord}, code), GenReg ecx)
762                            else (code, realStoreReg)
763                    in
764                        llStoreArgument{
765                            source=RegisterArg storeReg, base=baseReg, offset=offset, index=indexVal, kind=MoveByte} ::
766                                moveCode
767                    end
768                    
769                |   _ =>
770                        llStoreArgument{
771                            source=codeExtArgument source, base=baseReg, offset=offset, index=indexVal, kind=kind} :: code
772            end
773
774        |   codeExtended _ ({instr=LoadMemReg { offset, dest=PReg pr, kind}, ...}, code) =
775            (* Load from the "memory registers" pointed at by rbp. *)
776                llLoadArgument({source=MemoryArg{base=rbp, offset=offset, index=NoIndex}, dest=getAllocatedReg pr, kind=kind}, code)
777
778        |   codeExtended _ ({instr=StoreMemReg { offset, source=PReg pr, kind}, ...}, code) =
779                (* Store into the "memory register". *)
780                llStoreArgument{
781                    source=RegisterArg(getAllocatedReg pr), base=rbp, offset=offset, index=NoIndex, kind=kind} ::
782                        code
783
784        |   codeExtended _ ({instr=BeginFunction{regArgs, ...}, ...}, code) =
785            let
786                val minStackCheck = 20
787                val saveRegs = List.mapPartial(fn (_, GenReg r) => SOME r | _ => NONE) regArgs
788                val preludeCode =
789                    if stackRequired >= minStackCheck
790                    then
791                    let
792                        (* Compute the necessary amount in edi and compare that. *)
793                        val stackByteAdjust = ~ (Word.toInt nativeWordSize) * stackRequired
794                        val testEdiCode =
795                            testRegAndTrap (edi, StackOverflowCallEx, saveRegs)
796                    in
797                        (* N.B. In reverse order. *)
798                        testEdiCode @ [LoadAddress{output=edi, base=SOME esp, index=NoIndex, offset=stackByteAdjust, opSize=nativeWordOpSize}]
799                    end
800     
801                    else testRegAndTrap (esp, StackOverflowCall, saveRegs)
802
803                val usedRegs = List.filter (isUsed o #1) regArgs
804                fun mkPair(PReg pr, rr) = {src=rr,dst=getAllocatedReg pr}
805                val regPairs = List.map mkPair usedRegs
806            in
807                moveMultipleRegisters(regPairs, preludeCode @ code)
808            end
809
810        |   codeExtended _ ({instr=TailRecursiveCall{callKind, regArgs=oRegArgs, stackArgs=oStackArgs, stackAdjust, currStackSize, workReg=PReg wReg}, ...}, code) =
811            let
812                val regArgs = List.map (fn (arg, reg) => (decache arg, reg)) oRegArgs
813                and stackArgs = List.map(fn {src, stack } => {src=decache src, stack=stack}) oStackArgs
814                val workReg = getAllocatedReg wReg
815                
816                (* We must leave stack entries as stack entries for the moment. *)
817                fun codeArg(StackLocation{wordOffset, cache=NONE, ...}) = StackSource wordOffset
818                |   codeArg arg = OtherSource(codeExtArgument arg)
819
820                val extStackArgs = map (fn {stack, src} => {dst=StackDest(stack+currStackSize), src=codeArg src}) stackArgs
821                val extRegArgs = map (fn (a, r) => {src=codeArg a, dst=RegDest r}) regArgs
822
823                (* Tail recursive calls are complicated because we generally have to overwrite the existing stack.
824                   That means storing the arguments in the right order to avoid overwriting a
825                   value that we are using for a different argument. *)
826                fun codeTailCall(arguments: {dst: destinations, src: llsource} list, stackAdjust, code) =
827                if stackAdjust < 0
828                then
829                let
830                    (* If the function we're calling takes more arguments on the stack than the
831                       current function we will have to extend the stack.  Do that by pushing the
832                       argument whose offset is at -1.  Then adjust all the offsets and repeat. *)
833                    val {src=argM1, ...} = valOf(List.find(fn {dst=StackDest ~1, ...} => true | _ => false) arguments)
834                    fun renumberArgs [] = []
835                    |   renumberArgs ({dst=StackDest ~1, ...} :: args) = renumberArgs args (* Remove the one we've done. *)
836                    |   renumberArgs ({dst, src} :: args) =
837                        let
838                            val newDest = case dst of StackDest d => StackDest(d+1) | regDest => regDest
839                            val newSrc =
840                                case src of
841                                    StackSource wordOffset => StackSource(wordOffset+1)
842                                |   other => other
843                        in
844                            {dst=newDest, src=newSrc} :: renumberArgs args
845                        end
846                in
847                    codeTailCall(renumberArgs arguments, stackAdjust+1,
848                        PushToStack(sourceAsGenRegOrMem(sourceToX86Code argM1)) :: code)
849                end
850                else
851                let
852                    val loadArgs = moveMultipleValues(arguments, SOME workReg, code)
853                in
854                    if stackAdjust = 0
855                    then loadArgs
856                    else ResetStack{numWords=stackAdjust, preserveCC=false} :: loadArgs
857                end
858            in
859                JumpAddress(codeCallKind callKind) ::
860                    codeTailCall(extStackArgs @ extRegArgs, stackAdjust+currStackSize, code)
861            end
862
863        |   codeExtended _ ({instr=FunctionCall{callKind, regArgs=oRegArgs, stackArgs=oStackArgs, dest=PReg dReg, realDest, saveRegs}, ...}, code) =
864            let
865                val regArgs = List.map (fn (arg, reg) => (decache arg, reg)) oRegArgs
866                and stackArgs = List.map decache oStackArgs
867                
868                val destReg = getAllocatedReg dReg
869                
870                
871                fun pushStackArgs ([], _, code) = code
872                
873                |   pushStackArgs (ContainerAddr {stackOffset, ...} ::args, argNum, code) =
874                    let
875                        val adjustedAddr = stackOffset+argNum
876                        (* If there is an offset relative to rsp we need to add this in. *)
877                        val addOffset =
878                            if adjustedAddr = 0
879                            then []
880                            else [ArithMemConst{opc=ADD, address={offset=0, base=esp, index=NoIndex},
881                                        source=LargeInt.fromInt(adjustedAddr*Word.toInt nativeWordSize), opSize=nativeWordOpSize}]
882                    in
883                        pushStackArgs(args, argNum+1, addOffset @ PushToStack(RegisterArg esp) :: code)
884                    end
885                    
886                |   pushStackArgs (StackLocation {wordOffset, container, field, ...} ::args, argNum, code) =
887                    let
888                        (* Have to adjust the offsets of stack arguments. *)
889                        val adjusted =
890                            StackLocation{wordOffset=wordOffset+argNum, container=container, field=field+argNum,
891                                                  cache=NONE}
892                    in
893                        pushStackArgs(args, argNum+1, PushToStack(codeExtArgumentAsGenReg adjusted) :: code)
894                    end
895
896                |   pushStackArgs (arg::args, argNum, code) =
897                        pushStackArgs(args, argNum+1, PushToStack(codeExtArgumentAsGenReg arg) :: code)
898
899                val pushedArgs = pushStackArgs(stackArgs, 0, code (* Initial code *))
900                (* We have to adjust any stack offset to account for the arguments we've pushed. *)
901                val numStackArgs = List.length stackArgs
902                
903                (* We don't currently allow the arguments to be memory locations and instead
904                   force them into registers.  That may be simpler especially if we can get the
905                   values directly into the required register. *)
906                fun getRegArgs(RegisterArgument(PReg pr), reg) =
907                        SOME{dst=reg, src=getAllocatedReg pr}
908                |   getRegArgs(StackLocation {cache=SOME(PReg pr), ...}, reg) =
909                        SOME{dst=reg, src=getAllocatedReg pr}
910                |   getRegArgs(MemoryLocation _, _) = raise InternalError "FunctionCall - MemoryLocation"
911                |   getRegArgs _ = NONE
912                
913                val loadRegArgs =
914                    moveMultipleRegisters(List.mapPartial getRegArgs regArgs, pushedArgs)
915
916                (* These are all items we can load without requiring a source register.
917                   That includes loading from the stack. *)
918                fun getConstArgs((AddressConstant m, reg), code) =
919                        llLoadArgument({source=AddressConstArg m, dest=reg, kind=movePolyWord}, code)
920                |   getConstArgs((IntegerConstant i, reg), code) =
921                        llLoadArgument({source=NonAddressConstArg i, dest=reg, kind=movePolyWord}, code)
922                |   getConstArgs((StackLocation { cache=SOME _, ...}, _), code) = code
923                |   getConstArgs((StackLocation { wordOffset, ...}, reg), code) =
924                        llLoadArgument({source=MemoryArg{offset=(wordOffset+numStackArgs)*Word.toInt nativeWordSize, base=esp, index=NoIndex},
925                                          dest=reg, kind=moveNativeWord}, code)
926                |   getConstArgs((ContainerAddr {stackOffset, ...}, reg), code) =
927                        if stackOffset+numStackArgs = 0
928                        then llLoadArgument({source=RegisterArg(GenReg esp), dest=reg, kind=moveNativeWord}, code)
929                        else LoadAddress{ output=asGenReg reg, offset=(stackOffset+numStackArgs)*Word.toInt nativeWordSize, base=SOME esp,
930                                          index=NoIndex, opSize=nativeWordOpSize } :: code
931                |   getConstArgs((RegisterArgument _, _), code) = code
932                |   getConstArgs((MemoryLocation _, _), code) = code
933                val loadConstArgs = List.foldl getConstArgs loadRegArgs regArgs
934                
935                (* Push the registers before the call and pop them afterwards. *)
936                fun makeSaves([], code) = CallAddress(codeCallKind callKind) :: code
937                |   makeSaves(PReg reg::regs, code) =
938                    let
939                        val areg = getAllocatedGenReg reg
940                        val _ = areg = eax andalso raise InternalError "codeExtended: eax in save regs"
941                        val _ = if List.exists(fn (_, r) => r = GenReg areg) regArgs then raise InternalError "codeExtended: arg reg in save regs" else ()
942                    in
943                        PopR areg :: makeSaves(regs, PushToStack(RegisterArg areg) :: code)
944                    end
945
946            in
947                moveIfNecessary({dst=destReg, src=realDest, kind=case realDest of GenReg _ => moveNativeWord | _ => MoveDouble},
948                    makeSaves(saveRegs, loadConstArgs)) 
949            end
950
951        |   codeExtended _ ({instr=AllocateMemoryOperation{ size, flags, dest=PReg dReg, saveRegs}, ...}, code) =
952            let
953                val preserve = getSaveRegs saveRegs
954            in
955                llAllocateMemoryOperation({ size=size, flags=flags, dest=getAllocatedReg dReg, saveRegs=preserve}, code)
956            end
957
958        |   codeExtended _ ({instr=AllocateMemoryVariable{size=PReg size, dest=PReg dest, saveRegs}, ...}, code) =
959            let
960                (* Simple case - no initialiser. *)
961                val saveRegs = getSaveRegs saveRegs
962                val sReg = getAllocatedGenReg size and dReg = getAllocatedGenReg dest
963                val _ = sReg <> dReg
964                            orelse raise InternalError "codeGenICode-AllocateMemoryVariable"
965
966                val allocCode =
967                [
968                    (* Store it as the length field. *)
969                    Move{source=RegisterArg sReg, moveSize=opSizeToMove polyWordOpSize,
970                         destination=MemoryArg {base=dReg, offset= ~ (Word.toInt wordSize), index=NoIndex}},
971                    (* Untag the length *)
972                    ShiftConstant{ shiftType=SHR, output=sReg, shift=0w1, opSize=polyWordOpSize},
973                    (* Allocate the memory *)
974                    AllocStoreVariable{ size=sReg, output=dReg, saveRegs=saveRegs}
975                ]
976                (* Convert to an object index if necessary. *)
977                val convertToObjId =
978                    if targetArch = ObjectId32Bit
979                    then [ ShiftConstant{ shiftType=SHR, output=dReg, shift=0w2, opSize=OpSize64 },
980                           ArithToGenReg{ opc=SUB, output=dReg, source=RegisterArg ebx, opSize=nativeWordOpSize } ]
981                    else []
982            in
983                convertToObjId @ allocCode @ code
984            end
985
986        |   codeExtended _ ({instr=InitialiseMem{size=PReg sReg, addr=PReg aReg, init=PReg iReg}, ...}, code) =
987                (* We are going to use rep stosl/q to set the memory.
988                   That requires the length to be in ecx, the initialiser to be in eax and
989                   the destination to be edi. *)
990                RepeatOperation (if polyWordOpSize = OpSize64 then STOS64 else STOS32)::
991                    moveIfNecessary({src=getAllocatedReg iReg, dst=GenReg eax, kind=moveNativeWord},
992                        moveIfNecessary({src=getAllocatedReg aReg, dst=GenReg edi, kind=moveNativeWord},
993                            moveIfNecessary({src=getAllocatedReg sReg, dst=GenReg ecx, kind=moveNativeWord}, code)))
994
995        |   codeExtended _ ({instr=InitialisationComplete, ...}, code) = StoreInitialised :: code
996
997        |   codeExtended _ ({instr=BeginLoop, ...}, code) = code
998
999        |   codeExtended _ ({instr=JumpLoop{regArgs, stackArgs, checkInterrupt, workReg}, ...}, code) =
1000            let
1001                val workReg = Option.map (fn PReg r => getAllocatedReg r) workReg
1002                (* TODO: Make the sources and destinations "friends". *)
1003                (* We must leave stack entries as stack entries for the moment as with TailCall. *)
1004                fun codeArg(StackLocation{wordOffset, ...}) = StackSource wordOffset
1005                |   codeArg arg = OtherSource(codeExtArgument arg)
1006                val extStackArgs = map (fn (src, stack, _) => {dst=StackDest stack, src=codeArg src}) stackArgs
1007                val extRegArgs = map (fn (a, PReg r) => {src=codeArg a, dst=RegDest(getAllocatedReg r)}) regArgs
1008                val checkCode =
1009                    case checkInterrupt of
1010                        NONE => []
1011                    |   SOME saveRegs => testRegAndTrap (esp, StackOverflowCall, getSaveRegs saveRegs)
1012            in
1013                checkCode @ moveMultipleValues(extStackArgs @ extRegArgs, workReg, code)
1014            end
1015
1016        |   codeExtended _ ({instr=RaiseExceptionPacket{ packetReg=PReg preg }, ...}, code) =
1017                (* We need a work register here.  It can be any register other than eax since
1018                   we don't preserve registers across calls. *)
1019                RaiseException { workReg=ecx } :: moveIfNecessary({src=getAllocatedReg preg, dst=GenReg eax, kind=moveNativeWord}, code)
1020
1021        |   codeExtended _ ({instr=ReserveContainer{size, ...}, ...}, code) =
1022                (* The memory must be cleared in case we have a GC. *)
1023                List.tabulate(size, fn _ => PushToStack(NonAddressConstArg(tag 0))) @ code
1024
        |   codeExtended {flow} ({instr=IndexedCaseOperation{testReg=PReg tReg, workReg=PReg wReg}, ...}, code) =
            let
                val testReg = getAllocatedReg tReg
                val workReg = getAllocatedReg wReg
                (* The index and work registers must be distinct: the work register
                   is overwritten while the index value is still needed. *)
                val _ = testReg <> workReg orelse raise InternalError "IndexedCaseOperation - same registers"
                val rReg = asGenReg testReg and wReg = asGenReg workReg
                (* NOTE(review): this repeats the check above on the unwrapped
                   general registers. *)
                val _ = rReg <> wReg orelse raise InternalError "IndexedCaseOperation - same registers"
                (* This should only be within a block with an IndexedBr flow type. *)
                val cases =
                    case flow of IndexedBr cases => cases | _ => raise InternalError "codeGenICode: IndexedCaseOperation"
                val caseLabels = map getBlockLabel cases
                val startJumpTable = makeLabel()
                (* Compute the jump address.  The index is a tagged
                   integer so it is already multiplied by 2.  We need to
                   multiply by four to get the correct size. Subtract off the
                   shifted tag. *)
                (* jumpSize is a shared mutable cell read by both IndexedJumpCalc and
                   JumpTable; presumably a later pass may update it to shrink the
                   table entries — TODO confirm. *)
                val jumpSize = ref JumpSize8
            in
                (* The list is in reverse execution order: first load the address of
                   the jump table, compute the indexed entry, jump to it; the table
                   itself follows the label. *)
                JumpTable{cases=caseLabels, jumpSize=jumpSize} :: JumpLabel startJumpTable :: JumpAddress(RegisterArg wReg) ::
                    IndexedJumpCalc{ addrReg=wReg, indexReg=rReg, jumpSize=jumpSize } ::
                    LoadLabelAddress{label=startJumpTable, output=wReg} :: code
            end
1047
        |   codeExtended _ ({instr=LockMutable{addr=PReg pr}, ...}, code) =
            let
                (* Make a mutable cell immutable by clearing its mutable flag.
                   In the 32-in-64 representation the cell address is formed as
                   heap base (ebx) plus the object id scaled by 4. *)
                val (bReg, index) =
                    if targetArch = ObjectId32Bit
                    then (ebx, Index4(asGenReg(getAllocatedReg pr)))
                    else (asGenReg(getAllocatedReg pr), NoIndex)
            in
                (* Mask off the mutable bit. *)
                (* The flags byte is immediately before the cell, hence offset ~1. *)
                ArithByteMemConst{opc=AND, address={base=bReg, offset= ~1, index=index}, source=0wxff - F_mutable} :: code
            end
1058
1059        |   codeExtended _ ({instr=WordComparison{ arg1=PReg pr, arg2, opSize, ... }, ...}, code) =
1060                ArithToGenReg {opc=CMP, output=asGenReg(getAllocatedReg pr), source=codeExtArgumentAsGenReg arg2, opSize=opSize} :: code
1061
        |   codeExtended _ ({instr=CompareLiteral{ arg1, arg2, opSize, ... }, ...}, code) =
            (* Compare an argument against a non-address constant.  The argument may
               be in a register, a memory location or a stack slot. *)
            (
                case decache arg1 of (* N.B. We MUST decache since we're assuming that the base reg is not used. *)
                    RegisterArgument(PReg pr) =>
                        ArithToGenReg {opc=CMP, output=asGenReg(getAllocatedReg pr), source=NonAddressConstArg arg2, opSize=opSize} :: code
                |   MemoryLocation{base=PReg br, offset, index=ObjectIndex, ...} =>
                        (* 32-in-64: address through the heap base in ebx with the
                           object id scaled by 4. *)
                        ArithMemConst{ opc=CMP,
                            address={offset=offset, base=ebx, index=Index4(asGenReg(getAllocatedReg br))}, source=arg2, opSize=opSize } :: code
                |   MemoryLocation{base=PReg br, index, offset, ...} =>
                        ArithMemConst{ opc=CMP,
                            address={offset=offset, base=asGenReg(getAllocatedReg br), index=codeExtIndex index}, source=arg2, opSize=opSize } :: code
                |   StackLocation{wordOffset, ...} =>
                        (* Stack slots are addressed in native words from esp. *)
                        ArithMemConst{ opc=CMP, address={offset=wordOffset*Word.toInt nativeWordSize, base=esp, index=NoIndex}, source=arg2, opSize=opSize } :: code
                |   _ => raise InternalError "CompareLiteral"
            )
1077
        |   codeExtended _ ({instr=CompareByteMem{ arg1={base=PReg br, offset, index}, arg2, ... }, ...}, code) =
            let
                (* Compare a single byte in memory against a constant. *)
                val (bReg, index) =
                    case index of
                        (* 32-in-64: the heap base in ebx is the base and the object
                           id (scaled by 4) is the index. *)
                        ObjectIndex => (ebx, Index4(asGenReg(getAllocatedReg br)))
                    |   _ => (asGenReg(getAllocatedReg br), codeExtIndex index)
            in
                ArithByteMemConst{ opc=CMP, address={offset=offset, base=bReg, index=index}, source=arg2 } :: code
            end
1087
            (* Set up an exception handler. *)
        |   codeExtended {flow} ({instr=PushExceptionHandler{workReg=PReg hReg}, ...}, code) =
            let (* Set up an exception handler. *)
                val workReg=getAllocatedReg hReg
                (* Although we're pushing this to the stack we need to use LEA on the
                   X86/64 and some arithmetic on the X86/32.  We need a work reg for that. *)
                val handleReg = asGenReg workReg
                (* This should only be within a block with a SetHandler flow type. *)
                val handleLabel =
                    case flow of
                        SetHandler{ handler, ...} => handler
                    |   _ => raise InternalError "codeGenICode: PushExceptionHandler"
                val labelRef = getBlockLabel handleLabel
                (* Set up the handler by pushing the old handler to the stack, pushing the
                   entry point and setting the handler address to the current stack pointer. *)
            in
                (* The list is in reverse execution order: push the old handler
                   (read from the memory-register area at ebp), load the handler
                   entry point and push it, then store the new esp as the current
                   handler address. *)
                (
                    Move{source=RegisterArg esp, destination=MemoryArg {offset=memRegHandlerRegister, base=ebp, index=NoIndex},
                         moveSize=opSizeToMove nativeWordOpSize} ::
                    PushToStack(RegisterArg handleReg) ::
                    LoadLabelAddress{ label=labelRef, output=handleReg} ::
                    PushToStack(MemoryArg{base=ebp, offset=memRegHandlerRegister, index=NoIndex}) :: code)
            end
1111
            (* Pop an exception handler at the end of a handled section.  Executed if no exception has been raised.
               This removes items from the stack. *)
        |   codeExtended _ ({instr=PopExceptionHandler{workReg=PReg wReg, ...}, ...}, code) =
            let
                val workReg = getAllocatedReg wReg
                val wReg = asGenReg workReg
            in
                (* The stack pointer has been adjusted to just above the two words that were stored
                   in PushExceptionHandler. *)
                (* Reverse execution order: discard the handler entry point word,
                   pop the saved previous handler into the work register, then
                   store it back as the current handler. *)
                (
                    Move{source=RegisterArg wReg, destination=MemoryArg {offset=memRegHandlerRegister, base=ebp, index=NoIndex},
                         moveSize=opSizeToMove nativeWordOpSize} ::
                    PopR wReg ::
                    ResetStack{numWords=1, preserveCC=false} :: code)
            end
1127
            (* Start of a handler.  Sets the address associated with PushExceptionHandler and
               provides a register for the packet.*) 
        |   codeExtended _ ({instr=BeginHandler{packetReg=PReg pReg, workReg=PReg wReg}, ...}, code) =
            let
                (* The exception packet is in rax. *)
                val realPktReg = getAllocatedReg pReg
                val realWorkreg = getAllocatedGenReg wReg
                (* The code here is almost the same as PopExceptionHandler.  The only real difference
                   is that PopExceptionHandler needs to pass the result of executing the handled code
                   which could be in any register.  This code needs to transmit the exception packet
                   and that is always in rax. *)
                (* Reverse execution order: restore esp from the saved handler
                   address, discard the entry point word, pop the previous handler
                   and store it back as the current handler. *)
                val beginHandleCode =
                    Move{source=RegisterArg realWorkreg, destination=MemoryArg {offset=memRegHandlerRegister, base=ebp, index=NoIndex},
                         moveSize=opSizeToMove nativeWordOpSize} ::
                    PopR realWorkreg :: ResetStack{numWords=1, preserveCC=false} ::
                    Move{ source=MemoryArg{base=ebp, offset=memRegHandlerRegister, index=NoIndex},
                          destination=RegisterArg esp, moveSize=opSizeToMove nativeWordOpSize } :: code
            in
                (* Finally move the packet out of rax into its allocated register. *)
                moveIfNecessary({src=GenReg eax, dst=realPktReg, kind=moveNativeWord }, beginHandleCode)
            end
1148
1149        |   codeExtended _ ({instr=ReturnResultFromFunction { resultReg=PReg resReg, realReg, numStackArgs }, ...}, code) =
1150            let
1151                val resultReg = getAllocatedReg resReg
1152                (* If for some reason it's not in the right register we have to move it there. *)
1153            in
1154                ReturnFromFunction numStackArgs :: moveIfNecessary({src=resultReg, dst=realReg, kind=moveNativeWord}, code)
1155            end
1156
        |   codeExtended _ ({instr=ArithmeticFunction{oper=SUB, resultReg=PReg resReg, operand1=PReg op1Reg,
                                            operand2, opSize, ...}, ...}, code) =
            (* Subtraction - this is special because it can only be done one way round.  The first argument must
               be in a register. *)
            let
                val realDestReg = getAllocatedReg resReg
                val realOp1Reg = getAllocatedReg op1Reg
            in
                (* Copy the first operand into the destination register if it is not
                   already there, then subtract the second operand in place. *)
                ArithToGenReg { opc=SUB, output=asGenReg realDestReg, source=codeExtArgumentAsGenReg operand2, opSize=opSize } ::
                    moveIfNecessary({src=realOp1Reg, dst=realDestReg, kind=opSizeToIMove opSize}, code)
            end
1168
        |   codeExtended _ ({instr=ArithmeticFunction{oper, resultReg=PReg resReg, operand1=PReg op1Reg, operand2, opSize, ...}, ...}, code) =
            (
                case decache operand2 of
                    RegisterArgument(PReg op2Reg) =>
                    (* Arithmetic operation with both arguments as registers.  These operations are all symmetric so
                       we can try to put either argument into the result reg and then do the operation on the other arg. *)
                    let
                        val realDestReg = getAllocatedGenReg resReg
                        val realOp1Reg = getAllocatedGenReg op1Reg
                        and realOp2Reg = getAllocatedGenReg op2Reg
                        (* Choose which operand acts as the in-place source: if one
                           operand already occupies the destination no move is
                           needed; otherwise copy operand1 into the destination. *)
                        val (operandReg, moveInstr) =
                            if realOp1Reg = realDestReg
                            then (realOp2Reg, code)
                            else if realOp2Reg = realDestReg
                            then (realOp1Reg, code)
                            else (realOp2Reg, Move{source=RegisterArg realOp1Reg, destination=RegisterArg realDestReg, moveSize=opSizeToMove opSize} :: code)
                    in
                        ArithToGenReg { opc=oper, output=realDestReg, source=RegisterArg operandReg, opSize=opSize } :: moveInstr
                    end

                |   operand2 =>
                    (* Arithmetic operation with the first argument in a register and the second a constant or memory location. *)
                    let
                        val realDestReg = getAllocatedReg resReg
                        val realOp1Reg = getAllocatedReg op1Reg
                        val op2Arg = codeExtArgumentAsGenReg operand2
                        (* If we couldn't put it in the result register we have to copy it there. *)
                    in
                        ArithToGenReg { opc=oper, output=asGenReg realDestReg, source=op2Arg, opSize=opSize } ::
                            moveIfNecessary({src=realOp1Reg, dst=realDestReg, kind=opSizeToIMove opSize}, code)
                    end
            )
1201
1202        |   codeExtended _ ({instr=TestTagBit{arg, ...}, ...}, code) =
1203                TestByteBits{arg=codeExtArgumentAsGenReg arg, bits=0w1} :: code
1204
1205        |   codeExtended _ ({instr=PushValue {arg, ...}, ...}, code) = PushToStack(codeExtArgumentAsGenReg arg) :: code
1206
1207        |   codeExtended _ ({instr=CopyToCache{source=PReg sreg, dest as PReg dreg, kind}, ...}, code) =
1208            if not (isUsed dest)
1209            then code
1210            else
1211            let
1212                val realDestReg = getAllocatedReg dreg
1213                (* Get the source register using the current destination as a preference. *)
1214                val realSrcReg = getAllocatedReg sreg
1215            in
1216                (* If the source is the same as the destination we don't need to do anything. *)
1217                moveIfNecessary({src=realSrcReg, dst=realDestReg, kind=kind}, code)
1218            end
1219
1220        |   codeExtended _ ({instr=ResetStackPtr {numWords, preserveCC}, ...}, code) =
1221            (
1222                numWords >= 0 orelse raise InternalError "codeGenICode: ResetStackPtr - negative offset";
1223                ResetStack{numWords=numWords, preserveCC=preserveCC} :: code
1224            )
1225
1226        |   codeExtended _ ({instr=StoreToStack{ source, stackOffset, ... }, ...}, code) =
1227                llStoreArgument{
1228                    source=codeExtArgument source, base=esp, offset=stackOffset*Word.toInt nativeWordSize, index=NoIndex, kind=moveNativeWord} :: code
1229
        |   codeExtended _ ({instr=TagValue{source=PReg srcReg, dest as PReg dReg, opSize, ...}, ...}, code) =
            if not (isUsed dest)
            then code
            else
            let
                val regResult = asGenReg(getAllocatedReg dReg)
                val realSReg = asGenReg(getAllocatedReg srcReg)
            in
                (* Tag an integer: result = 2*source + 1, computed in a single LEA
                   as base + index*1 + 1 with base = index = the source register. *)
                (* N.B. Using LEA with a base register and an index multiplier of 1 is shorter than
                   using no base register and a multiplier of two. *)
                (* TODO: If the value we're tagging is a byte or a 16-bit value we can use OpSize32 and possibly
                   save the Rex byte. *)
                LoadAddress{ output=regResult, offset=1, base=SOME realSReg, index=Index1 realSReg, opSize=opSize } :: code
            end
1244
1245        |   codeExtended _ ({instr=UntagValue{dest as PReg dReg, cache=SOME(PReg cacheReg), opSize, ...}, ...}, code) =
1246            if not (isUsed dest)
1247            then code
1248            else moveIfNecessary({src=getAllocatedReg cacheReg, dst=getAllocatedReg dReg, kind=opSizeToIMove opSize}, code)
1249
        |   codeExtended _ ({instr=UntagValue{source=PReg sReg, dest as PReg dReg, isSigned, opSize, ...}, ...}, code) =
            if not (isUsed dest)
            then code
            else
            let
                val regResult = getAllocatedReg dReg
                val realSReg = getAllocatedReg sReg
            in
                (* For most cases we're going to be using a 32-bit word if this is 32-in-64.  The exception
                   is when converting a word to a signed large-word.  *)
                (* Copy the tagged value into the destination if needed, then shift
                   out the tag bit: arithmetic shift for signed values, logical
                   shift otherwise. *)
                ShiftConstant{ shiftType=if isSigned then SAR else SHR, output=asGenReg regResult, shift=0w1, opSize=opSize } ::
                    moveIfNecessary({src=realSReg, dst=regResult, kind=opSizeToIMove opSize}, code)
            end
1263
        |   codeExtended _ ({instr=LoadEffectiveAddress{base, offset, index=ObjectIndex, dest=PReg dReg, opSize}, ...}, code) =
            let
                (* LEA with an object-index (32-in-64): the effective address is
                   heap base (ebx) + object id * 4 + offset. *)
                val destReg = asGenReg(getAllocatedReg dReg)
                val bReg =
                    case base of
                        SOME(PReg br) => asGenReg(getAllocatedReg br)
                    |   NONE => raise InternalError "LoadEffectiveAddress - ObjectIndex but no base"
            in
                LoadAddress{ output=destReg, offset=offset, base=SOME ebx, index=Index4 bReg, opSize=opSize } :: code
           end
1274
1275        |   codeExtended _ ({instr=LoadEffectiveAddress{base, offset, index, dest=PReg dReg, opSize}, ...}, code) =
1276            let
1277                val destReg = asGenReg(getAllocatedReg dReg)
1278                val bReg =
1279                    case base of
1280                        SOME(PReg br) => SOME(asGenReg(getAllocatedReg br))
1281                    |   NONE => NONE
1282                val indexR = codeExtIndex index
1283            in
1284                LoadAddress{ output=destReg, offset=offset, base=bReg, index=indexR, opSize=opSize } :: code
1285            end
1286
        |   codeExtended _ ({instr=ShiftOperation{shift, resultReg=PReg resReg, operand=PReg operReg, shiftAmount=IntegerConstant i, opSize, ...}, ...}, code) =
            let
                (* Shift by a constant amount. *)
                val realDestReg = getAllocatedReg resReg
                val realOpReg = getAllocatedReg operReg
            in
                (* Copy the operand into the destination if necessary, then shift
                   it in place. *)
                ShiftConstant{ shiftType=shift, output=asGenReg realDestReg, shift=Word8.fromLargeInt i, opSize=opSize } ::
                    moveIfNecessary({src=realOpReg, dst=realDestReg, kind=opSizeToIMove opSize}, code)
            end
1295
        |   codeExtended _ ({instr=ShiftOperation{shift, resultReg=PReg resReg, operand=PReg operReg,
                                        shiftAmount=RegisterArgument(PReg shiftReg), opSize, ...}, ...}, code) =
            let
                (* Shift by a variable amount.  The hardware requires the shift
                   count in cl, i.e. in ecx. *)
                val realDestReg = getAllocatedReg resReg
                val realShiftReg = getAllocatedReg shiftReg
                val realOpReg = getAllocatedReg operReg
                (* We want the shift in ecx.  We may not have got it there but the register
                   should be free.  The shift is masked to 5 or 6 bits so we have to
                   check for larger shift values at a higher level.*)
            in
                (* Reverse execution order: move the shift amount to ecx, move the
                   operand to the destination, then shift in place. *)
                ShiftVariable{ shiftType=shift, output=asGenReg realDestReg, opSize=opSize } ::
                    moveIfNecessary({src=realOpReg, dst=realDestReg, kind=opSizeToIMove opSize},
                        moveIfNecessary({src=realShiftReg, dst=GenReg ecx, kind=Move32Bit (* < 64*)}, code))
           end
1310
1311        |   codeExtended _ ({instr=ShiftOperation _, ...}, _) = raise InternalError "codeExtended - ShiftOperation"
1312
        |   codeExtended _ ({instr=
                Multiplication{resultReg=PReg resReg, operand1=PReg op1Reg,
                               operand2, opSize, ...}, ...}, code) =
            (* Multiplication.  Like the symmetric arithmetic operations either
               operand may supply the destination register. *)
            (
                case decache operand2 of
                    RegisterArgument(PReg op2Reg) =>
                    let
                        (* Treat exactly the same as ArithmeticFunction. *)
                        val realDestReg = getAllocatedGenReg resReg
                        val realOp1Reg = getAllocatedGenReg op1Reg
                        and realOp2Reg = getAllocatedGenReg op2Reg
                        (* If one operand is already in the destination no move is
                           needed; otherwise copy operand1 into the destination. *)
                        val (operandReg, moveInstr) =
                            if realOp1Reg = realDestReg
                            then (realOp2Reg, code)
                            else if realOp2Reg = realDestReg
                            then (realOp1Reg, code)
                            else (realOp2Reg, Move{source=RegisterArg realOp1Reg, destination=RegisterArg realDestReg, moveSize=opSizeToMove opSize} :: code)
                    in
                        MultiplyR { source=RegisterArg operandReg, output=realDestReg, opSize=opSize } :: moveInstr
                    end
                |   operand2 =>
                    (* Multiply operation with the first argument in a register and the second a constant or memory location. *)
                    let
                        val realDestReg = getAllocatedReg resReg
                        val realOp1Reg = getAllocatedReg op1Reg
                        val op2Arg = codeExtArgumentAsGenReg operand2
                    in
                        MultiplyR { output=asGenReg realDestReg, source=op2Arg, opSize=opSize } ::
                            moveIfNecessary({src=realOp1Reg, dst=realDestReg, kind=opSizeToIMove opSize}, code)
                    end
            )
1344
        |   codeExtended _ ({instr=Division{isSigned, dividend=PReg regDivid, divisor, quotient=PReg regQuot,
                                  remainder=PReg regRem, opSize}, ...}, code) =
            let
                (* TODO: This currently only supports the dividend in a register.  LargeWord division will
                   generally load the argument from a box so we could support a memory argument for that
                   case.  Word and integer values will always have to be detagged. *)
                (* Division is specific as to the registers.  The dividend must be eax, quotient is
                   eax and the remainder is edx. *)
                val realDiviReg = getAllocatedReg regDivid
                val realQuotReg = getAllocatedReg regQuot
                val realRemReg = getAllocatedReg regRem
                val divisorArg = codeExtArgument divisor
                val divisorReg = argAsGenReg divisorArg
                (* The divisor must not be in eax or edx: both are clobbered. *)
                val _ = divisorReg <> eax andalso divisorReg <> edx orelse raise InternalError "codeGenICode: Division"
                (* rdx needs to be set to the high order part of the dividend.  For signed
                   division that means sign-extending rdx, for unsigned division we clear it.
                   We only need a 32-bit clear since the top 32-bits are cleared anyway. *)
                val setRDX =
                    if isSigned then SignExtendForDivide opSize
                    else ArithToGenReg{ opc=XOR, output=edx, source=RegisterArg edx, opSize=OpSize32 }
            in
                (* We may need to move one or more of the registers although normally that
                   won't be necessary.  Almost certainly only either the remainder or the
                   quotient will actually be used. *)
                (* Reverse execution order: dividend to eax, set rdx, divide, then
                   move eax/edx into the allocated quotient/remainder registers
                   (moveMultipleRegisters handles any required ordering). *)
                moveMultipleRegisters([{src=GenReg eax, dst=realQuotReg}, {src=GenReg edx, dst=realRemReg}],
                    DivideAccR {arg=divisorReg, isSigned=isSigned, opSize=opSize} :: setRDX ::
                        moveIfNecessary({src=realDiviReg, dst=GenReg eax, kind=opSizeToIMove opSize}, code))
            end
1373
        |   codeExtended _ ({instr=AtomicExchangeAndAdd{base=PReg bReg, source=PReg sReg}, ...}, code) =
            let
                (* Atomic exchange-and-add.  The source register also receives the
                   previous value of the memory word. *)
                val baseReg = asGenReg (getAllocatedReg bReg) and outReg = asGenReg (getAllocatedReg sReg)
                (* 32-in-64: address through the heap base in ebx with the object id
                   scaled by 4. *)
                val address =
                    if targetArch = ObjectId32Bit
                    then {base=ebx, index=Index4 baseReg, offset=0}
                    else {base=baseReg, index=NoIndex, offset=0}
            in
                AtomicXAdd{address=address, output=outReg, opSize=polyWordOpSize} :: code
            end
1384
        |   codeExtended _ ({instr=BoxValue{boxKind, source=PReg sReg, dest as PReg dReg, saveRegs}, ...}, code) =
            if not (isUsed dest)
            then code
            else
            let
                (* Allocate a heap cell and store the untagged value into it. *)
                val preserve = getSaveRegs saveRegs
                (* Box size in Poly words together with the move kind used for the
                   store of the value. *)
                val (srcReg, boxSize, moveKind) =
                    case boxKind of
                        BoxLargeWord => (getAllocatedReg sReg, Word.toInt(nativeWordSize div wordSize), moveNativeWord)
                    |   BoxX87Double => (getAllocatedReg sReg, Word.toInt(0w8 div wordSize), MoveDouble)
                    |   BoxX87Float => (getAllocatedReg sReg, Word.toInt(0w4 div wordSize), MoveFloat)
                    |   BoxSSE2Double => (getAllocatedReg sReg, Word.toInt(0w8 div wordSize), MoveDouble)
                    |   BoxSSE2Float => (getAllocatedReg sReg, Word.toInt(0w4 div wordSize), MoveFloat)
                val dstReg = getAllocatedReg dReg
                (* 32-in-64: the new cell is addressed through the heap base. *)
                val (bReg, index) =
                    if targetArch = ObjectId32Bit
                    then (ebx, Index4(asGenReg dstReg))
                    else (asGenReg dstReg, NoIndex)
            in
                (* Reverse execution order: allocate, store the value at offset 0,
                   then flag the store as complete with StoreInitialised. *)
                StoreInitialised ::
                    llStoreArgument{ source=RegisterArg srcReg, offset=0, base=bReg, index=index, kind=moveKind} ::
                        llAllocateMemoryOperation({ size=boxSize, flags=0wx1, dest=dstReg, saveRegs=preserve}, code)
            end
1408
        |   codeExtended _ ({instr=CompareByteVectors{vec1Addr=PReg v1Reg, vec2Addr=PReg v2Reg, length=PReg lReg, ...}, ...}, code) =
                (* Byte-vector comparison.  The string instructions require the two
                   addresses in esi and edi and the length in ecx. *)
                (* There's a complication here.  CompareByteVectors generates REPE CMPSB to compare
                   the vectors but the condition code is only set if CMPSB is executed at least
                   once.  If the value in RCX/ECX is zero it will never be executed and the
                   condition code will be unchanged.  We want the result to be "equal" in that
                   case so we need to ensure that is the case.  It's quite possible that the
                   condition code has just been set by shifting RCX/ECX to remove the tag in which
                   case it will have set "equal" if the value was zero.  We use CMP R/ECX,R/ECX which
                   is two bytes in 32-bit.
                   If we knew the length was non-zero (e.g. a constant) we could avoid this. *)
                RepeatOperation CMPS8 :: ArithToGenReg {opc=CMP, output=ecx, source=RegisterArg ecx, opSize=OpSize32} ::
                    moveIfNecessary({src=getAllocatedReg v1Reg, dst=GenReg esi, kind=moveNativeWord},
                        moveIfNecessary({src=getAllocatedReg v2Reg, dst=GenReg edi, kind=moveNativeWord},
                            moveIfNecessary({src=getAllocatedReg lReg, dst=GenReg ecx, kind=moveNativeWord}, code)))
1423
        |   codeExtended _ ({instr=BlockMove{srcAddr=PReg sReg, destAddr=PReg dReg, length=PReg lReg, isByteMove}, ...}, code) =
                (* Block move using REP MOVS: source address in esi, destination in
                   edi, count in ecx.  The element size is a byte or a Poly word. *)
                (* We may need to move these into the appropriate registers.  They have been reserved but it's
                   still possible the values could be in something else. *)
                RepeatOperation(if isByteMove then MOVS8 else if polyWordOpSize = OpSize64 then MOVS64 else MOVS32) ::
                    moveIfNecessary({src=getAllocatedReg sReg, dst=GenReg esi, kind=moveNativeWord},
                        moveIfNecessary({src=getAllocatedReg dReg, dst=GenReg edi, kind=moveNativeWord},
                            moveIfNecessary({src=getAllocatedReg lReg, dst=GenReg ecx, kind=moveNativeWord}, code)))
1431
        |   codeExtended _ ({instr=X87Compare{arg1=PReg argReg, arg2, isDouble, ...}, ...}, code) =
            let
                (* X87 floating point comparison.  The first argument must already
                   be on the top of the floating point stack (fp0). *)
                val fpReg = getAllocatedFPReg argReg
                val _ = fpReg = fp0 orelse raise InternalError "codeGenICode: CompareFloatingPt not fp0"
                (* This currently pops the value. *)
                val precision = if isDouble then DoublePrecision else SinglePrecision
            in
                (* The second argument may be another FP register, a memory operand
                   or an address constant. *)
                case codeExtArgumentAsFPReg arg2 of
                    RegisterArg fpReg2 => FPArithR{opc=FCOMP, source=fpReg2} :: code
                |   MemoryArg{offset, base=baseReg, index=NoIndex} =>
                        FPArithMemory{opc=FCOMP, base=baseReg, offset=offset, precision=precision} :: code
                |   AddressConstArg const =>
                        FPArithConst{opc=FCOMP, source = const, precision=precision} :: code
                |   _ => raise InternalError "codeGenICode: CompareFloatingPt: TODO"
            end
1447
1448        |   codeExtended _ ({instr=SSE2Compare{arg1=PReg argReg, arg2, isDouble, ...}, ...}, code) =
1449            let
1450                val xmmReg = getAllocatedXMMReg argReg
1451                val arg2Code = codeExtArgumentAsXMMReg arg2
1452            in
1453                XMMArith { opc= if isDouble then SSE2CompDouble else SSE2CompSingle, output=xmmReg, source=arg2Code} :: code
1454            end
1455
1456        |   codeExtended _ ({instr=X87FPGetCondition{dest=PReg dReg, ...}, ...}, code) =
1457                moveIfNecessary({src=GenReg eax, dst=getAllocatedReg dReg, kind=Move32Bit},
1458                    FPStatusToEAX :: code)
1459
        |   codeExtended _ ({instr=X87FPArith{opc, resultReg=PReg resReg, arg1=PReg op1Reg, arg2, isDouble}, ...}, code) =
            let
                (* X87 binary arithmetic.  Both the first operand and the result
                   must be on top of the FP stack (fp0). *)
                val realDestReg = getAllocatedFPReg resReg
                val realOp1Reg = getAllocatedFPReg op1Reg
                val _ = realDestReg = fp0 orelse raise InternalError "codeGenICode: FloatingPointArith not fp0"
                val _ = realOp1Reg = fp0 orelse raise InternalError "codeGenICode: FloatingPointArith not fp0"
                val op2Arg = codeExtArgumentAsFPReg arg2
                val precision = if isDouble then DoublePrecision else SinglePrecision
            in
                (* Only memory and constant second operands are handled here. *)
                case op2Arg of
                    MemoryArg{offset, base=baseReg, index=NoIndex} =>
                        FPArithMemory{opc=opc, base=baseReg, offset=offset, precision=precision} :: code
                |   AddressConstArg const =>
                        FPArithConst{opc=opc, source = const, precision=precision} :: code
                |   _ => raise InternalError "codeGenICode: X87FPArith: TODO"
            end
1476    
1477        |   codeExtended _ ({instr=X87FPUnaryOps{fpOp, dest=PReg resReg, source=PReg op1Reg}, ...}, code) =
1478            let
1479                val realDestReg = getAllocatedFPReg resReg
1480                val realOp1Reg = getAllocatedFPReg op1Reg
1481                val _ = realDestReg = fp0 orelse raise InternalError "codeGenICode: X87FPUnaryOps not fp0"
1482                val _ = realOp1Reg = fp0 orelse raise InternalError "codeGenICode: X87FPUnaryOps not fp0"
1483            in
1484                FPUnary fpOp :: code
1485            end
1486
        |   codeExtended _ ({instr=X87Float{dest=PReg resReg, source}, ...}, code) =
            let
                (* Convert a fixed-point integer to X87 floating point.  The result
                   must be on top of the FP stack. *)
                val intSource = codeExtArgumentAsGenReg source
                val fpReg = getAllocatedFPReg resReg
                val _ = fpReg = fp0 orelse raise InternalError "codeGenICode: FloatFixedInt not fp0"
            in
                (* This is complicated.  The integer value has to be in memory not in a
                   register so we have to push it to the stack and then make sure it is
                   popped afterwards.  Because it is untagged it is unsafe to leave it. *)
                (* Reverse execution order: push the integer, load it from the top
                   of the stack into the FPU, then discard the stack word. *)
                ResetStack{numWords=1, preserveCC=false} :: FPLoadInt{ base=esp, offset=0, opSize=polyWordOpSize } :: PushToStack intSource :: code
            end
1498
        |   codeExtended _ ({instr=SSE2Float{dest=PReg resReg, source}, ...}, code) =
            let
                (* Convert a fixed-point integer to SSE2 floating point.  Unlike
                   the X87 case the source can be taken directly from a general
                   register, and only a register source is supported. *)
                val xmmResReg = getAllocatedXMMReg resReg
                val srcReg = case codeExtArgumentAsGenReg source of RegisterArg srcReg => srcReg | _ => raise InternalError "FloatFixedInt: not reg"
            in
                XMMConvertFromInt{ output=xmmResReg, source=srcReg, opSize=polyWordOpSize} :: code
            end
1506
1507        |   codeExtended _ ({instr=SSE2FPUnary{opc, resultReg=PReg resReg, source}, ...}, code) =
1508            let
1509                val realDestReg = getAllocatedXMMReg resReg
1510                val opArg = codeExtArgumentAsXMMReg source
1511                val sse2Op =
1512                    case opc of
1513                        SSE2UDoubleToFloat  => SSE2DoubleToFloat
1514                    |   SSE2UFloatToDouble  => SSE2FloatToDouble
1515            in
1516                XMMArith{ opc=sse2Op, output=realDestReg, source=opArg} :: code
1517            end
1518
        |   codeExtended _ ({instr=SSE2FPBinary{opc, resultReg=PReg resReg, arg1=PReg op1Reg, arg2}, ...}, code) =
            let
                (* Binary SSE2 operation.  The first operand must end up in the
                   destination register; the operation is then applied in place. *)
                val realDestReg = getAllocatedXMMReg resReg
                val realOp1Reg = getAllocatedXMMReg op1Reg
                val op2Arg = codeExtArgumentAsXMMReg arg2
                (* xorpd and andpd require 128-bit arguments with 128-bit alignment. *)
                val _ =
                    case (opc, op2Arg) of
                        (SSE2BXor, RegisterArg _) => ()
                    |   (SSE2BXor, _) => raise InternalError "codeGenICode - SSE2Xor not in register"
                    |   (SSE2BAnd, RegisterArg _) => ()
                    |   (SSE2BAnd, _) => raise InternalError "codeGenICode - SSE2And not in register"
                    |   _ => ()
                (* Copy the first operand into the destination unless the allocator
                   already put it there. *)
                val doMove =
                    if realDestReg = realOp1Reg
                    then code
                    else XMMArith { opc=SSE2MoveDouble, source=RegisterArg realOp1Reg, output=realDestReg } :: code
                (* Map the ICode binary opcode to the machine-level SSE2 opcode. *)
                val sse2Op =
                    case opc of
                        SSE2BAddDouble  => SSE2AddDouble
                    |   SSE2BSubDouble  => SSE2SubDouble
                    |   SSE2BMulDouble  => SSE2MulDouble
                    |   SSE2BDivDouble  => SSE2DivDouble
                    |   SSE2BAddSingle  => SSE2AddSingle
                    |   SSE2BSubSingle  => SSE2SubSingle
                    |   SSE2BMulSingle  => SSE2MulSingle
                    |   SSE2BDivSingle  => SSE2DivSingle
                    |   SSE2BXor        => SSE2Xor
                    |   SSE2BAnd        => SSE2And
            in
                XMMArith{ opc=sse2Op, output=realDestReg, source=op2Arg} :: doMove
            end
1551
        |   codeExtended _ ({instr=TagFloat{source=PReg srcReg, dest as PReg dReg, ...}, ...}, code) =
            if not (isUsed dest)
            then code (* Drop the instruction entirely if the result is never used. *)
            else
            let
                val _ = targetArch = Native64Bit orelse raise InternalError "TagFloat: not 64-bit"
                (* Copy the value from an XMM reg into a general reg and tag it. *)
                val regResult = asGenReg(getAllocatedReg dReg)
                val realSReg = getAllocatedXMMReg srcReg
            in
                (* In reverse execution order: move the float bits into the general
                   register, shift them into the upper 32 bits, then set the tag bit. *)
                ArithToGenReg { opc=ADD, output=regResult, source=NonAddressConstArg 1, opSize=polyWordOpSize } ::
                ShiftConstant{ shiftType=SHL, output=regResult, shift=0w32, opSize=OpSize64} ::
                MoveXMMRegToGenReg { source = realSReg, output = regResult } :: code
            end
1566
        |   codeExtended _ ({instr=UntagFloat{dest as PReg dReg, cache=SOME(PReg cacheReg), ...}, ...}, code) =
            if not (isUsed dest)
            then code
            (* The untagged value is already available in a cache register: just
               copy it across if the destination register differs. *)
            else moveIfNecessary({src=getAllocatedReg cacheReg, dst=getAllocatedReg dReg, kind=MoveFloat}, code)
1571
        |   codeExtended _ ({instr=UntagFloat{source, dest=dest as PReg dReg, ...}, ...}, code) =
            if not (isUsed dest)
            then code
            else
            let
                val regResult = getAllocatedXMMReg dReg
            in
                (* Extract the float held in the upper 32 bits of the tagged word.
                   The strategy depends on where the source currently is. *)
                case codeExtArgumentAsGenReg source of
                    RegisterArg realSReg =>
                        XMMShiftRight{ output=regResult, shift=0w4 (* Bytes - not bits *) } ::
                        MoveGenRegToXMMReg {source=realSReg, output=regResult} :: code
                |   MemoryArg{base, offset, index} =>
                        (* If the value is in memory we can just load the high order word. *)
                        XMMArith { opc=SSE2MoveFloat, source=MemoryArg{base=base, offset=offset+4, index=index}, output=regResult } :: code
                |   NonAddressConstArg ic =>
                        (* Shift down and then load from the non-constant area. *)
                        XMMArith { opc=SSE2MoveFloat, source=NonAddressConstArg(IntInf.~>>(ic, 0w32)), output=regResult } :: code
                |   _ => raise InternalError "UntagFloat - not register or memory"
            end
1591
        |   codeExtended _ ({instr=GetSSE2ControlReg{dest=PReg dReg}, ...}, code) =
            let
                (* This has to work through memory.  Reserve one word on the stack, get the
                   MXCSR register into it and pop it to the register.
                   (The code list is in reverse execution order: push, store, pop.) *)
                val regResult = getAllocatedGenReg dReg
            in
                PopR regResult ::
                XMMStoreCSR{base=esp, offset=0, index=NoIndex } ::
                PushToStack(NonAddressConstArg 0) :: code
            end
1602
        |   codeExtended _ ({instr=SetSSE2ControlReg{source=PReg sReg}, ...}, code) =
            let
                (* This has to work through memory.  Push the register to the stack,
                   store the value into the control register and remove it from the stack.
                   (The code list is in reverse execution order: push, load, reset.) *)
                val sourceReg = getAllocatedGenReg sReg
            in
                ResetStack{ numWords=1, preserveCC=false } ::
                XMMLoadCSR{base=esp, offset=0, index=NoIndex } ::
                PushToStack(RegisterArg sourceReg) :: code
            end
1613
        |   codeExtended _ ({instr=GetX87ControlReg{dest=PReg dReg}, ...}, code) =
            let
                (* This has to work through memory.  Reserve one word on the stack, get the
                   X87 control register into it and pop it to the register.
                   (The code list is in reverse execution order: push, store, pop.) *)
                val regResult = getAllocatedGenReg dReg
            in
                PopR regResult ::
                FPStoreCtrlWord{base=esp, offset=0, index=NoIndex } ::
                PushToStack(NonAddressConstArg 0) :: code
            end
1624
        |   codeExtended _ ({instr=SetX87ControlReg{source=PReg sReg}, ...}, code) =
            let
                (* This has to work through memory.  Push the register to the stack,
                   store the value into the control register and remove it from the stack.
                   (The code list is in reverse execution order: push, load, reset.) *)
                val sourceReg = getAllocatedGenReg sReg
            in
                ResetStack{ numWords=1, preserveCC=false } ::
                FPLoadCtrlWord{base=esp, offset=0, index=NoIndex } ::
                PushToStack(RegisterArg sourceReg) :: code
            end
1635
1636        |   codeExtended _ ({instr=X87RealToInt{source=PReg sReg, dest=PReg dReg}, ...}, code) =
1637            let
1638                (* This has to work through memory.  Reserve one word on the stack,
1639                   convert the value into it and pop it to the register. *)
1640                val regResult = getAllocatedGenReg dReg
1641                val fpReg = getAllocatedFPReg sReg
1642                val _ = fpReg = fp0 orelse raise InternalError "codeGenICode: CompareFloatingPt not fp0"
1643                (* This currently pops the value. *)
1644            in
1645                PopR regResult ::
1646                FPStoreInt{base=esp, offset=0, index=NoIndex } ::
1647                PushToStack(NonAddressConstArg 0) :: code
1648            end
1649
        |   codeExtended _ ({instr=SSE2RealToInt{source, dest=PReg dReg, isDouble, isTruncate}, ...}, code) =
            let
                (* The source is either an XMM register or memory. *)
                val regResult = getAllocatedGenReg dReg
                val opArg = codeExtArgumentAsXMMReg source
            in
                (* isDouble selects the precision of the source; isTruncate
                   presumably selects the truncating form of the conversion
                   (cvtt* rather than cvt*) — confirm in X86CODE. *)
                XMMStoreInt {
                    source=opArg, precision=if isDouble then DoublePrecision else SinglePrecision,
                    output = regResult, isTruncate=isTruncate } :: code
            end
1660
        |   codeExtended _ ({instr=SignExtend32To64{source, dest=PReg dReg}, ...}, code) =
            let
                val regResult = getAllocatedGenReg dReg
                val opArg = codeExtArgumentAsGenReg source
            in
                (* Move32X64 sign-extends the 32-bit source into the 64-bit register. *)
                Move{moveSize=Move32X64, source=opArg, destination=RegisterArg regResult } :: code
            end
1668        
        |   codeExtended _ ({instr=TouchArgument _, ...}, code) = code (* Exists only to keep its argument alive: no code is generated. *)
1670
        (* The output code stream into which the generated operations are assembled. *)
        val newCode = codeCreate (functionName, profileObject, debugSwitches) 
1672        
1673        local
1674            (* processed - set to true when a block has been processed. *)
1675            val processed = Array.array(numBlocks, false)
1676            fun haveProcessed n = Array.sub(processed, n)
1677            
1678            (* Find the blocks that reference this one.  This isn't essential but
1679               allows us to try to generate blocks in the order of the control
1680               flow.  This in turn may allow us to use short branches rather
1681               than long ones. *)
1682            val labelRefs = Array.array(numBlocks, [])
1683
1684            datatype flowCode =
1685                FlowCodeSimple of int
1686            |   FlowCodeCMove of {code: operation list, trueJump: int, falseJump: int}
1687            
1688            (* Process this recursively to set the references.  If we have
1689               unreachable blocks, perhaps because they've been merged, we
1690               don't want to include them in the reference counting.
1691               This shouldn't happen now that IdentifyReferences removes
1692               unreferenced blocks. *)
1693            fun setReferences fromLabel toLabel =
1694                case Array.sub(labelRefs, toLabel) of
1695                    [] => (* Not yet visited at all. *)
1696                    let
1697                        val ExtendedBasicBlock{ flow, ...} = Vector.sub(blocks, toLabel)
1698                        val refs =
1699                            case flow of
1700                                ExitCode => []
1701                            |   Unconditional lab => [lab]
1702                            |   Conditional{trueJump, falseJump, ... } => [trueJump, falseJump]
1703                            |   IndexedBr labs => labs
1704                            |   SetHandler { handler, continue } => [handler, continue]
1705                            |   UnconditionalHandle _ => []
1706                            |   ConditionalHandle { continue, ...} => [continue]
1707
1708                        val () =
1709                            if fromLabel >= 0 then Array.update(labelRefs, toLabel, [fromLabel]) else ()
1710                    in
1711                        List.app (setReferences toLabel) refs
1712                    end
1713                    
1714                |   refs =>
                    (* We've visited this at least once.  Just add us to the list. *)
1716                        Array.update(labelRefs, toLabel, fromLabel :: refs)
1717            
1718            val _ = setReferences 0 0
1719            
1720            (* Process the blocks.  We keep the "stack" explicit rather than using recursion
1721               because this allows us to select both arms of a conditional branch sooner. *)
1722            fun genCode(toDo, lastFlow, code) =
1723            case List.filter (not o haveProcessed) toDo of
1724                [] =>
1725                let
1726                    (* There's nothing left to do. We may need to add a final branch to the end. *)
1727                    val finalBranch =
1728                        case lastFlow of
1729                            ExitCode => []
1730                        |   IndexedBr _ => []
1731                        |   Unconditional dest => [UncondBranch(getBlockLabel dest)]
1732                        |   Conditional { condition, trueJump, falseJump, ...} =>
1733                                [
1734                                    UncondBranch(getBlockLabel falseJump),
1735                                    ConditionalBranch{test=condition, label=getBlockLabel trueJump}
1736                                ]
1737                        |   SetHandler { continue, ...} => [UncondBranch(getBlockLabel continue)]
1738                        |   UnconditionalHandle _ => []
1739                        |   ConditionalHandle { continue, ...} => [UncondBranch(getBlockLabel continue)]
1740                in
1741                    finalBranch @ code (* Done. *)
1742                end
1743
1744            |   stillToDo as head :: _ =>
1745                let
1746                    local
                        (* Check the references.  If all the sources that lead up to this have
                           already been processed we won't have any backward jumps. *)
1749                        fun available dest = List.all haveProcessed (Array.sub(labelRefs, dest))
1750
1751                        val continuation =
1752                            case lastFlow of
1753                                ExitCode => NONE
1754                            |   IndexedBr _ => NONE (* We could put the last branch in here. *)
1755                            |   Unconditional dest =>
1756                                    if not (haveProcessed dest) andalso available dest
1757                                    then SOME(FlowCodeSimple dest)
1758                                    else NONE
1759                            |   Conditional {trueJump, falseJump, condition, ...} =>
1760                                let
1761                                    (* Can we replace this with a SETCC or CMOV?  If both arms simply set
1762                                       a register to a value and either return or jump to the same location
1763                                       we can use a SETCC or a CMOV.  *)
1764                                    val ExtendedBasicBlock { flow=tFlow, block=tBlock, ...} = Vector.sub(blocks, trueJump)
1765                                    and ExtendedBasicBlock { flow=fFlow, block=fBlock, ...} = Vector.sub(blocks, falseJump)
1766
1767                                    fun cmoveOrSetcc{condition, output, tSource=IntegerConstant trueValue, fSource=IntegerConstant falseValue, kind, code} =
1768                                        let (* Could use SETCC.  Only if we can use LEA for multiplication.  The result must be
1769                                               tagged so we will always have a multiplier. *)
1770                                            val (multiplier, fValue, testCondition) =
1771                                                if trueValue >= falseValue
1772                                                then (trueValue-falseValue, falseValue, condition)
1773                                                else (falseValue-trueValue, trueValue, invertTest condition)
1774                                            val destReg = asGenReg output
1775                                        in
1776                                            if not (targetArch = Native32Bit andalso (destReg=esi orelse destReg=edi))
1777                                                (* We can't use Setcc with esi or edi on native 32-bit. *)
1778                                               andalso (multiplier = 2 orelse multiplier = 4 orelse multiplier = 8)
1779                                               (* We're using LEA so can only be multiplying by 2, 4 or 8. *)
1780                                               andalso is32bit fValue (* and we're going to put this in the offset *)
1781                                            then
1782                                            let
1783                                                val effectiveOpSize =
1784                                                    (* We can generally use 32-bit LEA except if the result is negative. *)
1785                                                    if kind = Move32Bit orelse fValue >= 0 andalso fValue+multiplier <= 0x7fffffff
1786                                                    then OpSize32 else OpSize64
1787                                                val (index, base) =
1788                                                    case multiplier of
1789                                                        2 => (Index1 destReg, SOME destReg)
1790                                                    |   4 => (Index4 destReg, NONE)
1791                                                    |   8 => (Index8 destReg, NONE)
1792                                                    |   _ => (NoIndex, NONE)
1793                                                (* Try to put the instruction to zero the register before any compare.  We can do it
1794                                                   provided the register we're going to zero isn't used in the comparison. *)
1795                                                fun checkArg(RegisterArg r) = r <> destReg
1796                                                |   checkArg(MemoryArg mem) = checkMem mem
1797                                                |   checkArg _ = true
1798                                                
1799                                                and checkMem{base, index=NoIndex, ...} = base <> destReg
1800                                                |   checkMem{base, index=Index1 index, ...} = base <> destReg andalso index <> destReg
1801                                                |   checkMem{base, index=Index2 index, ...} = base <> destReg andalso index <> destReg
1802                                                |   checkMem{base, index=Index4 index, ...} = base <> destReg andalso index <> destReg
1803                                                |   checkMem{base, index=Index8 index, ...} = base <> destReg andalso index <> destReg
1804
1805                                                val zeroReg = ArithToGenReg { opc=XOR, output=destReg, source=RegisterArg destReg, opSize=OpSize32 } 
1806
1807                                                fun addXOR [] = NONE
1808                                                |   addXOR ((instr as ResetStack _) :: tl) =
1809                                                        (* If we can add the XOR before the ResetStack do so. *)
1810                                                        Option.map(fn code => instr :: code) (addXOR tl)
1811                                                |   addXOR ((instr as ArithToGenReg{output, source, ...}) :: tl) =
1812                                                        if output <> destReg andalso checkArg source
1813                                                        then SOME(instr :: zeroReg :: tl)
1814                                                        else NONE
1815                                                |   addXOR ((instr as ArithMemConst{address, ...}) :: tl) =
1816                                                        if checkMem address
1817                                                        then SOME(instr :: zeroReg :: tl)
1818                                                        else NONE
1819                                                |   addXOR ((instr as ArithByteMemConst{address, ...}) :: tl) =
1820                                                        if checkMem address
1821                                                        then SOME(instr :: zeroReg :: tl)
1822                                                        else NONE
1823                                                |   addXOR ((instr as XMMArith{source=MemoryArg mem, ...}) :: tl) =
1824                                                        if checkMem mem
1825                                                        then SOME(instr :: zeroReg :: tl)
1826                                                        else NONE
1827                                                |   addXOR ((instr as XMMArith _) :: tl) = SOME(instr :: zeroReg :: tl)
1828                                                |   addXOR ((instr as TestByteBits{arg, ...}) :: tl) =
1829                                                        if checkArg arg
1830                                                        then SOME(instr :: zeroReg :: tl)
1831                                                        else NONE
1832                                                |   addXOR ((instr as RepeatOperation CMPS8) :: tl) =
1833                                                        (* This uses edi, esi and ecx implicitly *)
1834                                                        if destReg <> esi andalso destReg <> edi andalso destReg <> ecx
1835                                                        then SOME(instr :: zeroReg :: tl)
1836                                                        else NONE
1837                                                    (* This seems to be just a conditional jump as a result of
1838                                                       testing the condition code twice in Real.== *)
1839                                                |   addXOR _ = NONE
1840
1841                                                (* If we can't put the XOR before the instruction we need to either zero
1842                                                   it using a move which won't affect the CC or we use MOVZB to extend
1843                                                   the byte value to 32/64 bits. *)
1844                                                val loadAddr = LoadAddress{output=destReg, offset=Int.fromLarge fValue, base=base, index=index, opSize=effectiveOpSize}
1845                                                and setCond = SetCondition{output=destReg, test=testCondition}
1846                                                val code =
1847                                                    case addXOR code of
1848                                                        SOME withXOR => loadAddr :: setCond :: withXOR
1849                                                    |   NONE =>
1850                                                        loadAddr ::
                                                            (* We've already checked that we're not using esi/edi on native 32-bits. *)
1852                                                            Move{destination=RegisterArg destReg, source=RegisterArg destReg, moveSize=Move8} :: setCond :: code
1853                                            in
1854                                                SOME code
1855                                            end
1856                                            else NONE
1857                                        end
1858
1859                                        (* If either value is a memory location it isn't safe to load it.  The base address
1860                                           may not be valid if the condition does not hold. *)
1861                                    |   cmoveOrSetcc{tSource=MemoryLocation _, ...} = NONE
1862                                    |   cmoveOrSetcc{fSource=MemoryLocation _, ...} = NONE
1863                                        
1864                                    |   cmoveOrSetcc{condition, output, tSource, fSource, kind, code} =
1865                                        if targetArch = Native32Bit
1866                                        then NONE (* CMov doesn't work for constants. *)
1867                                        else
1868                                        let
1869                                            val output = asGenReg output
1870                                            val codeTrue = codeExtArgumentAsGenReg tSource
1871                                            and codeFalse = codeExtArgumentAsGenReg fSource
1872                                            val opSize =
1873                                                case kind of
1874                                                    Move32Bit => OpSize32
1875                                                |   Move64Bit => OpSize64
1876                                                | _ => raise InternalError "move size"
1877                                            (* One argument has to be loaded into a register first and the other
1878                                               is conditionally moved.  *)
1879                                            val loadFalseCmoveTrue =
1880                                                if (case codeFalse of RegisterArg regFalse => regFalse = output | _ => false)
1881                                                then true (* The false value is already in the right register. *)
1882                                                else if (case codeTrue of RegisterArg regTrue => regTrue = output | _ => false)
1883                                                then false (* The true value is in the right register - have to reverse. *)
1884                                                else if (case codeTrue of NonAddressConstArg _ => true | _ => false)
1885                                                then false (* The true value is a short constant.  If we use a CMOV we will have to put that
1886                                                              in the non-constant area and use a PC-relative reference.  Try to avoid it. *)
1887                                                else true
1888                                            fun cmov{codeLoad, codeMove, condition} =
1889                                            let
1890                                                val load =
1891                                                    case codeLoad of
1892                                                        RegisterArg regLoad =>
1893                                                            moveIfNecessary({src=GenReg regLoad, dst=GenReg output, kind=opSizeToIMove opSize}, code)
1894                                                    |   codeLoad =>
1895                                                            Move{source=codeLoad, destination=RegisterArg output, moveSize=opSizeToMove opSize} ::  code
1896                                            in
1897                                                CondMove{test=condition, output=output, source=codeMove, opSize=opSize} :: load
1898                                            end
1899                                        in
1900                                            if loadFalseCmoveTrue
1901                                            then SOME(cmov{codeLoad=codeFalse, codeMove=codeTrue, condition=condition})
1902                                            else SOME(cmov{codeLoad=codeTrue, codeMove=codeFalse, condition=invertTest condition})
1903                                        end
1904                                    
1905                                    val isPossSetCCOrCmov =
1906                                        if not (haveProcessed trueJump) andalso available trueJump
1907                                            andalso not (haveProcessed falseJump) andalso available falseJump
1908                                        then case (tFlow, fFlow, tBlock, fBlock) of
1909                                            (ExitCode,
1910                                             ExitCode,
1911                                             [{instr=LoadArgument{dest=PReg tReg, source=tSource, kind=kindT}, ...},
1912                                                {instr=ReturnResultFromFunction{resultReg=PReg resReg, realReg, numStackArgs, ...}, ...}],
1913                                             [{instr=LoadArgument{dest=PReg fReg, source=fSource, kind=kindF}, ...},
1914                                                {instr=ReturnResultFromFunction _, ...}]) =>
1915                                             (* The real register for the two sides should both be rax. *)
1916                                            let
1917                                                val realTReg = getAllocatedReg tReg and realFReg = getAllocatedReg fReg
1918                                            in
1919                                                if realTReg = realFReg andalso kindT = kindF andalso (kindT = Move32Bit orelse kindT = Move64Bit)
1920                                                then
1921                                                (
1922                                                    case cmoveOrSetcc{condition=condition, output=realTReg, tSource=tSource, fSource=fSource,
1923                                                                      kind=kindT, code=code} of
1924                                                        SOME code =>
1925                                                        let
1926                                                            val resultReg = getAllocatedReg resReg
1927                                                            val code =
1928                                                                ReturnFromFunction numStackArgs ::
1929                                                                    moveIfNecessary({src=resultReg, dst=realReg, kind=moveNativeWord}, code)
1930                                                        in
1931                                                            SOME{code=code, trueJump=trueJump, falseJump=falseJump}
1932                                                        end
1933                                                    |   NONE => NONE
1934                                               )
1935                                               else NONE
1936                                            end
1937                                        |   (Unconditional tDest,
1938                                             Unconditional fDest,
1939                                             [{instr=LoadArgument{dest=PReg tReg, source=tSource, kind=kindT}, ...}],
1940                                             [{instr=LoadArgument{dest=PReg fReg, source=fSource, kind=kindF}, ...}]) =>
1941                                            let
1942                                                val realTReg = getAllocatedReg tReg and realFReg = getAllocatedReg fReg
1943                                            in
1944                                                if tDest = fDest andalso realTReg = realFReg andalso kindT = kindF andalso (kindT = Move32Bit orelse kindT = Move64Bit)
1945                                                then
1946                                                (
1947                                                    case cmoveOrSetcc{condition=condition, output=realTReg, tSource=tSource, fSource=fSource,
1948                                                                      kind=kindT, code=code} of
1949                                                        SOME code => SOME{code=code, trueJump=trueJump, falseJump=falseJump}
1950                                                    |   NONE => NONE
1951                                                )
1952                                                else NONE
1953                                            end
1954                                        |   _ => NONE
1955                                        else NONE
1956                                in
1957                                    case isPossSetCCOrCmov of
1958                                        NONE =>
1959                                        (* We can usually choose either destination and in nearly all cases
1960                                           it won't matter.  The default branch is not to take forward jumps
1961                                           so if there is reason to believe that one branch is more likely
1962                                           we should follow that branch now and leave the other.  If we
1963                                           have JO/JNO we assume that overflow is unusual.  If one branch
1964                                           raises an exception we assume that that is unusual. *)
1965                                        let
1966                                            val (first, second) =
1967                                                case (condition, Vector.sub(blocks, falseJump)) of
1968                                                    (JNO, _) => (trueJump, falseJump)
1969                                                |   (_, ExtendedBasicBlock{ flow=ExitCode, block, ...}) =>
1970                                                        if List.exists(fn{instr=RaiseExceptionPacket _, ...} => true | _ => false) block
1971                                                        then (trueJump, falseJump)
1972                                                        else (falseJump, trueJump)
1973                                                |   _ => (falseJump, trueJump)
1974                                        in
1975                                            if not (haveProcessed first) andalso available first
1976                                            then SOME(FlowCodeSimple first)
1977                                            else if not (haveProcessed second) andalso available second
1978                                            then SOME(FlowCodeSimple second)
1979                                            else NONE
1980                                        end
1981                                    |   SOME args => SOME(FlowCodeCMove args)
1982                                end
1983                           |    SetHandler { continue, ... } =>
1984                                    (* We want the continuation if possible.  We'll need a
1985                                       branch round the handler so that won't help. *)
1986                                    if not (haveProcessed continue) andalso available continue
1987                                    then SOME(FlowCodeSimple continue)
1988                                    else NONE
1989                           |    UnconditionalHandle _ => NONE
1990                           |    ConditionalHandle _ => NONE
1991                    in
1992                        (* First choice - continue the existing block.
1993                           Second choice - the first item whose sources have all been
1994                           processed.
1995                           Third choice - something from the list. *)
1996                        val picked =
1997                            case continuation of
1998                                SOME c => c
1999                            |   NONE =>
2000                                    case List.find available stillToDo of
2001                                        SOME c => FlowCodeSimple c
2002                                    |   NONE => FlowCodeSimple head
2003                    end
2004                    
2005                in
2006                    case picked of
2007                        FlowCodeSimple picked =>
2008                        let
2009                            val () = Array.update(processed, picked, true)
2010
2011                            (* Code to terminate the previous block. *)
2012                            val startCode =
2013                                case lastFlow of
2014                                    ExitCode => []
2015                                |   IndexedBr _ => []
2016                                |   UnconditionalHandle _ => []
2017                                |   Unconditional dest =>
2018                                        if dest = picked then [] else [UncondBranch(getBlockLabel dest)]
2019                                |   ConditionalHandle { continue, ...} =>
2020                                        if continue = picked then [] else [UncondBranch(getBlockLabel continue)]
2021                                |   SetHandler { continue, ... } =>
2022                                        if continue = picked then [] else [UncondBranch(getBlockLabel continue)]
2023                                |   Conditional { condition, trueJump, falseJump, ...} =>
2024                                    if picked = falseJump (* Usual case. *)
2025                                    then [ConditionalBranch{test=condition, label=getBlockLabel trueJump}]
2026                                    else if picked = trueJump
2027                                    then (* We have a jump to the true condition. Invert the jump.
2028                                            This is more than an optimisation.  Because this immediately precedes the
2029                                            true block we're not going to generate a label. *)
2030                                        [ConditionalBranch{test=invertTest condition, label=getBlockLabel falseJump}]
2031                                    else
2032                                    [
2033                                        UncondBranch(getBlockLabel falseJump),
2034                                        ConditionalBranch{test=condition, label=getBlockLabel trueJump}
2035                                    ]
2036
2037                            (* Code-generate the body with the code we've done so far
2038                               at the end.  Add a label at the start if necessary. *)
2039                            local
2040                                (* If the previous block dropped through to this and this was
2041                                   the only reference then we don't need a label. *)
2042                                fun onlyJumpingHere (lab: int) =
2043                                    if lab <> picked then false
2044                                    else case Array.sub(labelRefs, picked) of
2045                                        [singleton] => singleton = lab
2046                                    |   _ => false
2047                    
2048                                val noLabel =
2049                                    case lastFlow of
2050                                        ExitCode => picked = 0 (* Unless this was the first block. *)
2051                                    |   Unconditional dest => onlyJumpingHere dest
2052                                    |   Conditional { trueJump, falseJump, ...} =>
2053                                            onlyJumpingHere trueJump orelse onlyJumpingHere falseJump
2054                                    |   IndexedBr _ => false
2055                                    |   SetHandler _ => false
2056                                    |   UnconditionalHandle _ => false
2057                                    |   ConditionalHandle { continue, ...} => onlyJumpingHere continue
2058                            in
2059                                val startLabel = if noLabel then [] else [JumpLabel(getBlockLabel picked)]
2060                            end
2061
2062                            val ExtendedBasicBlock { flow, block, ...} = Vector.sub(blocks, picked)
2063
2064                            local
2065                                fun genCodeBlock(instr, code) = codeExtended {flow=flow} (instr, code)
2066                            in
2067                                val bodyCode = List.foldl genCodeBlock (startLabel @ startCode @ code) block
2068                            end
2069
2070                            val addSet =
2071                                case flow of
2072                                    ExitCode => []
2073                                |   IndexedBr cases => cases
2074                                |   Unconditional dest => [dest]
2075                                |   Conditional {trueJump, falseJump, ...} => [falseJump, trueJump]
2076                                |   SetHandler { handler, continue } => [handler, continue]
2077                                |   UnconditionalHandle _ => []
2078                                |   ConditionalHandle { continue, ...} => [continue]
2079
2080                        in
2081                            genCode(addSet @ stillToDo, flow, bodyCode)
2082                        end
2083                
2084                    |   FlowCodeCMove{code, trueJump, falseJump} =>
2085                        let
2086                            (* We've generated a conditional move and possibly a return.  If the
2087                               trueJump and falseJump are only ever referenced from this block
2088                               they're done, otherwise we still need to do them. *)
2089                            val _ =
2090                                case Array.sub(labelRefs, trueJump) of
2091                                    [_] => Array.update(processed, trueJump, true)
2092                                |   _ => ()
2093                            val _ =
2094                                case Array.sub(labelRefs, falseJump) of
2095                                    [_] => Array.update(processed, falseJump, true)
2096                                |   _ => ()
2097                            val ExtendedBasicBlock { flow, ...} = Vector.sub(blocks, trueJump)
2098                            val addSet =
2099                                case flow of
2100                                    ExitCode => []
2101                                |   Unconditional dest => [dest]
2102                                |   _ => raise InternalError "FlowCodeCMove"
2103                        in
2104                            genCode(addSet @ stillToDo, flow, code)
2105                        end
2106                end
2107        in
2108            val ops = genCode([0], ExitCode, [])
2109        end
2110    in
2111        X86OPTIMISE.generateCode{code=newCode, ops=List.rev ops,
2112                                 labelCount= !outputLabelCount, resultClosure=resultClosure}
2113    end
2114
2115    val nGenRegs = List.length generalRegisters
2116
2117    structure Sharing =
2118    struct
2119        type intSet             = intSet
2120        and extendedBasicBlock  = extendedBasicBlock
2121        and regProperty         = regProperty
2122        and reg                 = reg
2123        and closureRef          = closureRef
2124    end
2125
2126end;
2127