1(*
2    Copyright (c) 2016-20 David C.J. Matthews
3
4    This library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Lesser General Public
6    License version 2.1 as published by the Free Software Foundation.
7    
8    This library is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11    Lesser General Public License for more details.
12    
13    You should have received a copy of the GNU Lesser General Public
14    License along with this library; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
16*)
17
18functor X86FOREIGNCALL(
19
20    structure X86CODE: X86CODESIG
21
22    structure X86OPTIMISE:
23    sig
24        type operation
25        type code
26        type operations = operation list
27        type closureRef
28
29        (* Optimise and code-generate. *)
30        val generateCode: {code: code, ops: operations, labelCount: int, resultClosure: closureRef} -> unit
31
32        structure Sharing:
33        sig
34            type operation = operation
35            type code = code
36            type closureRef = closureRef
37        end
38    end
39
40    structure DEBUG: DEBUG
41    
42    structure CODE_ARRAY: CODEARRAYSIG
43
44    sharing X86CODE.Sharing = X86OPTIMISE.Sharing = CODE_ARRAY.Sharing
45): FOREIGNCALLSIG
46=
47struct
48    open X86CODE
49    open Address
50    open CODE_ARRAY
51        
52    (* Unix X64.  The first six arguments are in rdi, rsi, rdx, rcx, r8, r9.
53                  The rest are on the stack.
54       Windows X64. The first four arguments are in rcx, rdx, r8 and r9.  The rest are
55                   on the stack.  The caller must ensure the stack is aligned on 16-byte boundary
56                   and must allocate 32-byte save area for the register args.
57                   rbx, rbp, rdi, rsi, rsp, r12-r15 are saved by the called function.
58       X86/32.  Arguments are pushed to the stack.
59               ebx, edi, esi, ebp and esp are saved by the called function.
60               We use esi to hold the argument data pointer and edi to save the ML stack pointer
61       Our ML conventions use eax, ebx for the first two arguments in X86/32,
62               rax, ebx, r8, r9, r10 for the first five arguments in X86/64 and
63               rax, rsi, r8, r9 and r10 for the first five arguments in X86/64-32 bit.
64    *)
65    
66    val memRegSize = 0
67
68    val (polyWordOpSize, nativeWordOpSize) =
69        case targetArch of
70            Native32Bit     => (OpSize32, OpSize32)
71        |   Native64Bit     => (OpSize64, OpSize64)
72        |   ObjectId32Bit   => (OpSize32, OpSize64)
73    
74    (* Ebx/Rbx is used for the second argument on the native architectures but
75       is replaced by esi on the object ID arch because ebx is used as the
76       global base register. *)
77    val mlArg2Reg = case targetArch of ObjectId32Bit => esi | _ => ebx
78    
79    exception InternalError = Misc.InternalError
80  
81    fun opSizeToMove OpSize32 = Move32 | opSizeToMove OpSize64 = Move64
82
83    val pushR = PushToStack o RegisterArg
84
85    fun moveRR{source, output, opSize} =
86        Move{source=RegisterArg source, destination=RegisterArg output, moveSize=opSizeToMove opSize}
87
88    fun loadMemory(reg, base, offset, opSize) =
89        Move{source=MemoryArg{base=base, offset=offset, index=NoIndex}, destination=RegisterArg reg, moveSize=opSizeToMove opSize}
90    and storeMemory(reg, base, offset, opSize) =
91        Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset, index=NoIndex}, moveSize=opSizeToMove opSize}
92    
93    val loadHeapMemory =
94        case targetArch of
95            ObjectId32Bit =>
96                (
97                    fn (reg, base, offset, opSize) => 
98                        Move{source=MemoryArg{base=ebx, offset=offset, index=Index4 base},
99                             destination=RegisterArg reg, moveSize=opSizeToMove opSize}
100                )
101        |   _ => loadMemory
102
103
104    fun loadAddress{source=(srcReg, 0), destination} =
105            Move{source=RegisterArg srcReg, destination=RegisterArg destination, moveSize=opSizeToMove nativeWordOpSize}
106    |   loadAddress{source=(srcReg, srcOffset), destination} =
107            LoadAddress{offset=srcOffset, base=SOME srcReg, index=NoIndex, output=destination, opSize=nativeWordOpSize }
108
109    (* Sequence of operations to move memory. *)
110    fun moveMemory{source, destination, length} =
111    [
112        loadAddress{source=source, destination=rsi},
113        loadAddress{source=destination, destination=rdi},
114        (* N.B. When moving a struct in a Win64 callback the source could be rcx so only move this
115           after copying the source to rsi. *)
116        Move{source=NonAddressConstArg(LargeInt.fromInt length), destination=RegisterArg rcx,
117             moveSize=opSizeToMove nativeWordOpSize},
118        RepeatOperation MOVS8
119    ]
120
121    fun createProfileObject _ (*functionName*) =
122    let
123        (* The profile object is a single mutable with the F_bytes bit set. *)
124        open Address
125        val profileObject = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes))))
126        fun clear 0w0 = ()
127        |   clear i = (assignByte(profileObject, i-0w1, 0w0); clear (i-0w1))
128        val () = clear wordSize
129    in
130        toMachineWord profileObject
131    end
132
133    val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject"
134
135    datatype abi = X86_32 | X64Win | X64Unix
136    
137    local
138        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
139        val getABICall: unit -> int = RunCall.rtsCallFast0 "PolyGetABI"
140    in
141        fun getABI() =
142            case getABICall() of
143                0 => X86_32
144            |   1 => X64Unix
145            |   2 => X64Win
146            |   n => raise InternalError ("Unknown ABI type " ^ Int.toString n)
147    end
148
149    (* This is now the standard entry call code. *)
150    datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat
151
152
153    fun rtsCallFastGeneral (functionName, argFormats, (*resultFormat*) _, debugSwitches) =
154    let
155        val entryPointAddr = makeEntryPoint functionName
156
157        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
158        val abi = getABI()
159
160        val entryPtrReg = if targetArch <> Native32Bit then r11 else ecx
161        
162        val nArgs = List.length argFormats
163        
164        local
165            (* Compute stack space.  The actual number of args passed is nArgs. *)
166            val argSpace =
167                case abi of
168                    X64Unix => Int.max(0, nArgs-6)*8
169                |   X64Win => Int.max(0, nArgs-4)*8
170                |   X86_32 => List.foldl(fn (FastArgDouble, n) => n+8 | (_, n) => n+4) 0 argFormats
171            val align = argSpace mod 16
172        in
173            (* Add sufficient space so that esp will be 16-byte aligned after we
174               have pushed any arguments we need to push. *)
175            val stackSpace =
176                if align = 0
177                then memRegSize
178                else memRegSize + 16 - align
179        end
180
181        (* The number of ML arguments passed on the stack. *)
182        val mlArgsOnStack = Int.max(case abi of X86_32 => nArgs - 2 | _ => nArgs - 5, 0)
183
184        val code =
185            [
186                Move{source=AddressConstArg entryPointAddr, destination=RegisterArg entryPtrReg, moveSize=opSizeToMove polyWordOpSize}, (* Load the entry point ref. *)
187                loadHeapMemory(entryPtrReg, entryPtrReg, 0, nativeWordOpSize)(* Load its value. *)
188            ] @
189            (
190                (* Save heap ptr.  This is in r15 in X86/64 *)
191                if targetArch <> Native32Bit then [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] (* Save heap ptr *)
192                else []
193            ) @
194            (
195                if (case abi of X86_32 => nArgs >= 3 | _ => nArgs >= 6)
196                then [moveRR{source=esp, output=edi, opSize=nativeWordOpSize}] (* Needed if we have to load from the stack. *)
197                else []
198            ) @
199            [
200                storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *)
201                loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize),
202                (* Set the stack pointer past the data on the stack.  For Windows/64 add in a 32 byte save area *)
203                ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize}
204            ] @
205            (
206                case abi of  (* Set the argument registers. *)
207                    X86_32 =>
208                    let
209                        fun pushReg(reg, FastArgFixed) = [pushR reg]
210                        |   pushReg(reg, FastArgDouble) = 
211                            (* reg contains the address of the value.  This must be unboxed onto the stack. *)
212                                [
213                                    FPLoadFromMemory{address={base=reg, offset=0, index=NoIndex}, precision=DoublePrecision},
214                                    ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize},
215                                    FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true }
216                                ]
217                        |   pushReg(reg, FastArgFloat) =
218                            (* reg contains the address of the value.  This must be unboxed onto the stack. *)
219                            [
220                                FPLoadFromMemory{address={base=reg, offset=0, index=NoIndex}, precision=SinglePrecision},
221                                ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 4, opSize=nativeWordOpSize},
222                                FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=SinglePrecision, andPop=true }
223                            ]
224
225                        (* The stack arguments have to be copied first followed by the ebx and finally eax. *)
226                        fun pushArgs (_, []) = []
227                        |   pushArgs (_, [argType]) = pushReg(eax, argType)
228                        |   pushArgs (_, [arg2Type, arg1Type]) = pushReg(ebx, arg2Type) @ pushReg(eax, arg1Type)
229                        |   pushArgs (n, FastArgFixed :: argTypes) =
230                                PushToStack(MemoryArg{base=edi, offset=(nArgs-n+1)* 4, index=NoIndex}) :: pushArgs(n-1, argTypes)
231                        |   pushArgs (n, argType :: argTypes) =
232                                (* Use esi as a temporary register. *)
233                                loadMemory(esi, edi, (nArgs-n+1)* 4, polyWordOpSize) :: pushReg(esi, argType) @ pushArgs(n-1, argTypes)
234                    in
235                        pushArgs(nArgs, List.rev argFormats)
236                    end
237                
238                |   X64Unix =>
239                    (
240                        if List.all (fn FastArgFixed => true | _ => false) argFormats
241                        then
242                        let
243                            fun pushArgs 0 = []
244                            |   pushArgs 1 = [moveRR{source=eax, output=edi, opSize=polyWordOpSize}]
245                            |   pushArgs 2 = moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize} :: pushArgs 1
246                            |   pushArgs 3 = moveRR{source=r8, output=edx, opSize=polyWordOpSize} :: pushArgs 2
247                            |   pushArgs 4 = moveRR{source=r9, output=ecx, opSize=polyWordOpSize} :: pushArgs 3
248                            |   pushArgs 5 =
249                                    (* We have to move r8 into edx before we can move r10 into r8 *)
250                                    moveRR{source=r8, output=edx, opSize=polyWordOpSize} ::
251                                    moveRR{source=r9, output=ecx, opSize=polyWordOpSize} ::
252                                    moveRR{source=r10, output=r8, opSize=polyWordOpSize} :: pushArgs 2
253                            |   pushArgs 6 =
254                                    (* We have to move r9 into edi before we can load r9 from the stack. *)
255                                    moveRR{source=r8, output=edx, opSize=polyWordOpSize} ::
256                                    moveRR{source=r9, output=ecx, opSize=polyWordOpSize} ::
257                                    loadMemory(r9, edi, 8, polyWordOpSize) ::
258                                    moveRR{source=r10, output=r8, opSize=polyWordOpSize} :: pushArgs 2
259                            |   pushArgs _ = raise InternalError "rtsCall: Abi/argument count not implemented"
260                        in
261                            pushArgs nArgs
262                        end
263                        else case argFormats of
264                            [] => []
265                        |   [FastArgFixed] => [ moveRR{source=eax, output=edi, opSize=polyWordOpSize} ]
266                        |   [FastArgFixed, FastArgFixed] =>
267                                (* Since mlArgs2Reg is esi on 32-in-64 this is redundant. *)
268                                [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize} ]
269                        |   [FastArgFixed, FastArgFixed, FastArgFixed] => 
270                                [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize},
271                                  moveRR{source=r8, output=edx, opSize=polyWordOpSize} ]
272                        |   [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed] => 
273                                [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize},
274                                  moveRR{source=r8, output=edx, opSize=polyWordOpSize}, moveRR{source=r9, output=ecx, opSize=polyWordOpSize} ]
275                            (* One "double" argument.  The value needs to be unboxed. *)
276                        |   [FastArgDouble] => [] (* Already in xmm0 *)
277                            (* X64 on both Windows and Unix take the first arg in xmm0 and the second in xmm1. They are already there. *)
278                        |   [FastArgDouble, FastArgDouble] => []
279                        |   [FastArgDouble, FastArgFixed] => [ moveRR{source=eax, output=edi, opSize=nativeWordOpSize} ]
280                        |   [FastArgFloat] => [] (* Already in xmm0 *)
281                        |   [FastArgFloat, FastArgFloat] => [] (* Already in xmm0 and xmm1 *)
282                                (* One float argument and one fixed. *)
283                        |   [FastArgFloat, FastArgFixed] => [moveRR{source=mlArg2Reg, output=edi, opSize=polyWordOpSize} ]
284                        
285                        |   _ => raise InternalError "rtsCall: Abi/argument count not implemented"
286                            
287                    )
288                
289                |   X64Win =>
290                    (
291                        if List.all (fn FastArgFixed => true | _ => false) argFormats
292                        then
293                        let
294                            fun pushArgs 0 = []
295                            |   pushArgs 1 = [moveRR{source=eax, output=ecx, opSize=polyWordOpSize}]
296                            |   pushArgs 2 = moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} :: pushArgs 1
297                            |   pushArgs 3 = (* Already in r8 *) pushArgs 2
298                            |   pushArgs 4 = (* Already in r9, and r8 *) pushArgs 2
299                            |   pushArgs 5 = pushR r10 :: pushArgs 2
300                            |   pushArgs 6 = PushToStack(MemoryArg{base=edi, offset=8, index=NoIndex}) :: pushArgs 5
301                            |   pushArgs _ = raise InternalError "rtsCall: Abi/argument count not implemented"
302                        in
303                            pushArgs nArgs
304                        end
305
306                        else case argFormats of
307                            [FastArgFixed] => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize} ]
308                        |   [FastArgFixed, FastArgFixed] => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} ]
309                        |   [FastArgFixed, FastArgFixed, FastArgFixed] =>
310                                [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} (* Arg3 is already in r8. *) ]
311                        |   [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed] =>
312                                [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} (* Arg3 is already in r8 and arg4 in r9. *) ]
313                        |   [FastArgDouble] => [ (* Already in xmm0 *) ]
314                            (* X64 on both Windows and Unix take the first arg in xmm0 and the second in xmm1. They are already there. *)
315                        |   [FastArgDouble, FastArgDouble] => [ ]
316                            (* X64 on both Windows and Unix take the first arg in xmm0.  On Unix the integer argument is treated
317                               as the first argument and goes into edi.  On Windows it's treated as the second and goes into edx.
318                               N.B.  It's also the first argument in ML so is in rax. *)
319                        |   [FastArgDouble, FastArgFixed] => [ moveRR{source=eax, output=edx, opSize=nativeWordOpSize} ]
320                        |   [FastArgFloat] => []
321                        |   [FastArgFloat, FastArgFloat] => [] (* Already in xmm0 and xmm1 *)
322                        |   [FastArgFloat, FastArgFixed] => [moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize}]
323
324                        |   _ => raise InternalError "rtsCall: Abi/argument count not implemented"
325                    )
326            ) @
327            (*  For Windows/64 add in a 32 byte save area ater we've pushed any arguments. *)
328            (case abi of X64Win => [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg 32, opSize=nativeWordOpSize}] | _ => []) @
329            [
330                CallAddress(RegisterArg entryPtrReg), (* Call the function *)
331                loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize) (* Restore the ML stack pointer. *)
332            ] @
333            (
334            if targetArch <> Native32Bit then [loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize) ] (* Copy back the heap ptr *)
335            else []
336            ) @
337            [
338                (* Since this is an ML function we need to remove any ML stack arguments. *)
339                ReturnFromFunction mlArgsOnStack
340            ]
341 
342        val profileObject = createProfileObject functionName
343        val newCode = codeCreate (functionName, profileObject, debugSwitches)
344        val closure = makeConstantClosure()
345        val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure}
346    in
347        closureAsAddress closure
348    end
349    
350    
351    fun rtsCallFast (functionName, nArgs, debugSwitches) =
352        rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches)
353    
354    (* RTS call with one double-precision floating point argument and a floating point result. *)
355    fun rtsCallFastRealtoReal (functionName, debugSwitches) =
356        rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches)
357    
358    (* RTS call with two double-precision floating point arguments and a floating point result. *)
359    fun rtsCallFastRealRealtoReal (functionName, debugSwitches) =
360        rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches)
361
362    (* RTS call with one double-precision floating point argument, one fixed point argument and a
363       floating point result. *)
364    fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) =
365        rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches)
366
367    (* RTS call with one general (i.e. ML word) argument and a floating point result.
368       This is used only to convert arbitrary precision values to floats. *)
369    fun rtsCallFastGeneraltoReal (functionName, debugSwitches) =
370        rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches)
371
372    (* Operations on Real32.real values. *)
373
374    fun rtsCallFastFloattoFloat (functionName, debugSwitches) =
375        rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches)
376    
377    fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) =
378        rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches)
379
380    (* RTS call with one double-precision floating point argument, one fixed point argument and a
381       floating point result. *)
382    fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) =
383        rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches)
384
385    (* RTS call with one general (i.e. ML word) argument and a floating point result.
386       This is used only to convert arbitrary precision values to floats. *)
387    fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) =
388        rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches)
389
390    datatype ffiABI =
391        FFI_SYSV        (* Unix 32 bit and Windows GCC 32-bit *)
392    |   FFI_STDCALL     (* Windows 32-bit system ABI.  Callee clears the stack. *)
393    |   FFI_MS_CDECL    (* VS 32-bit.  Same as SYSV except when returning a struct.  Default on Windows including GCC in Mingw. *)
394    |   FFI_WIN64       (* Windows 64 bit *)
395    |   FFI_UNIX64      (* Unix 64 bit. libffi also implements this on X86/32. *)
396    (* We don't include various other 32-bit Windows ABIs. *)
397
398    local
399        val getOSType: unit -> int = RunCall.rtsCallFast0 "PolyGetOSType"
400    in
401        (* This actually a constant since each exported saved state has
402           a distinct ABI.  However for compatibility with the interpreted
403           version we make this a function. *)
404        fun abiList () =
405            case getABI() of
406                X86_32 =>
407                    [("sysv", FFI_SYSV), ("stdcall", FFI_STDCALL), ("ms_cdecl", FFI_MS_CDECL),
408                        (* Default to MS_CDECL on Windows otherwise SYSV. *)
409                     ("default", if getOSType() = 1 then FFI_MS_CDECL else FFI_SYSV)]
410            |   X64Win => [("win64", FFI_WIN64), ("default", FFI_WIN64)]
411            |   X64Unix => [("unix64", FFI_UNIX64), ("default", FFI_UNIX64)]
412
413        type abi = ffiABI
414    end
415
416    fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align)
417    fun intAlignUp(s, align) = Word.toInt(alignUp(Word.fromInt s, align))
418    
419    val getThreadDataCall = makeEntryPoint "PolyX86GetThreadData"
420
421    local
422        val sysWordSize = Word.toInt(nativeWordSize div wordSize)
423    in
424        (* Code to box an address as a SysWord.word value *)
425        fun boxRegAsSysWord(boxReg, outputReg, saveRegs) =
426            AllocStore{ size=sysWordSize, output=outputReg, saveRegs=saveRegs } ::
427            (
428                if targetArch = Native64Bit
429                then 
430                [
431                    Move{source=NonAddressConstArg(LargeInt.fromInt sysWordSize),
432                          destination=MemoryArg {offset= ~ (Word.toInt wordSize), base=outputReg, index=NoIndex},
433                          moveSize=opSizeToMove polyWordOpSize},
434                    Move{moveSize=Move8, source=NonAddressConstArg 1 (* byte *), destination=MemoryArg {offset= ~1, base=outputReg, index=NoIndex}}
435                ]
436                else
437                let
438                    val lengthWord = IntInf.orb(IntInf.fromInt sysWordSize, IntInf.<<(1, 0w24))
439                in
440                    [Move{source=NonAddressConstArg lengthWord, destination=MemoryArg {offset= ~ (Word.toInt wordSize), base=outputReg, index=NoIndex},
441                          moveSize=opSizeToMove polyWordOpSize}]
442                end
443            ) @
444            Move{source=RegisterArg boxReg, destination=MemoryArg {offset=0, base=outputReg, index=NoIndex}, moveSize=opSizeToMove nativeWordOpSize} ::
445            (
446                if targetArch = ObjectId32Bit
447                then
448                [
449                    ArithToGenReg{ opc=SUB, output=outputReg, source=RegisterArg rbx, opSize=nativeWordOpSize },
450                    ShiftConstant{ shiftType=SHR, output=outputReg, shift=0w2, opSize=OpSize64 }
451                ]
452                else []
453            ) @ [StoreInitialised]
454    end
455
456    (* Build a foreign call function.  The arguments are the abi, the list of argument types and the result type.
457       The result is the code of the ML function that takes three arguments: the C function to call, the arguments
458       as a vector of C values and the address of the memory for the result. *)
459
460    (* This must match the type in Foreign.LowLevel.  Once this is bootstrapped we could use that
461       type but note that this is the type we use within the compiler and we build Foreign.LowLevel
462       AFTER compiling this. *)
463    datatype cTypeForm =
464        CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt
465    |   CTypeStruct of cType list | CTypeVoid
466    withtype cType = { typeForm: cTypeForm, align: word, size: word }
467
468    fun call32Bits(abi, args, result) =
469    let
470        (* 32-bit arguments.  These all go to the stack so we can simply push them.  The arguments go on the
471           stack in reverse order. *)
472        fun loadArgs32([], stackOffset, argOffset, code, continue) = continue(stackOffset, argOffset, code)
473
474        |   loadArgs32(arg::args, stackOffset, argOffset, code, continue) =
475            let
476                val {size, align, typeForm} = arg
477                val newArgOffset = alignUp(argOffset, align)
478                val baseAddr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex}
479            in
480                case (typeForm, size) of
481                    (CTypeStruct elements, _) => (* structs passed as values are recursively unpacked. *)
482                        loadArgs32(elements, stackOffset, newArgOffset (* Struct is aligned. *), code,
483                            fn (so, ao, code) => loadArgs32(args, so, ao, code, continue))
484                |   (CTypeVoid, _) => raise Foreign.Foreign "Void cannot be used for a function argument"
485                |   (CTypeUnsignedInt, 0w1) => (* Unsigned char. *)
486                        loadArgs32(args, stackOffset+4, newArgOffset+size,
487                            Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move8 }
488                                :: PushToStack(RegisterArg edx) :: code, continue)
489                |   (CTypeSignedInt, 0w1) => (* Signed char. *)
490                        loadArgs32(args, stackOffset+4, newArgOffset+size,
491                            Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move8X32 }
492                                :: PushToStack(RegisterArg edx) :: code, continue)
493                |   (CTypeUnsignedInt, 0w2) => (* Unsigned 16-bits. *)
494                        loadArgs32(args, stackOffset+4, newArgOffset+size,
495                            Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move16 }
496                                :: PushToStack(RegisterArg edx) :: code, continue)
497                |   (CTypeSignedInt, 0w2) => (* Signed 16-bits. *)
498                        loadArgs32(args, stackOffset+4, newArgOffset+size,
499                            Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move16X32 }
500                                :: PushToStack(RegisterArg edx) :: code, continue)
501                |   (_, 0w4) => (* 32-bits. *)
502                        loadArgs32(args, stackOffset+4, newArgOffset+size, PushToStack(MemoryArg baseAddr) :: code, continue)
503                |   (CTypeFloatingPt, 0w8) =>(* Double: push the two words.  High-order word first, then low-order. *)
504                        loadArgs32(args, stackOffset+8, newArgOffset+size,
505                            PushToStack(MemoryArg{base=mlArg2Reg, offset=Word.toInt newArgOffset+4, index=NoIndex}) ::
506                                PushToStack(MemoryArg{base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex}) :: code, continue)
507                |   _ => raise Foreign.Foreign "argument type not supported"
508            end
509
510        val {typeForm, size, ...} =  result
511
512        val resultMemory = {base=ecx, offset=0, index=NoIndex}
513        (* Structures are passed by reference by storing the address of the result as
514           the first argument except that in MS_CDECL (and STDCALL?) structures of
515           size 1, 2, 4 and 8 are returned in EAX, and for 8, EDX.  *)
516        val (getResult, needResultAddress) =
517            if (case typeForm of CTypeStruct _ => true | _ => false) andalso
518                (abi = FFI_SYSV orelse (size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8))
519            (* TODO: We have to get the address of the destination area. *)
520            then ([], true)
521            else if typeForm = CTypeVoid
522            then ([], false)
523            else
524                (loadMemory(ecx, esp, 4, nativeWordOpSize)  ::
525                loadHeapMemory(ecx, ecx, 0, nativeWordOpSize) ::
526                (if size = 0w1
527                then (* Single byte *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move8}]
528                else if size = 0w2
529                then (* 16-bits *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move16}]
530                else if typeForm = CTypeFloatingPt andalso size = 0w4
531                then [FPStoreToMemory{address=resultMemory, precision=SinglePrecision, andPop=true }]
532                else if size = 0w4
533                then [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32}]
534                else if typeForm = CTypeFloatingPt andalso size = 0w8
535                then [FPStoreToMemory{address=resultMemory, precision=DoublePrecision, andPop=true }]
536                else if size = 0w8
537                then
538                [
539                    Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32},
540                    Move{source=RegisterArg edx, destination=MemoryArg {base=ecx, offset=4, index=NoIndex}, moveSize=Move32}
541                ]
542                else raise Foreign.Foreign "Unrecognised result type"), false)
543
544        local
545            (* Load the arguments.  If we need to pass the return address for a struct that is the first arg. *)
546            val (startStack, startCode) =
547                if needResultAddress
548                then (4, [PushToStack(MemoryArg{base=ecx, offset=0, index=NoIndex})])
549                else (0, [])
550        in
551            val (argCode, argStack) =
552                loadArgs32(args, startStack, 0w0, startCode, fn (stackOffset, _, code) => (code, stackOffset))
553        end
554
555        local
556            val align = argStack mod 16
557        in
558            (* Always align the stack.  It's not always necessary on 32-bits but GCC prefers it. *)
559            val preArgAlign = if align = 0 then 0 else 16-align
560            (* Adjustment to be made when the function returns.  Stdcall resets the stack in the callee. *)
561            val postCallStackReset =
562                preArgAlign + (if abi = FFI_STDCALL then 0 else argStack)
563        end
564
565        in
566            (
567                (* If we're returning a struct we need the result address before we call. *)
568                if needResultAddress then [loadMemory(ecx, esp, 4, nativeWordOpSize)] else []
569            ) @
570            [
571                (* Save the stack pointer. *)
572                storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *)
573                loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize)  (* Load the saved C stack pointer. *)
574            ] @
575            (
576                if preArgAlign = 0
577                then []
578                else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}]
579            ) @
580            (
581                (* The second argument is a SysWord containing the address of a malloced area of memory
582                   with the actual arguments in it. *)
583                if null args
584                then []
585                else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)]
586            ) @ argCode @
587            CallAddress(MemoryArg{base=eax, offset=0, index=NoIndex}) ::
588            (* Restore the C stack.  This is really only necessary if we've called a callback
589               since that will store its esp value.  *)
590            (
591                if postCallStackReset = 0
592                then []
593                else [ArithToGenReg{opc=ADD, output=esp, source=NonAddressConstArg(LargeInt.fromInt postCallStackReset), opSize=nativeWordOpSize}]
594            ) @
595            [
596                storeMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize),
597                loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize) (* Restore the ML stack pointer. *)
598            ] @ getResult @ (* Store the result in the destination. *) [ ReturnFromFunction 1 ]
599        end
600    
601    fun closure32Bits(abi, args, result) =
602    let
603        (* Arguments are copied from the stack into a struct that is then passed to the
604           ML function. *)
605        fun copyArgs([], nArgs, argOffset, code, continue) = continue(nArgs, argOffset, code)
606
607        |   copyArgs(arg::args, nArgs, argOffset, code, continue) =
608            let
609                val {size, align, typeForm} =  arg
610                val newArgOffset = alignUp(argOffset, align)
611                val sourceAddr = {base=ebx, offset=nArgs*4, index=NoIndex}
612                val destAddr = {base=esp, offset=Word.toInt newArgOffset, index=NoIndex}
613            in
614                case (typeForm, size) of
615                    (CTypeStruct elements, _) =>
616                        (* structs passed as values are recursively unpacked. *)
617                        copyArgs(elements, nArgs, newArgOffset (* Struct is aligned. *), code,
618                            fn (na, ao, c) => copyArgs(args, na, ao, c, continue))
619                |   (CTypeVoid, _) =>
620                        raise Foreign.Foreign "Void cannot be used for a function argument"
621                |   (CTypeFloatingPt, 0w8) =>
622                    (* Double: copy the two words.  High-order word first, then low-order. *)
623                    copyArgs(args, nArgs+2, argOffset+size,
624                        Move{source=MemoryArg sourceAddr, destination=RegisterArg eax, moveSize=Move32} ::
625                        Move{source=RegisterArg eax, destination=MemoryArg destAddr, moveSize=Move32} ::
626                        Move{source=MemoryArg {base=ebx, offset=nArgs*4+4, index=NoIndex}, destination=RegisterArg eax, moveSize=Move32} ::
627                        Move{source=RegisterArg eax, destination=MemoryArg{base=esp, offset=Word.toInt newArgOffset + 4, index=NoIndex}, moveSize=Move32} ::
628                            code, continue)
629                |   _ => (* Everything else is a single word on the stack. *)
630                    let
631                        val moveOp =
632                            case size of
633                                0w1 => Move8
634                            |   0w2 => Move16
635                            |   0w4 => Move32
636                            |   _ => raise Foreign.Foreign "copyArgs: Invalid size"
637                    in
638                        copyArgs(args, nArgs+1, argOffset+size,
639                            Move{source=MemoryArg sourceAddr, destination=RegisterArg eax, moveSize=Move32} ::
640                                Move{source=RegisterArg eax, destination=MemoryArg destAddr, moveSize=moveOp} :: code, continue)
641                    end
642            end
643
644        val {typeForm, size, align, ...} =  result
645
646        (* Struct results are normally passed by reference. *)
647        val resultStructByRef =
648            (case typeForm of CTypeStruct _ => true | _ => false) andalso
649                (abi = FFI_SYSV orelse (size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8))
650
651        val (argCount, argumentSpace, copyArgsFromStack) =
652            copyArgs(args, if resultStructByRef then 1 else 0, 0w0, [], fn result => result)
653
654        val resultOffset = alignUp(argumentSpace, align) (* Offset of result area *)
655
656        val (loadResults, resultSize) =
657            if typeForm = CTypeVoid orelse resultStructByRef
658            then ([], 0w0)
659            else
660            let
661                val resultMem = {base=esp, offset=Word.toInt resultOffset, index=NoIndex}
662                val resultCode =
663                    case (typeForm, size) of
664                        (CTypeSignedInt, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move8X32 }]
665                    |   (_, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move8 }]
666                    |   (CTypeSignedInt, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move16X32 }]
667                    |   (_, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move16 }]
668                    |   (CTypeFloatingPt, 0w4) => [FPLoadFromMemory{ address=resultMem, precision=SinglePrecision }]
669                    |   (_, 0w4) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move32 }]
670                    |   (CTypeFloatingPt, 0w8) => [FPLoadFromMemory{ address=resultMem, precision=DoublePrecision }]
671                    |   (_, 0w8) => (* MSC only.  Struct returned in eax/edx. *)
672                        [
673                            Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move32 },
674                            Move{source=MemoryArg {base=esp, offset=Word.toInt resultOffset + 4, index=NoIndex},
675                                destination=RegisterArg edx, moveSize=Move32 }
676                        ]
677                    |   _ => raise Foreign.Foreign "Unrecognised result type"
678            in
679                (resultCode, size)
680            end
681
682        val stackSpace = Word.toInt(resultOffset + resultSize)
683        
684        local
685            val align = stackSpace mod 16
686        in
687            (* Stack space.  In order to align the stack correctly for GCC we need the value in memRegCStackPtr
688               to be a multiple of 16 bytes + 8.  esp would have been on a 16 byte boundary before the return address
689               was pushed so after pushing the return address and four registers we need a further 4 bytes
690               to get the alignment back again.  The effect of this is that the argument and result area is
691               on an 8-byte boundary. *)
692            val stackToAllocate = stackSpace + (if align = 0 then 0 else 16-align) + 4
693        end
694    in
695        [
696            (* Push callee-save registers. *)
697            PushToStack(RegisterArg ebp), PushToStack(RegisterArg ebx), PushToStack(RegisterArg edi), PushToStack(RegisterArg esi),
698            (* Set ebx to point to the original args. *)
699            LoadAddress{ output=ebx, offset=20, base=SOME esp, index=NoIndex, opSize=OpSize32},
700            (* Allocate stack space. *)
701            ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=OpSize32},
702            (* Move the function address in eax into esi since that's a callee-save register. *)
703            Move{source=RegisterArg eax, destination=RegisterArg esi, moveSize=Move32}
704        ] @ copyArgsFromStack @
705        [
706            (* Get the value for ebp. *)
707            Move{source=AddressConstArg getThreadDataCall, destination=RegisterArg ecx, moveSize=Move32},
708            CallAddress(MemoryArg{base=ecx, offset=0, index=NoIndex}), (* Get the address - N.B. Heap addr in 32-in-64. *)
709            moveRR{source=eax, output=ebp, opSize=OpSize32},
710            (* Save the address of the argument and result area. *)
711            moveRR{source=esp, output=ecx, opSize=OpSize32},
712            (* Switch to the ML stack. *)
713            storeMemory(esp, ebp, memRegCStackPtr, OpSize32),
714            loadMemory(esp, ebp, memRegStackPtr, OpSize32),
715            (* Move esi into the closure register edx *)
716            Move{source=RegisterArg esi, destination=RegisterArg edx, moveSize=Move32}
717        ] @ boxRegAsSysWord(ecx, eax, []) @
718        (
719            (* If we're returning a struct the address for the result will have been passed in the
720               first argument.  We use that as the result area.  Otherwise point to the result
721               area on the stack.  *)
722            if resultStructByRef
723            then Move{source=MemoryArg {offset=0, base=ebx, index=NoIndex}, destination=RegisterArg ecx, moveSize=Move32}
724            else ArithToGenReg{opc=ADD, output=ecx, source=NonAddressConstArg(Word.toLargeInt resultOffset), opSize=OpSize32}
725        ) :: boxRegAsSysWord(ecx, ebx, [eax]) @
726        [
727            (* Call the ML function using the full closure call. *)
728            CallAddress(MemoryArg{offset=0, base=edx, index=NoIndex}),
729            (* Save the ML stack pointer because we may have grown the stack.  Switch to the C stack. *)
730            storeMemory(esp, ebp, memRegStackPtr, OpSize32),
731            loadMemory(esp, ebp, memRegCStackPtr, OpSize32)
732        ] @ loadResults @
733        [
734            (* Remove the stack space. *)
735            ArithToGenReg{opc=ADD, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=OpSize32},
736            PopR esi, PopR edi, PopR ebx, PopR ebp (* Restore callee-save registers. *)
737        ] @
738        (
739            (* If we've passed in the address of the area for the result structure
740               we're supposed to pass that back in eax. *)
741            if resultStructByRef
742            then [loadMemory(eax, esp, 4, OpSize32)]
743            else []
744        ) @
745        [
746            (* Callee removes arguments in StdCall. *)
747            ReturnFromFunction (if abi = FFI_STDCALL then argCount else 0)
748        ]
749    end
750
751    local (* Windows on X64. *)
752        val win64ArgRegs = [ (rcx, xmm0), (rdx, xmm1), (r8, xmm2), (r9, xmm3) ]
753    in
754        fun callWindows64Bits(args, result) =
755        let
756            val extraStackReg = r10 (* Not used for any arguments. *)
757
758            fun loadWin64Args([], stackOffset, _, _, code, extraStack, preCode) = (code, stackOffset, preCode, extraStack)
759
760            |   loadWin64Args(arg::args, stackOffset, argOffset, regs, code, extraStack, preCode) =
761                let
762                    val {size, align, typeForm, ...} =  arg
763                    val newArgOffset = alignUp(argOffset, align)
764                    val baseAddr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex}
765                    val workReg = rcx (* rcx: the last to be loaded. *)
766                
767                    (* Integer arguments. *)
768                    fun loadIntArg moveOp =
769                        case regs of
770                            (areg, _) :: regs' => 
771                                loadWin64Args(args, stackOffset, newArgOffset+size, regs',
772                                    Move{source=MemoryArg baseAddr, destination=RegisterArg areg, moveSize=moveOp } :: code,
773                                    extraStack, preCode)
774                        |   [] =>
775                                loadWin64Args(args, stackOffset+8, newArgOffset+size, [],
776                                    if size = 0w8
777                                    then PushToStack(MemoryArg baseAddr) :: code
778                                    else (* Need to load it into a register first. *)
779                                        Move{source=MemoryArg baseAddr,
780                                               destination=RegisterArg workReg, moveSize=moveOp } :: PushToStack(RegisterArg workReg) :: code,
781                                    extraStack, preCode)
782                in
783                    (* Structs of 1, 2, 4 and 8 bytes are passed as the corresponding int.  It may not
784                       be necessary to sign-extend 1, 2 or 4-byte values.
785                       2, 4 or 8-byte structs may not be aligned onto the appropriate boundary but
786                       it should still work. *)
787                    case (size, typeForm) of
788                        (0w1, CTypeSignedInt) => (* Signed char. *) loadIntArg Move8X64
789                    |   (0w1, _) => (* Unsigned char or single byte struct *) loadIntArg Move8
790
791                    |   (0w2, CTypeSignedInt) =>(* Signed 16-bits. *) loadIntArg Move16X64
792                    |   (0w2, _) => (* Unsigned 16-bits. *) loadIntArg Move16
793
794                    |   (0w4, CTypeFloatingPt) =>
795                        (
796                            case regs of
797                                (_, fpReg) :: regs' => 
798                                loadWin64Args(args, stackOffset, newArgOffset+size, regs',
799                                    XMMArith{opc=SSE2MoveFloat, source=MemoryArg baseAddr, output=fpReg } :: code, extraStack, preCode)
800                            |   [] =>
801                                loadWin64Args(args, stackOffset+8, newArgOffset+size, [],
802                                    Move{source=MemoryArg baseAddr,
803                                           destination=RegisterArg workReg, moveSize=Move32 } :: PushToStack(RegisterArg workReg) :: code,
804                                    extraStack, preCode)
805                        )
806                    |   (0w4, CTypeSignedInt) => (* Signed 32-bits. *) loadIntArg Move32X64
807                    |   (0w4, _) => (* Unsigned 32-bits. *) loadIntArg Move32
808
809                    |   (0w8, CTypeFloatingPt) =>
810                        (
811                            case regs of
812                                (_, fpReg) :: regs' => 
813                                loadWin64Args(args, stackOffset, newArgOffset+size, regs',
814                                    XMMArith{opc=SSE2MoveDouble, source=MemoryArg baseAddr, output=fpReg } :: code, extraStack, preCode)
815                            |   [] =>
816                                loadWin64Args(args, stackOffset+8, newArgOffset+size, [],
817                                    Move{source=MemoryArg baseAddr,
818                                           destination=RegisterArg workReg, moveSize=Move64 } :: PushToStack(RegisterArg workReg) :: code,
819                                    extraStack, preCode)
820                        )
821                    |   (0w8, _) => (* 64-bits. *) loadIntArg Move64
822
823                    |   (_, CTypeStruct _) =>
824                        let
825                            (* Structures of other sizes are passed by reference.  They are first
826                               copied into new areas on the stack.  This ensures that the called function
827                               can update the structure without changing the original values. *)
828                            val newExtra = intAlignUp(extraStack + Word.toInt size, 0w16)
829                            val newPreCode =
830                                moveMemory{source=(mlArg2Reg, Word.toInt newArgOffset), destination=(extraStackReg, extraStack),
831                                           length=Word.toInt size} @ preCode
832                        in
833                            case regs of
834                                (areg, _) :: regs' => 
835                                loadWin64Args(args, stackOffset, newArgOffset+size, regs',
836                                    loadAddress{source=(extraStackReg, extraStack), destination=areg} :: code,
837                                    newExtra, newPreCode)
838                            |   [] =>
839                                loadWin64Args(args, stackOffset+8, newArgOffset+size, [],
840                                    loadAddress{source=(extraStackReg, extraStack), destination=workReg} ::
841                                    PushToStack(RegisterArg workReg) :: code, newExtra, newPreCode)
842                        end
843                   
844                    |   _ => raise Foreign.Foreign "Unrecognised type for function argument"
845                end
846
847            val {typeForm, size, ...} =  result
848        
849            val resultAreaPtr = r12 (* Saved value of r8 - This is callee save. *)
850            val resultMemory = {base=resultAreaPtr, offset=0, index=NoIndex}
851            fun storeIntValue moveOp =
852                ([Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=moveOp}], false)
853            and storeFloatValue precision =
854                ([XMMStoreToMemory{toStore=xmm0, address=resultMemory, precision=precision}], false)
855
856            val (getResult, passStructAddress) =
857                case (typeForm, size) of
858                    (CTypeVoid, _) => ([], false)
859                |   (_, 0w1) (* Includes structs *) => (* Single byte *) storeIntValue Move8
860                |   (_, 0w2) => (* 16-bits *) storeIntValue Move16
861                |   (CTypeFloatingPt, 0w4) => storeFloatValue SinglePrecision
862                |   (_, 0w4) => storeIntValue Move32
863                |   (CTypeFloatingPt, 0w8) => storeFloatValue DoublePrecision
864                |   (_, 0w8) => storeIntValue Move64
865                |   (CTypeStruct _, _) => ([], true)
866                |   _ => raise Foreign.Foreign "Unrecognised result type"
867
868            (* argCode is the code to load and push the arguments.  argStack is the amount of stack space
869               the arguments will take.  It's only used to ensure that the stack is aligned onto a 16-byte
870               boundary.  preArgCode is any code that is needed to copy the arguments before they are
871               actually loaded.  Because it is done before the argument registers are loaded it can
872               use rcx, rdi and rsi.  extraStack is local stack space needed.  It is usually zero but
873               if it is non-zero it must be a multiple of 16 bytes.  The address of this area is loaded
874               into r10 before preArgCode is called. *)
875            val (argCode, argStack, preArgCode, extraStack) =
876                if passStructAddress
877                then (* The address of the result structure goes in the first argument register: rcx *)
878                    loadWin64Args(args, 0, 0w0, tl win64ArgRegs,
879                        [moveRR{source=resultAreaPtr, output=rcx, opSize=nativeWordOpSize}], 0, [])
880                else loadWin64Args(args, 0, 0w0, win64ArgRegs, [], 0, [])
881
882            local
883                val align = argStack mod 16
884            in
885                (* Always align the stack. *)
886                val preArgAlign = if align = 0 then 0 else 16-align
887                (* The total space on the stack that needs to be removed at the end. *)
888                val postCallStackReset = argStack + preArgAlign + extraStack + 32
889            end
890
891            in
892                (* Save heap ptr.  Needed in case we have a callback. *)
893                [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] @
894                (
895                    (* Put the destination address into a callee save resgister.
896                       We have to put the C address in there now because an ML address wouldn't be updated
897                       by a possible GC in a callback. *)
898                    if #typeForm( result) <> CTypeVoid
899                    then [loadHeapMemory(resultAreaPtr, r8, 0, nativeWordOpSize)]
900                    else []
901                ) @
902                [
903                    (* Save the stack pointer. *)
904                    storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *)
905                    loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize)  (* Load the saved C stack pointer. *)
906                ] @
907                (
908                    if extraStack = 0
909                    then []
910                    else
911                    [
912                        ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt extraStack), opSize=nativeWordOpSize},
913                        Move{source=RegisterArg rsp, destination=RegisterArg extraStackReg, moveSize=Move64}
914                    ]
915                ) @
916                (
917                    if preArgAlign = 0
918                    then []
919                    else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}]
920                ) @
921                (
922                    (* The second argument is a SysWord containing the address of a malloced area of memory
923                       with the actual arguments in it. *)
924                    if null args
925                    then []
926                    else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)]
927                ) @ preArgCode @ argCode @
928                [ (* Reserve a 32-byte area after the arguments.  This is specific to the Windows ABI. *)
929                    ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt 32), opSize=nativeWordOpSize},
930                    let
931                        (* The entry point is in a SysWord.word value in RAX. *)
932                        val entryPoint =
933                            case targetArch of
934                                ObjectId32Bit => MemoryArg{base=ebx, offset=0, index=Index4 eax}
935                            |   _ => MemoryArg{base=eax, offset=0, index=NoIndex}
936                    in
937                        (* Call the function.  We're discarding the value in rsp so no need to remove args. *)
938                        CallAddress entryPoint
939                    end,
940                    (* Restore the C stack value in case it's been changed by a callback. *)
941                    ArithToGenReg{opc=ADD, output=rsp, source=NonAddressConstArg(LargeInt.fromInt postCallStackReset), opSize=nativeWordOpSize},
942                    storeMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize),
943                    loadMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize), (* Restore the ML stack pointer. *)
944                    (* Reload the heap pointer.  If we've called back to ML this could well have changed. *)
945                    loadMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize)
946                ] @ (* Store the result in the destination. *) getResult @ [ReturnFromFunction 0 ]
947            end (* callWindows64Bits *)
948
949        fun closureWindows64Bits(args, result) =
950        let
951            val {typeForm, size, align, ...} =  result
952
953            (* Struct results are normally passed by reference. *)
954            val resultStructByRef = (* If true we've copied rcx (the first arg) into r9 *)
955                (case typeForm of CTypeStruct _ => true | _ => false) andalso
956                    size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8
957
958            (* Store the register arguments and copy everything else into the argument structure on the stack.
959               The code is ordered so that the early arguments are stored first. *)
960            fun copyWin64Args([], _, _, _) = []
961
962            |   copyWin64Args(arg::args, nStackArgs, argOffset, regs) =
963                let
964                    val {size, align, typeForm, ...} =  arg
965                    val newArgOffset = alignUp(argOffset, align)
966                    val destAddr = {base=rsp, offset=Word.toInt newArgOffset, index=NoIndex}
967                
968                    (* Integer arguments. *)
969                    fun moveIntArg moveOp =
970                        case regs of
971                            (areg, _) :: regs' =>
972                                Move{source=RegisterArg areg, destination=MemoryArg destAddr, moveSize=moveOp } ::
973                                    copyWin64Args(args, nStackArgs, newArgOffset+size, regs')
974                        |   [] =>
975                                Move{source=MemoryArg {base=r10, offset=nStackArgs*8, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64} ::
976                                    Move{source=RegisterArg rax, destination=MemoryArg destAddr, moveSize=moveOp} :: 
977                                    copyWin64Args(args, nStackArgs+1, newArgOffset+size, [])
978                in
979                    (* Structs of 1, 2, 4 and 8 bytes are passed as the corresponding int. *)
980                    case (typeForm, size) of
981                        (_, 0w1) => moveIntArg Move8
982                    |   (_, 0w2) => moveIntArg Move16
983
984                    |   (CTypeFloatingPt, 0w4) =>
985                        (
986                            case regs of
987                                (_, fpReg) :: regs' =>
988                                    XMMStoreToMemory{ toStore=fpReg, address=destAddr, precision=SinglePrecision} :: 
989                                        copyWin64Args(args, nStackArgs, newArgOffset+size, regs')
990                            |   [] => moveIntArg Move32
991                        )
992                    |   (_, 0w4) => (* 32-bits *) moveIntArg Move32
993
994                    |   (CTypeFloatingPt, 0w8) =>
995                        (
996                            case regs of
997                                (_, fpReg) :: regs' =>
998                                    XMMStoreToMemory{ toStore=fpReg, address=destAddr, precision=DoublePrecision} :: 
999                                        copyWin64Args(args, nStackArgs, newArgOffset+size, regs')
1000                            |   [] => moveIntArg Move64
1001                        )
1002                    |   (_, 0w8) => (* 64-bits. *) moveIntArg Move64
1003
1004                    |   (CTypeStruct _, _) =>
1005                        (* Structures of other size are passed by reference.  We need to copy the source
1006                               structure into our stack area.  Since rsi and rdi aren't used as args and
1007                               rcx is only used for the first argument we can copy the argument now. *)
1008                        (
1009                            case regs of
1010                                (arg, _) :: regs' =>
1011                                    moveMemory{source=(arg, 0), destination=(rsp, Word.toInt newArgOffset), length=Word.toInt size} @
1012                                    copyWin64Args(args, nStackArgs, newArgOffset+size, regs')
1013                            |   [] =>
1014                                    moveMemory{source=(r10, nStackArgs*8), destination=(rsp, Word.toInt newArgOffset), length=Word.toInt size} @
1015                                    copyWin64Args(args, nStackArgs+1, newArgOffset+size, [])
1016                        )
1017                            
1018                    |   _ => raise Foreign.Foreign "Unrecognised type for function argument"
1019                end
1020
1021            val copyArgsFromRegsAndStack =
1022                if resultStructByRef
1023                then (* If we're returning a struct by reference we copy the address into r9 and pass that
1024                        as the result address. *)
1025                    Move{source=RegisterArg rcx, destination=RegisterArg r9, moveSize=Move64} ::
1026                        copyWin64Args(args, 0, 0w0, tl win64ArgRegs)
1027                else copyWin64Args(args, 0, 0w0, win64ArgRegs)
1028
1029            local
1030                fun getNextSize (arg, argOffset) =
1031                let val {size, align, ...} =  arg in alignUp(argOffset, align) + size end
1032            in
1033                val argumentSpace = List.foldl getNextSize 0w0 args
1034            end
1035
1036            val resultOffset = alignUp(argumentSpace, align) (* Offset of result area *)
1037        
1038            val (loadResults, resultSize) =
1039                if typeForm = CTypeVoid orelse resultStructByRef
1040                then ([], 0w0)
1041                else
1042                let
1043                    val resultMem = {base=rsp, offset=Word.toInt resultOffset, index=NoIndex}
1044                    val resultCode =
1045                        case (typeForm, size) of
1046                            (CTypeSignedInt, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move8X64}]
1047                        |   (_, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move8}]
1048                        |   (CTypeSignedInt, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move16X64}]
1049                        |   (_, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move16}]
1050                        |   (CTypeFloatingPt, 0w4) => [XMMArith{opc=SSE2MoveFloat, source=MemoryArg resultMem, output=xmm0}]
1051                        |   (CTypeSignedInt, 0w4) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move32X64}]
1052                        |   (_, 0w4) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move32}]
1053                        |   (CTypeFloatingPt, 0w8) => [XMMArith{opc=SSE2MoveDouble, source=MemoryArg resultMem, output=xmm0}]
1054                        |   (_, 0w8) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move64}]
1055                        |   _ => raise Foreign.Foreign "Unrecognised result type"
1056                in
1057                    (resultCode, size)
1058                end
1059
1060            (* Stack space.  The stack must be 16 byte aligned.  We've pushed 8 regs and a return address
1061               so add a further 8 bytes to bring it back into alignment.  If we're returning a struct
1062               by reference, though, we've pushed 9 regs so don't add 8. *)
1063            val stackToAllocate =
1064                Word.toInt(alignUp(resultOffset + resultSize, 0w16)) + (if resultStructByRef then 0 else 8)
1065        in
1066        [
1067            (* Push callee-save registers. *)
1068            PushToStack(RegisterArg rbp), PushToStack(RegisterArg rbx), PushToStack(RegisterArg r12), PushToStack(RegisterArg r13),
1069            PushToStack(RegisterArg r14), PushToStack(RegisterArg r15), PushToStack(RegisterArg rdi), PushToStack(RegisterArg rsi)
1070        ] @
1071        (
1072            (* If we're returning a struct by reference we have to return the address in rax even though
1073               it's been set by the caller.  Save this address. *)
1074            if resultStructByRef
1075            then [PushToStack(RegisterArg rcx)]
1076            else []
1077        ) @
1078        [
1079            (* Set r10 to point to the original stack args if any.  This is beyond the pushed regs and also the 32-byte area. *)
1080            LoadAddress{ output=r10, offset=if resultStructByRef then 112 else 104, base=SOME rsp, index=NoIndex, opSize=nativeWordOpSize},
1081            (* Allocate stack space. *)
1082            ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize},
1083            (* Move the function we're calling, in rax, into r13, a callee-save register *)
1084            moveRR{source=rax, output=r13, opSize=polyWordOpSize}
1085        ]
1086          @ copyArgsFromRegsAndStack @
1087        [
1088            (* Get the value for rbp. *)
1089            (* This is a problem for 32-in-64.  The value of getThreadDataCall is an object ID but rbx may well no
1090               longer hold the heap base address.  We use a special inline constant to hold the full 64-bit address. *)
1091            LoadAbsolute{value=getThreadDataCall, destination=rcx},
1092            CallAddress(MemoryArg{base=rcx, offset=0, index=NoIndex}),
1093            moveRR{source=rax, output=rbp, opSize=nativeWordOpSize},
1094            (* Save the address of the argument and result area. *)
1095            moveRR{source=rsp, output=rcx, opSize=nativeWordOpSize},
1096            (* Switch to the ML stack. *)
1097            storeMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize),
1098            loadMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize),
1099            (* Load the ML heap pointer. *)
1100            loadMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize),
1101            (* Now move the function closure into the closure register ready for the call. *)
1102            moveRR{source=r13, output=rdx, opSize=polyWordOpSize}
1103        ] @
1104        (* Reload the heap base address in 32-in-64. *)
1105        ( if targetArch = ObjectId32Bit then [loadMemory(rbx, rbp, memRegSavedRbx, nativeWordOpSize)] else [] )
1106         @ boxRegAsSysWord(rcx, rax, []) @
1107        (
1108            (* If we're returning a struct by reference the address for the result will have been passed in the
1109               first argument.  We use that as the result area.  Otherwise point to the result area on the stack.  *)
1110            if resultStructByRef
1111            then loadMemory(rcx, r10, ~112, nativeWordOpSize)
1112            else ArithToGenReg{opc=ADD, output=rcx, source=NonAddressConstArg(Word.toLargeInt resultOffset), opSize=nativeWordOpSize}
1113        ) :: boxRegAsSysWord(rcx, mlArg2Reg, [rax]) @
1114        [
1115            (* Call the ML function using the full closure call. *)
1116            CallAddress(
1117                if targetArch = ObjectId32Bit
1118                then MemoryArg{base=rbx, index=Index4 rdx, offset=0}
1119                else MemoryArg{base=rdx, index=NoIndex, offset=0}),
1120            (* Save the ML stack pointer because we may have grown the stack.  Switch to the C stack. *)
1121            storeMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize),
1122            loadMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize),
1123            storeMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize)
1124        ] @ loadResults @
1125        [
1126            (* Remove the stack space. *)
1127            ArithToGenReg{opc=ADD, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize}
1128        ] @
1129        ( if resultStructByRef then [PopR rax] else [] ) @
1130        [
1131            PopR rsi, PopR rdi, PopR r15, PopR r14, PopR r13, PopR r12, PopR rbx, PopR rbp, (* Restore callee-save registers. *)
1132            ReturnFromFunction 0 (* Caller removes any stack arguments. *)
1133        ]
1134        end
1135    end
1136
1137    local
1138        (* The rules for passing structs in SysV on X86/64 are complicated but most of the special
1139           cases don't apply.  We don't support floating point larger than 8 bytes, packed structures
1140           or C++ constructors.  It then reduces to the following:
1141           Structures of up to 8 bytes are passed in a single register and of 8-16 bytes in two
1142           registers.  Larger structures are passed on the stack.  The question is whether to use
1143           general registers or SSE2 registers.  Each 8 byte chunk is considered independently after
1144           any internal structs have been unwrapped.  Each chunk will consist of either a single
1145           8-byte value (i.e.. a pointer, int64_t or a double) or one or more smaller values and
1146           possibly some padding.  An SSE2 register is used if the value is a double, two floats
1147           or a single float and padding.  Otherwise it must have at least one shorter int-like
1148           type (e.g. int, char, short etc) in which case a general register is used.  That
1149           applies even if it also contains a float.  If, having selected the kind of
1150           registers to be used, there are not enough for the whole struct it is passed
1151           on the stack.
1152        
1153           We don't really need this for simple arguments but it's easier to consider
1154           them all together. *)
1155        datatype argClass = ArgInMemory | ArgInRegs of { firstInSSE: bool, secondInSSE: bool }
1156        
1157        fun classifyArg arg =
1158        let
1159            val {size, ...} =  arg
1160
1161            (* Unwrap the struct and any internal structs. *)
1162            fun getFields([], _) = []
1163
1164            |   getFields(field::fields, offset) =
1165                let
1166                    val {size, align, typeForm} =  field
1167                    val alignedOffset = alignUp(offset, align) (* Align this even if it's a sub-struct *)
1168                in
1169                    case typeForm of
1170                        CTypeVoid => raise Foreign.Foreign "Void cannot be used for a function argument"
1171                    |   CTypeStruct elements =>
1172                            getFields(elements, alignedOffset) @ getFields(fields, alignedOffset+size)
1173                    |   _  => (typeForm, alignedOffset) :: getFields(fields, alignedOffset+size)
1174                end
1175       
1176            val isSSE =
1177                List.all (fn (CTypeFloatingPt, _) => true | _ => false)
1178        in
1179            if size > 0w16
1180            then ArgInMemory
1181            else
1182            let
1183                val fieldsAndOffsets = getFields([arg], 0w0)
1184            in
1185                if size <= 0w8 (* Only the first register will be used. *)
1186                then ArgInRegs{firstInSSE=isSSE fieldsAndOffsets, secondInSSE=false}
1187                else
1188                let
1189                    val (first8Bytes, second8Bytes) =
1190                        List.partition (fn (_, off) => off <= 0w8) fieldsAndOffsets
1191                in
1192                    ArgInRegs{firstInSSE=isSSE first8Bytes, secondInSSE=isSSE second8Bytes}
1193                end
1194            end
1195        end
1196
1197        val sysVGenRegs = [rdi, rsi, rdx, rcx, r8, r9]
1198        and sysVFPRegs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
1199
1200        (* Store a register into upto 8 bytes.  Most values will involve a single store but odd-sized
1201           structs can require shifts and multiple stores.  N.B.  May modify the source register. *)
1202        fun storeUpTo8(reg, base, offset, size) =
1203        let
1204            val moveOp =
1205                if size = 0w8 then Move64 else if size >= 0w4 then Move32 else if size >= 0w2 then Move16 else Move8
1206        in
1207            [Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset, index=NoIndex}, moveSize=moveOp}]
1208        end @
1209        (
1210            if size = 0w6 orelse size = 0w7
1211            then
1212            [
1213                ShiftConstant{ shiftType=SHR, output=reg, shift=0w32, opSize=OpSize64 },
1214                Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset+4, index=NoIndex}, moveSize=Move16}
1215            ]
1216            else []
1217        ) @
1218        (
1219            if size = 0w3 orelse size = 0w5 orelse size = 0w7
1220            then
1221            [
1222                ShiftConstant{ shiftType=SHR, output=reg, shift=Word8.fromLargeWord(Word.toLargeWord((size-0w1)*0w8)), opSize=OpSize64 },
1223                Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset+Word.toInt(size-0w1), index=NoIndex}, moveSize=Move8}
1224            ]
1225            else []
1226        )
1227
1228    in
1229        fun callUnix64Bits(args, result) =
1230        let
1231            val argWorkReg = r10 (* Not used for any arguments. *)
1232            val resultAreaPtr = r12 (* Saved value of r8 - This is callee save. *)
1233            val argPtrReg = r11 (* Pointer to argument area - Can't use mlArg2Reg since that's RSI on 32-in-64. *)
1234
1235            fun loadSysV64Args([], stackOffset, _, _, _, code, preCode) = (code, stackOffset, preCode)
1236
1237            |   loadSysV64Args(arg::args, stackOffset, argOffset, gRegs, fpRegs, code, preCode) =
1238                let
1239                    val {size, align, typeForm, ...} =  arg
1240
1241                    (* Load a value into a register.  Normally the size will be 1, 2, 4 or 8 bytes and
1242                       this will just involve a simple load.  Structs, though, can be of any size up
1243                       to 8 bytes. *)
1244                    fun loadRegister(reg, offset, size) =
1245                    let
1246                        (* We don't necessarily have to sign-extend.  There's a comment in libffi that
1247                           suggests that LVM expects it even though the SysV ABI doesn't require it. *)
1248                        val moveOp =
1249                            if size = 0w8
1250                            then Move64
1251                            else if typeForm = CTypeSignedInt andalso size = 0w4
1252                            then Move32X64
1253                            else if size >= 0w4
1254                            then Move32
1255                            else if typeForm = CTypeSignedInt andalso size = 0w2
1256                            then Move16X64
1257                            else if size >= 0w2
1258                            then Move16
1259                            else if typeForm = CTypeSignedInt andalso size = 0w1
1260                            then Move8X64 else Move8
1261                    in
1262                        [Move{source=MemoryArg{base=argPtrReg, offset=Word.toInt offset, index=NoIndex}, destination=RegisterArg reg, moveSize=moveOp}]
1263                    end @
1264                    (
1265                        if size = 0w6 orelse size = 0w7
1266                        then
1267                        [
1268                            Move{source=MemoryArg{base=argPtrReg, offset=Word.toInt offset + 4, index=NoIndex},
1269                                destination=RegisterArg argWorkReg, moveSize=Move16},
1270                            ShiftConstant{ shiftType=SHL, output=argWorkReg, shift=0w32, opSize=OpSize64 },
1271                            ArithToGenReg{ opc=OR, output=reg, source=RegisterArg argWorkReg, opSize=OpSize64 }
1272                        ]
1273                        else []
1274                    ) @
1275                    (
1276                        if size = 0w3 orelse size = 0w5 orelse size = 0w7
1277                        then
1278                        [
1279                            Move{source=MemoryArg{base=argPtrReg, offset=Word.toInt offset + Word.toInt(size-0w1), index=NoIndex},
1280                                destination=RegisterArg argWorkReg, moveSize=Move8},
1281                            ShiftConstant{ shiftType=SHL, output=argWorkReg, shift=Word8.fromLargeWord(Word.toLargeWord((size-0w1)*0w8)), opSize=OpSize64 },
1282                            ArithToGenReg{ opc=OR, output=reg, source=RegisterArg argWorkReg, opSize=OpSize64 }
1283                        ]
1284                        else []
1285                    )
1286
1287                    val newArgOffset = alignUp(argOffset, align)
1288                    val word1Addr = {base=argPtrReg, offset=Word.toInt newArgOffset, index=NoIndex}
1289                    val word2Addr = {base=argPtrReg, offset=Word.toInt newArgOffset + 8, index=NoIndex}
1290                in
1291                    case (classifyArg arg, size > 0w8, gRegs, fpRegs) of
1292                        (* 8 bytes or smaller - single general reg.  This is the usual case. *)
1293                        (ArgInRegs{firstInSSE=false, ...}, false, gReg :: gRegs', fpRegs') =>
1294                            loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs',
1295                                loadRegister(gReg, newArgOffset, size) @ code, preCode)
1296
1297                        (* 8 bytes or smaller - single SSE reg.  Usual case for real arguments. *)
1298                    |   (ArgInRegs{firstInSSE=true, ...}, false, gRegs', fpReg :: fpRegs') =>
1299                            loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs',
1300                                XMMArith{opc=if size = 0w4 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg } :: code,
1301                                preCode)
1302                    
1303                        (* 9-16 bytes - both values in general regs. *)
1304                    |   (ArgInRegs{firstInSSE=false, secondInSSE=false}, true, gReg1 :: gReg2 :: gRegs', fpRegs') =>
1305                            loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs',
1306                                Move{source=MemoryArg word1Addr, destination=RegisterArg gReg1, moveSize=Move64} ::
1307                                loadRegister(gReg2, newArgOffset+0w8, size-0w8) @ code, preCode)
1308
1309                        (* 9-16 bytes - first in general, second in SSE. *)
1310                    |   (ArgInRegs{firstInSSE=false, secondInSSE=true}, true, gReg :: gRegs', fpReg :: fpRegs') =>
1311                            loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs',
1312                                Move{source=MemoryArg word1Addr, destination=RegisterArg gReg, moveSize=Move64} ::
1313                                    XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg word2Addr, output=fpReg } :: code,
1314                                preCode)
1315
1316                        (* 9-16 bytes - first in SSE, second in general. *)
1317                    |   (ArgInRegs{firstInSSE=true, secondInSSE=false}, true, gReg :: gRegs', fpReg :: fpRegs') =>
1318                            loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs',
1319                                XMMArith{opc=SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg } ::
1320                                    loadRegister(gReg, newArgOffset+0w8, size-0w8) @ code,
1321                                preCode)
1322
1323                    |   (* 9-16 bytes - both values in SSE regs. *)
1324                        (ArgInRegs{firstInSSE=true, secondInSSE=true}, true, gRegs', fpReg1 :: fpReg2 :: fpRegs') =>
1325                            loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs',
1326                                XMMArith{opc=SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg1 } ::
1327                                XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble,
1328                                        source=MemoryArg word2Addr, output=fpReg2 } :: code,
1329                                preCode)
1330
1331                    |   (_, _, gRegs', fpRegs') => (* Either larger than 16 bytes or we've run out of the right kind of registers. *)
1332                            (* Move the argument in the preCode.  It's possible a large struct could be the first argument
1333                               and if we left it until the end RDI and RSI would already have been loaded.
1334                               Structs are passed by value on the stack not, as in Win64, by reference. *)
1335                        let
1336                            val space = intAlignUp(Word.toInt size, 0w8)
1337                        in
1338                            loadSysV64Args(args, stackOffset+space, newArgOffset+size, gRegs', fpRegs', code,
1339                                ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt space), opSize=nativeWordOpSize} ::
1340                                    moveMemory{source=(argPtrReg, Word.toInt newArgOffset), destination=(rsp, 0), length=Word.toInt size} @ preCode)
1341                        end
1342                end
1343
1344            (* The rules for returning structs are similar to those for parameters. *)
1345            local
1346                (* Store a result register into the result area.  In almost all cases
1347                   this is very simple: the only complication is with structs of odd sizes. *)
1348                fun storeResult(reg, offset, size) = storeUpTo8(reg, resultAreaPtr, offset, size)
1349                    
1350                val {size, typeForm, ...} =  result
1351            in
1352                val (getResult, passArgAddress) =
1353                    if typeForm = CTypeVoid
1354                    then ([], false)
1355                    else case (classifyArg result, size > 0w8) of
1356                        (* 8 bytes or smaller - returned in RAX - Normal case for int-like results. *)
1357                        (ArgInRegs{firstInSSE=false, ...}, false) => (storeResult(rax, 0, size), false)
1358
1359                        (* 8 bytes or smaller - returned in XMM0 - Normal case for real results. *)
1360                    |   (ArgInRegs{firstInSSE=true, ...}, false) =>
1361                            ([XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=0, index=NoIndex},
1362                                  precision=if size = 0w4 then SinglePrecision else DoublePrecision}], false)
1363
1364                        (* 9-16 bytes - returned in RAX/RDX. *)
1365                    |   (ArgInRegs{firstInSSE=false, secondInSSE=false}, true) =>
1366                            (storeResult(rax, 0, 0w8) @ storeResult(rdx, 0, size-0w8), false)
1367
1368                        (* 9-16 bytes - first in RAX, second in XMM0. *)
1369                    |   (ArgInRegs{firstInSSE=false, secondInSSE=true}, true) =>
1370                            (XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=8, index=NoIndex},
1371                                    precision=if size = 0w12 then SinglePrecision else DoublePrecision} ::
1372                                storeResult(rax, 0, 0w8), false)
1373
1374                        (* 9-16 bytes - first in XMM0, second in RAX. *)
1375                    |   (ArgInRegs{firstInSSE=true, secondInSSE=false}, true) =>
1376                            (XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=0, index=NoIndex}, precision=DoublePrecision} ::
1377                                storeResult(rax, 8, size-0w8), false)
1378
1379                        (* 9-16 bytes - both values in SSE regs.*)
1380                    |   (ArgInRegs{firstInSSE=true, secondInSSE=true}, true) =>
1381                            ([XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=0, index=NoIndex}, precision=DoublePrecision},
1382                              XMMStoreToMemory{toStore=xmm1, address={base=resultAreaPtr, offset=8, index=NoIndex},
1383                                    precision=if size = 0w12 then SinglePrecision else DoublePrecision}], false)
1384
1385                    |   _ => ([], true) (* Have to pass the address of the area in memory *)
1386            end
1387
1388            val (argCode, argStack, preArgCode) =
1389                if passArgAddress (* If we have to pass the address of the result struct it goes in rdi. *)
1390                then loadSysV64Args(args, 0, 0w0, tl sysVGenRegs, sysVFPRegs,
1391                        [moveRR{source=resultAreaPtr, output=rdi, opSize=nativeWordOpSize}], [])
1392                else loadSysV64Args(args, 0, 0w0, sysVGenRegs, sysVFPRegs, [], [])
1393
1394            local
1395                val align = argStack mod 16
1396            in
1397                (* Always align the stack. *)
1398                val preArgAlign = if align = 0 then 0 else 16-align
1399            end
1400
1401        in
1402            (* Save heap ptr.  Needed in case we have a callback. *)
1403            [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] @
1404            (
1405                (* Put the destination address into a callee save resgister.
1406                   We have to put the C address in there now because an ML address wouldn't be updated
1407                   by a possible GC in a callback. *)
1408                if #typeForm( result) <> CTypeVoid
1409                then [loadHeapMemory(resultAreaPtr, r8, 0, nativeWordOpSize)]
1410                else []
1411            ) @
1412            [
1413                (* Save the stack pointer. *)
1414                storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *)
1415                loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize)  (* Load the saved C stack pointer. *)
1416            ] @
1417            (
1418                if preArgAlign = 0
1419                then []
1420                else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}]
1421            ) @
1422            (
1423                (* The second argument is a SysWord containing the address of a malloced area of memory
1424                   with the actual arguments in it. *)
1425                if null args
1426                then []
1427                else [loadHeapMemory(argPtrReg, mlArg2Reg, 0, nativeWordOpSize)]
1428            ) @ preArgCode @ argCode @
1429            [
1430                let
1431                    (* The entry point is in a SysWord.word value in RAX. *)
1432                    val entryPoint =
1433                        case targetArch of
1434                            ObjectId32Bit => MemoryArg{base=ebx, offset=0, index=Index4 eax}
1435                        |   _ => MemoryArg{base=eax, offset=0, index=NoIndex}
1436                in
1437                    (* Call the function.  We're discarding the value in rsp so no need to remove args. *)
1438                    CallAddress entryPoint
1439                end,
1440                loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Restore the ML stack pointer. *)
1441                (* Reload the heap pointer.  If we've called back to ML this could well have changed. *)
1442                loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)
1443            ] @ (* Store the result in the destination. *) getResult @ [ ReturnFromFunction 0 ]
1444        end (* callUnix64Bits *)
1445        
1446        fun closureUnix64Bits(args, result) =
1447        let
1448            fun moveSysV64Args([], _, _, _, _, moveFromStack) = moveFromStack
1449
1450            |   moveSysV64Args(arg::args, stackSpace, argOffset, gRegs, fpRegs, moveFromStack) =
1451                let
1452                    val {size, align, ...} =  arg
1453                    fun storeRegister(reg, offset, size) = storeUpTo8(reg, rsp, offset, size)
1454                    val newArgOffset = alignUp(argOffset, align)
1455                    val word1Addr = {base=rsp, offset=Word.toInt newArgOffset, index=NoIndex}
1456                    val word2Addr = {base=rsp, offset=Word.toInt newArgOffset + 8, index=NoIndex}
1457                in
1458                    case (classifyArg arg, size > 0w8, gRegs, fpRegs) of
1459                        (* 8 bytes or smaller - single general reg.  This is the usual case. *)
1460                        (ArgInRegs{firstInSSE=false, ...}, false, gReg :: gRegs', fpRegs') =>
1461                            storeRegister(gReg, Word.toInt newArgOffset, size) @
1462                                moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack)
1463
1464                        (* 8 bytes or smaller - single SSE reg.  Usual case for real arguments. *)
1465                    |   (ArgInRegs{firstInSSE=true, ...}, false, gRegs', fpReg :: fpRegs') =>
1466                            XMMStoreToMemory{precision=if size = 0w4 then SinglePrecision else DoublePrecision, address=word1Addr, toStore=fpReg } :: 
1467                                moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack)
1468                    
1469                        (* 9-16 bytes - both values in general regs. *)
1470                    |   (ArgInRegs{firstInSSE=false, secondInSSE=false}, true, gReg1 :: gReg2 :: gRegs', fpRegs') =>
1471                            Move{source=MemoryArg word1Addr, destination=RegisterArg gReg1, moveSize=Move64} ::
1472                                storeRegister(gReg2, Word.toInt newArgOffset+8, size-0w8) @ 
1473                                moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack)
1474
1475                        (* 9-16 bytes - first in general, second in SSE. *)
1476                    |   (ArgInRegs{firstInSSE=false, secondInSSE=true}, true, gReg :: gRegs', fpReg :: fpRegs') =>
1477                            Move{source=MemoryArg word1Addr, destination=RegisterArg gReg, moveSize=Move64} ::
1478                                XMMStoreToMemory{precision=if size = 0w12 then SinglePrecision else DoublePrecision, address=word2Addr, toStore=fpReg } :: 
1479                                moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack)
1480
1481                        (* 9-16 bytes - first in SSE, second in general. *)
1482                    |   (ArgInRegs{firstInSSE=true, secondInSSE=false}, true, gReg :: gRegs', fpReg :: fpRegs') =>
1483                            XMMStoreToMemory{precision=DoublePrecision, address=word1Addr, toStore=fpReg } ::
1484                                storeRegister(gReg, Word.toInt newArgOffset+8, size-0w8) @
1485                                moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack)
1486
1487                    |   (* 9-16 bytes - both values in SSE regs. *)
1488                        (ArgInRegs{firstInSSE=true, secondInSSE=true}, true, gRegs', fpReg1 :: fpReg2 :: fpRegs') =>
1489                            XMMStoreToMemory{precision=DoublePrecision, address=word1Addr, toStore=fpReg1 } ::
1490                                XMMStoreToMemory{precision=if size = 0w12 then SinglePrecision else DoublePrecision,
1491                                        address=word2Addr, toStore=fpReg2 } :: 
1492                                moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack)
1493
1494                    |   (_, _, gRegs', fpRegs') =>
1495                        (* Either larger than 16 bytes or we've run out of the right kind of register.
1496                           Structs larger than 16 bytes are passed by value on the stack.  Move the
1497                           argument after we've stored all the registers in particular rsi and rdi. *)
1498                        let
1499                            val space = intAlignUp(Word.toInt size, 0w8)
1500                        in
1501                            moveSysV64Args(args, stackSpace+space, newArgOffset+size, gRegs', fpRegs',
1502                                moveMemory{source=(r10, stackSpace), destination=(rsp, Word.toInt newArgOffset), length=Word.toInt size} @ 
1503                                    moveFromStack)
1504                        end
1505                end
1506
1507            (* Result structs larger than 16 bytes are returned by reference.  *)
1508            val resultStructByRef = #size ( result) > 0w16
1509
1510            val copyArgsFromRegsAndStack =
1511                if resultStructByRef (* rdi contains the address for the result. *)
1512                then moveSysV64Args(args, 0, 0w0, tl sysVGenRegs, sysVFPRegs, [])
1513                else moveSysV64Args(args, 0, 0w0, sysVGenRegs, sysVFPRegs, [])
1514
1515            local
1516                fun getNextSize (arg, argOffset) =
1517                let val {size, align, ...} =  arg in alignUp(argOffset, align) + size end
1518            in
1519                val argumentSpace = List.foldl getNextSize 0w0 args
1520            end
1521
1522            (* Allocate a 16-byte area for any results returned in registers.  This will not be used
1523               if the result is a structure larger than 16-bytes. *)
1524            val resultOffset = alignUp(argumentSpace, 0w8)
1525            (* Ensure the stack is 16 bytes aligned.  We've pushed 6 regs and a return address
1526               so add a further 8 bytes to bring it back into alignment.  If we're returning a struct
1527               by reference, though, we've pushed 7 regs so don't add 8. *)
1528            val stackToAllocate =
1529                Word.toInt(alignUp(resultOffset + 0w16, 0w16)) + (if resultStructByRef then 0 else 8)
1530
1531            (* The rules for returning structs are similar to those for parameters. *)
1532            local
1533                (* The result area is always 16 bytes wide so we can load the result without risking reading outside.
1534                   At least at the moment we ignore any sign extension. *)                    
1535                val {size, typeForm, ...} = result
1536                val resultOffset = Word.toInt resultOffset
1537            in
1538                val loadResults =
1539                    if typeForm = CTypeVoid
1540                    then []
1541                    else case (classifyArg result, size > 0w8) of
1542                        (* 8 bytes or smaller - returned in RAX - Normal case for int-like results. *)
1543                        (ArgInRegs{firstInSSE=false, ...}, false) =>
1544                            [Move{source=MemoryArg {base=rsp, offset=resultOffset, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64}]
1545
1546                        (* 8 bytes or smaller - returned in XMM0 - Normal case for real results. *)
1547                    |   (ArgInRegs{firstInSSE=true, ...}, false) =>
1548                            [XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset, index=NoIndex},
1549                                  precision=if size = 0w4 then SinglePrecision else DoublePrecision}]
1550
1551                        (* 9-16 bytes - returned in RAX/RDX. *)
1552                    |   (ArgInRegs{firstInSSE=false, secondInSSE=false}, true) =>
1553                            [Move{source=MemoryArg {base=rsp, offset=resultOffset, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64},
1554                              Move{source=MemoryArg {base=rsp, offset=resultOffset+8, index=NoIndex}, destination=RegisterArg rdx, moveSize=Move64}]
1555
1556                        (* 9-16 bytes - first in RAX, second in XMM0. *)
1557                    |   (ArgInRegs{firstInSSE=false, secondInSSE=true}, true) =>
1558                            [Move{source=MemoryArg {base=rsp, offset=resultOffset, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64},
1559                              XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset+8, index=NoIndex},
1560                                    precision=if size = 0w12 then SinglePrecision else DoublePrecision}]
1561
1562                        (* 9-16 bytes - first in XMM0, second in RAX. *)
1563                    |   (ArgInRegs{firstInSSE=true, secondInSSE=false}, true) =>
1564                            [XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset, index=NoIndex}, precision=DoublePrecision},
1565                             Move{source=MemoryArg {base=rsp, offset=resultOffset+8, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64}]
1566
1567                        (* 9-16 bytes - both values in SSE regs.*)
1568                    |   (ArgInRegs{firstInSSE=true, secondInSSE=true}, true) =>
1569                            [XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset, index=NoIndex}, precision=DoublePrecision},
1570                              XMMStoreToMemory{toStore=xmm1, address={base=rsp, offset=resultOffset+8, index=NoIndex},
1571                                    precision=if size = 0w12 then SinglePrecision else DoublePrecision}]
1572
1573                    |   _ => [] (* Have to pass the address of the area in memory *)
1574            end
1575        in
1576        [
1577            (* Push callee-save registers. *)
1578            PushToStack(RegisterArg rbp), PushToStack(RegisterArg rbx), PushToStack(RegisterArg r12),
1579            PushToStack(RegisterArg r13), PushToStack(RegisterArg r14), PushToStack(RegisterArg r15)
1580        ] @
1581        (
1582            (* If we're returning a struct by reference we have to return the address in rax even though
1583               it's been set by the caller.  Save this address. *)
1584            if resultStructByRef
1585            then [PushToStack(RegisterArg rdi)]
1586            else []
1587        ) @
1588        [
1589            (* Set r10 to point to the original stack args if any. *)
1590            LoadAddress{ output=r10, offset=if resultStructByRef then 64 else 56, base=SOME rsp, index=NoIndex, opSize=nativeWordOpSize},
1591            (* Allocate stack space. *)
1592            ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize},
1593            (* Move the function we're calling, in rax, into r13, a callee-save register *)
1594            moveRR{source=rax, output=r13, opSize=polyWordOpSize}
1595        ]
1596          @ copyArgsFromRegsAndStack @
1597        [
1598            (* Get the value for rbp.  This has to be an absolute address in 32-in-64. *)
1599            LoadAbsolute{value=getThreadDataCall, destination=rcx},
1600            CallAddress(MemoryArg{base=rcx, offset=0, index=NoIndex}),
1601            moveRR{source=rax, output=rbp, opSize=nativeWordOpSize},
1602            (* Save the address of the argument and result area. *)
1603            moveRR{source=rsp, output=rcx, opSize=nativeWordOpSize},
1604            (* Switch to the ML stack. *)
1605            storeMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize),
1606            loadMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize),
1607            (* Load the ML heap pointer. *)
1608            loadMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize),
1609            (* Now move the function closure into the closure register ready for the call. *)
1610            moveRR{source=r13, output=rdx, opSize=polyWordOpSize}
1611        ] @
1612        (* Reload the heap base address in 32-in-64. *)
1613        ( if targetArch = ObjectId32Bit then [loadMemory(rbx, rbp, memRegSavedRbx, nativeWordOpSize)] else [] )
1614         @ boxRegAsSysWord(rcx, rax, []) @
1615        (
1616            (* If we're returning a struct by reference the address for the result will have been passed in the
1617               first argument.  We use that as the result area.  Otherwise point to the result area on the stack.  *)
1618            if resultStructByRef
1619            then loadMemory(rcx, r10, ~64, nativeWordOpSize)
1620            else ArithToGenReg{opc=ADD, output=rcx, source=NonAddressConstArg(Word.toLargeInt resultOffset), opSize=nativeWordOpSize}
1621        ) :: boxRegAsSysWord(rcx, mlArg2Reg, [rax]) @
1622        [
1623            (* Call the ML function using the full closure call. *)
1624            CallAddress(
1625                if targetArch = ObjectId32Bit
1626                then MemoryArg{base=rbx, index=Index4 rdx, offset=0}
1627                else MemoryArg{base=rdx, index=NoIndex, offset=0}),
1628            (* Save the ML stack pointer because we may have grown the stack.  Switch to the C stack. *)
1629            storeMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize),
1630            loadMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize),
1631            storeMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize)
1632        ] @ loadResults @
1633        [
1634            (* Remove the stack space. *)
1635            ArithToGenReg{opc=ADD, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize}
1636        ] @
1637        ( if resultStructByRef then [PopR rax] else [] ) @
1638        [
1639            PopR r15, PopR r14, PopR r13, PopR r12, PopR rbx, PopR rbp, (* Restore callee-save registers. *)
1640            ReturnFromFunction 0 (* Caller removes any stack arguments. *)
1641        ]
1642        end
1643    end
1644
1645    fun foreignCall(abi: ffiABI, args: cType list, result: cType): Address.machineWord =
1646    let
1647        val code =
1648            case abi of
1649                FFI_UNIX64 => callUnix64Bits(args, result)
1650            |   FFI_WIN64 => callWindows64Bits(args, result)
1651            |   abi => call32Bits(abi, args, result)
1652
1653        val functionName = "foreignCall"
1654        val debugSwitches =
1655            [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
1656               Universal.tagInject DEBUG.assemblyCodeTag true*)]
1657        val profileObject = createProfileObject functionName
1658        val newCode = codeCreate (functionName, profileObject, debugSwitches)
1659        val closure = makeConstantClosure()
1660        val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure}
1661    in
1662        closureAsAddress closure
1663    end
1664
1665    (* Build a callback function.  The arguments are the abi, the list of argument types and the result type.
1666       The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and
1667       returns the C function as its result.  When the C function is called the arguments are copied into
1668       temporary memory and the vector passed to f along with the address of the memory for the result.
1669       "f" stores the result in it when it returns and the result is then passed back as the result of the
1670       callback.
1671       N.B.  This returns a closure cell which contains the address of the code.  It can be used as a
1672       SysWord.word value except that while it exists the code will not be GCed.  *)
1673    fun buildCallBack(abi: ffiABI, args: cType list, result: cType): Address.machineWord =
1674    let
1675        val code =
1676            case abi of
1677                FFI_UNIX64 => closureUnix64Bits(args, result)
1678            |   FFI_WIN64 => closureWindows64Bits(args, result)
1679            |   abi => closure32Bits(abi, args, result)
1680
1681        val functionName = "foreignCallBack(2)"
1682        val debugSwitches =
1683            [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
1684               Universal.tagInject DEBUG.assemblyCodeTag true*)]
1685        val profileObject = createProfileObject functionName
1686        val newCode = codeCreate (functionName, profileObject, debugSwitches)
1687        val closure = makeConstantClosure()
1688        val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure}
1689        val stage2Code = closureAsAddress closure
1690
1691        fun resultFunction f =
1692        let
1693            (* Generate a small function to load the address of f into rax/eax and then jump to stage2.
1694               The idea is that it should be possible to generate this eventually in a single RTS call.
1695               That could be done by using a version of this as a model. *)
1696            val codeAddress =
1697                (* In the native code versions we extract the code address from the closure.
1698                   We don't do that in 32-in-64 and instead the RTS does it. *)
1699                if targetArch = ObjectId32Bit
1700                then stage2Code
1701                else Address.loadWord(Address.toAddress stage2Code, 0w0)
1702            val code =
1703                [
1704                    Move{source=AddressConstArg(Address.toMachineWord f), destination=RegisterArg rax, moveSize=opSizeToMove polyWordOpSize},
1705                    JumpAddress(AddressConstArg codeAddress)
1706                ]
1707            val functionName = "foreignCallBack(1)"
1708            val debugSwitches =
1709                [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)),
1710                   Universal.tagInject DEBUG.assemblyCodeTag true*)]
1711            val profileObject = createProfileObject functionName
1712            val newCode = codeCreate (functionName, profileObject, debugSwitches)
1713            val closure = makeConstantClosure()
1714            val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure}
1715            val res = closureAsAddress closure
1716            (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*)
1717        in
1718            res
1719        end
1720    in
1721        Address.toMachineWord resultFunction
1722    end
1723
1724end;
1725