1(*
2    Copyright (c) 2016-18 David C.J. Matthews
3
4    This library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Lesser General Public
6    License version 2.1 as published by the Free Software Foundation.
7    
8    This library is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11    Lesser General Public License for more details.
12    
13    You should have received a copy of the GNU Lesser General Public
14    License along with this library; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
16*)
17
18functor X86FOREIGNCALL(
19
20    structure X86CODE: X86CODESIG
21
22    structure X86OPTIMISE:
23    sig
24        type operation
25        type code
26        type operations = operation list
27        type address = Address.address
28
29        (* Optimise and code-generate. *)
30        val generateCode: {code: code, ops: operations, labelCount: int} -> address
31
32        structure Sharing:
33        sig
34            type operation = operation
35            type code = code
36        end
37    end
38
39    structure DEBUG: DEBUGSIG
40
41    sharing X86CODE.Sharing = X86OPTIMISE.Sharing
42): FOREIGNCALLSIG
43=
44struct
45    open X86CODE
46    open Address
47    
48    exception InternalError = Misc.InternalError
49    
50    val pushR = PushToStack o RegisterArg
51
52    fun moveRR{source, output} = MoveToRegister{source=RegisterArg source, output=output}
53
54    fun loadMemory(reg, base, offset) =
55        MoveToRegister{source=MemoryArg{base=base, offset=offset, index=NoIndex}, output=reg}
56    and storeMemory(reg, base, offset) =
57        StoreRegToMemory{toStore=reg, address={base=base, offset=offset, index=NoIndex}}
58
59    fun createProfileObject _ (*functionName*) =
60    let
61        (* The profile object is a single mutable with the F_bytes bit set. *)
62        open Address
63        val profileObject = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes))))
64        fun clear 0w0 = ()
65        |   clear i = (assignByte(profileObject, i-0w1, 0w0); clear (i-0w1))
66        val () = clear wordSize
67    in
68        toMachineWord profileObject
69    end
70
71    val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject"
72
73    datatype abi = X86_32 | X64Win | X64Unix
74    
75    local
76        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
77        val getABICall: unit -> int = RunCall.rtsCallFast0 "PolyGetABI"
78    in
79        fun getABI() =
80            case getABICall() of
81                0 => X86_32
82            |   1 => X64Unix
83            |   2 => X64Win
84            |   n => raise InternalError ("Unknown ABI type " ^ Int.toString n)
85    end
86
87    val noException = 1
88
89    (* Full RTS call version.  An extra argument is passed that contains the thread ID.
90       This allows the taskData object to be found which is needed if the code allocates
91       any ML memory or raises an exception.  It also saves the stack and heap pointers
92       in case of a GC. *)
93    fun rtsCallFull (functionName, nArgs (* Not counting the thread ID *), debugSwitches) =
94    let
95        val entryPointAddr = makeEntryPoint functionName
96
97        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
98        val abi = getABI()
99
100        (* Branch to check for exception. *)
101        val exLabel = Label{labelNo=0} (* There's just one label in this function. *)
102        
103        (* Unix X64.  The first six arguments are in rdi, rsi, rdx, rcx, r8, r9.
104                      The rest are on the stack.
105           Windows X64. The first four arguments are in rcx, rdx, r8 and r9.  The rest are
106                       on the stack.  The caller must ensure the stack is aligned on 16-byte boundary
107                       and must allocate 32-byte save area for the register args.
108                       rbx, rbp, rdi, rsi, rsp, r12-r15 are saved by the called function.
109           X86/32.  Arguments are pushed to the stack.
110                   ebx, edi, esi, ebp and esp are saved by the called function.
111                   We use esi to hold the argument data pointer and edi to save the ML stack pointer
112           Our ML conventions use eax, ebx for the first two arguments in X86/32 and
113                   rax, ebx, r8, r9, r10 for the first five arguments in X86/64.
114        *)
115        
116        (* Previously the ML stack pointer was saved in a callee-save register.  This works
117           in almost all circumstances except when a call to the FFI code results in a callback
118           and the callback moves the ML stack.  Instead the RTS callback handler adjusts the value
119           in memRegStackPtr and we reload the ML stack pointer from there. *)
120        val entryPtrReg = if isX64 then r11 else ecx
121        
122        val stackSpace =
123            case abi of
124                X64Unix => memRegSize
125            |   X64Win => memRegSize + 32 (* Requires 32-byte save area. *)
126            |   X86_32 =>
127                let
128                    (* GCC likes to keep the stack on a 16-byte alignment. *)
129                    val argSpace = (nArgs+1)*4
130                    val align = argSpace mod 16
131                in
132                    (* Add sufficient space so that esp will be 16-byte aligned *)
133                    if align = 0
134                    then memRegSize
135                    else memRegSize + 16 - align
136                end
137
138        val code =
139            [
140                MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *)
141                loadMemory(entryPtrReg, entryPtrReg, 0)(* Load its value. *)
142            ] @
143            (
144                (* Save heap ptr.  This is in r15 in X86/64 *)
145                if isX64 then [storeMemory(r15, ebp, memRegLocalMPointer)] (* Save heap ptr *)
146                else []
147            ) @
148            (
149                if abi = X86_32 andalso nArgs >= 3
150                then [moveRR{source=esp, output=edi}] (* Needed if we have to load from the stack. *)
151                else []
152            ) @
153            
154            [
155                (* Have to save the stack pointer to the arg structure in case we need to scan the stack for a GC. *)
156                storeMemory(esp, ebp, memRegStackPtr), (* Save ML stack and switch to C stack. *)
157                loadMemory(esp, ebp, memRegCStackPtr), (*moveRR{source=ebp, output=esp},*) (* Load the saved C stack pointer. *)
158                (* Set the stack pointer past the data on the stack.  For Windows/64 add in a 32 byte save area *)
159                ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize}
160            ] @
161            (
162                case (abi, nArgs) of  (* Set the argument registers. *)
163                    (X64Unix, 0) => [ loadMemory(edi, ebp, memRegThreadSelf) ]
164                |   (X64Unix, 1) => [ loadMemory(edi, ebp, memRegThreadSelf), moveRR{source=eax, output=esi} ]
165                |   (X64Unix, 2) =>
166                        [ loadMemory(edi, ebp, memRegThreadSelf), moveRR{source=eax, output=esi}, moveRR{source=ebx, output=edx} ]
167                |   (X64Unix, 3) => 
168                        [ loadMemory(edi, ebp, memRegThreadSelf), moveRR{source=eax, output=esi}, moveRR{source=ebx, output=edx}, moveRR{source=r8, output=ecx} ]
169                |   (X64Win, 0) => [ loadMemory(ecx, ebp, memRegThreadSelf) ]
170                |   (X64Win, 1) => [ loadMemory(ecx, ebp, memRegThreadSelf), moveRR{source=eax, output=edx} ]
171                |   (X64Win, 2) =>
172                        [ loadMemory(ecx, ebp, memRegThreadSelf), moveRR{source=eax, output=edx}, moveRR{source=ebx, output=r8} ]
173                |   (X64Win, 3) =>
174                        [ loadMemory(ecx, ebp, memRegThreadSelf), moveRR{source=eax, output=edx}, moveRR{source=r8, output=r9}, moveRR{source=ebx, output=r8} ]
175                |   (X86_32, 0) => [ PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ]
176                |   (X86_32, 1) => [ pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ]
177                |   (X86_32, 2) => [ pushR ebx, pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ]
178                |   (X86_32, 3) =>
179                        [
180                            (* We need to move an argument from the ML stack. *)
181                            PushToStack(MemoryArg{base=edi, offset=4, index=NoIndex}), pushR ebx, pushR eax,
182                            PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex})
183                        ]
184                |   _ => raise InternalError "rtsCall: Abi/argument count not implemented"
185            ) @
186            [
187                CallFunction(DirectReg entryPtrReg), (* Call the function *)
188                loadMemory(esp, ebp, memRegStackPtr) (* Restore the ML stack pointer. *)
189            ] @
190            (
191            if isX64 then [loadMemory(r15, ebp, memRegLocalMPointer) ] (* Copy back the heap ptr *)
192            else []
193            ) @
194            [
195                ArithMemConst{opc=CMP, address={offset=memRegExceptionPacket, base=ebp, index=NoIndex}, source=noException, opSize=polyWordOpSize},
196                ConditionalBranch{test=JNE, label=exLabel},
197                (* Remove any arguments that have been passed on the stack. *)
198                ReturnFromFunction(Int.max(case abi of X86_32 => nArgs-2 | _ => nArgs-5, 0)),
199                JumpLabel exLabel, (* else raise the exception *)
200                loadMemory(eax, ebp, memRegExceptionPacket),
201                RaiseException { workReg=ecx }
202            ]
203 
204        val profileObject = createProfileObject functionName
205        val newCode = codeCreate (functionName, profileObject, debugSwitches)
206        val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=1(*One label.*), ops=code}
207        (* Have to create a closure for this *)
208        open Address
209        val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0)
210    in
211        assignWord(closure, 0w0, toMachineWord createdCode);
212        lock closure;
213        closure
214    end
215
216    (* This is a quicker version but can only be used if the RTS entry does
217       not allocated ML memory, raise an exception or need to suspend the thread. *)
218    fun rtsCallFast (functionName, nArgs, debugSwitches) =
219    let
220        val entryPointAddr = makeEntryPoint functionName
221
222        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
223        val abi = getABI()
224
225        val (entryPtrReg, saveMLStackPtrReg) =
226            if isX64 then (r11, r13) else (ecx, edi)
227        
228        val stackSpace =
229            case abi of
230                X64Unix => memRegSize
231            |   X64Win => memRegSize + 32 (* Requires 32-byte save area. *)
232            |   X86_32 =>
233                let
234                    (* GCC likes to keep the stack on a 16-byte alignment. *)
235                    val argSpace = nArgs*4
236                    val align = argSpace mod 16
237                in
238                    (* Add sufficient space so that esp will be 16-byte aligned *)
239                    if align = 0
240                    then memRegSize
241                    else memRegSize + 16 - align
242                end
243
244        val code =
245            [
246                MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *)
247                loadMemory(entryPtrReg, entryPtrReg, 0),(* Load its value. *)
248                moveRR{source=esp, output=saveMLStackPtrReg}, (* Save ML stack and switch to C stack. *)
249                loadMemory(esp, ebp, memRegCStackPtr),
250                (* Set the stack pointer past the data on the stack.  For Windows/64 add in a 32 byte save area *)
251                ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize}
252            ] @
253            (
254                case (abi, nArgs) of  (* Set the argument registers. *)
255                    (_, 0) => []
256                |   (X64Unix, 1) => [ moveRR{source=eax, output=edi} ]
257                |   (X64Unix, 2) =>
258                        [ moveRR{source=eax, output=edi}, moveRR{source=ebx, output=esi} ]
259                |   (X64Unix, 3) => 
260                        [ moveRR{source=eax, output=edi}, moveRR{source=ebx, output=esi}, moveRR{source=r8, output=edx} ]
261                |   (X64Unix, 4) => 
262                        [ moveRR{source=eax, output=edi}, moveRR{source=ebx, output=esi}, moveRR{source=r8, output=edx}, moveRR{source=r9, output=ecx} ]
263                |   (X64Win, 1) => [ moveRR{source=eax, output=ecx} ]
264                |   (X64Win, 2) => [ moveRR{source=eax, output=ecx}, moveRR{source=ebx, output=edx} ]
265                |   (X64Win, 3) =>
266                        [ moveRR{source=eax, output=ecx}, moveRR{source=ebx, output=edx} (* Arg3 is already in r8. *) ]
267                |   (X64Win, 4) =>
268                        [ moveRR{source=eax, output=ecx}, moveRR{source=ebx, output=edx} (* Arg3 is already in r8 and arg4 in r9. *) ]
269                |   (X86_32, 1) => [ pushR eax ]
270                |   (X86_32, 2) => [ pushR ebx, pushR eax ]
271                |   (X86_32, 3) =>
272                        [
273                            (* We need to move an argument from the ML stack. *)
274                            loadMemory(edx, saveMLStackPtrReg, 4), pushR edx, pushR ebx, pushR eax
275                        ]
276                |   (X86_32, 4) =>
277                        [
278                            (* We need to move an arguments from the ML stack. *)
279                            loadMemory(edx, saveMLStackPtrReg, 4), pushR edx,
280                            loadMemory(edx, saveMLStackPtrReg, 8), pushR edx,
281                            pushR ebx, pushR eax
282                        ]
283                |   _ => raise InternalError "rtsCall: Abi/argument count not implemented"
284            ) @
285            [
286                CallFunction(DirectReg entryPtrReg), (* Call the function *)
287                moveRR{source=saveMLStackPtrReg, output=esp}, (* Restore the ML stack pointer *)
288                (* Remove any arguments that have been passed on the stack. *)
289                ReturnFromFunction(Int.max(case abi of X86_32 => nArgs-2 | _ => nArgs-5, 0))
290            ]
291 
292        val profileObject = createProfileObject functionName
293        val newCode = codeCreate (functionName, profileObject, debugSwitches)
294        val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code}
295        (* Have to create a closure for this *)
296        open Address
297        val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0)
298    in
299        assignWord(closure, 0w0, toMachineWord createdCode);
300        lock closure;
301        closure
302    end
303    
304    (* RTS call with one double-precision floating point argument and a floating point result.
305       First version.  This will probably be merged into the above code in due
306       course.
307       Currently ML always uses boxed values for floats.  *)
308    fun rtsCallFastFloattoFloat (functionName, debugSwitches) =
309    let
310        val entryPointAddr = makeEntryPoint functionName
311
312        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
313        val abi = getABI()
314
315        val (entryPtrReg, saveMLStackPtrReg) =
316            if isX64 then (r11, r13) else (ecx, edi)
317        
318        val stackSpace =
319            case abi of
320                X64Unix => memRegSize
321            |   X64Win => memRegSize + 32 (* Requires 32-byte save area. *)
322            |   X86_32 =>
323                let
324                    (* GCC likes to keep the stack on a 16-byte alignment. *)
325                    val argSpace = 8 (*nArgs*4*) (* One "double" value. *)
326                    val align = argSpace mod 16
327                in
328                    (* Add sufficient space so that esp will be 16-byte aligned *)
329                    if align = 0
330                    then memRegSize
331                    else memRegSize + 16 - align
332                end
333
334        (* Constants for a box for a float *)
335        val fpBoxSize = 8 div Word.toInt wordSize
336        val fpBoxLengthWord32 = IntInf.orb(IntInf.fromInt fpBoxSize, IntInf.<<(Word8.toLargeInt F_bytes, 0w24))
337
338        val code =
339            [
340                MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *)
341                loadMemory(entryPtrReg, entryPtrReg, 0),(* Load its value. *)
342                moveRR{source=esp, output=saveMLStackPtrReg}, (* Save ML stack and switch to C stack. *)
343                loadMemory(esp, ebp, memRegCStackPtr),
344                (* Set the stack pointer past the data on the stack.  For Windows/64 add in a 32 byte save area *)
345                ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize}
346            ] @
347            (
348                case abi of
349                    (* X64 on both Windows and Unix take the first arg in xmm0.  We need to
350                       unbox the value pointed at by rax. *)
351                    X64Unix => [ XMMArith { opc= SSE2Move, source=MemoryArg{base=eax, offset=0, index=NoIndex}, output=xmm0 } ]
352                |   X64Win => [ XMMArith { opc= SSE2Move, source=MemoryArg{base=eax, offset=0, index=NoIndex}, output=xmm0 } ]
353                |   X86_32 =>
354                     (* eax contains the address of the value.  This must be unboxed onto the stack. *)
355                    [
356                        FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision},
357                        ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize},
358                        FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true }
359                    ]
360            ) @
361            [
362                CallFunction(DirectReg entryPtrReg), (* Call the function *)
363                moveRR{source=saveMLStackPtrReg, output=esp} (* Restore the ML stack pointer *)
364            ] @
365            (
366                (* Put the floating point result into a box. *)
367                case abi of
368                   X86_32 =>
369                    [
370                        AllocStore{size=fpBoxSize, output=eax, saveRegs=[]},
371                        StoreConstToMemory{toStore=fpBoxLengthWord32,
372                                address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}},
373                        FPStoreToMemory{ address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true },
374                        StoreInitialised
375                    ]
376                |   _ => (* X64 The result is in xmm0 *)
377                    [
378                        AllocStore{size=fpBoxSize, output=eax, saveRegs=[]},
379                        StoreConstToMemory{toStore=LargeInt.fromInt fpBoxSize,
380                            address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}},
381                        StoreNonWordConst{size=Size8Bit, toStore=Word8.toLargeInt F_bytes, address={offset= ~1, base=eax, index=NoIndex}},
382                        XMMStoreToMemory { address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, toStore=xmm0 },
383                        StoreInitialised
384                    ]                    
385            ) @
386            [
387                (* Remove any arguments that have been passed on the stack. *)
388                ReturnFromFunction 0
389            ]
390 
391        val profileObject = createProfileObject functionName
392        val newCode = codeCreate (functionName, profileObject, debugSwitches)
393        val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code}
394        (* Have to create a closure for this *)
395        open Address
396        val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0)
397    in
398        assignWord(closure, 0w0, toMachineWord createdCode);
399        lock closure;
400        closure
401    end
402
403    (* RTS call with one general (i.e. ML word) argument and a floating point result.
404       This is used only to convert arbitrary precision values to floats.
405       In due course this will be merged into the other functions. *)
406    fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) =
407    let
408        val entryPointAddr = makeEntryPoint functionName
409
410        (* Get the ABI.  On 64-bit Windows and Unix use different calling conventions. *)
411        val abi = getABI()
412
413        val (entryPtrReg, saveMLStackPtrReg) =
414            if isX64 then (r11, r13) else (ecx, edi)
415        
416        val stackSpace =
417            case abi of
418                X64Unix => memRegSize
419            |   X64Win => memRegSize + 32 (* Requires 32-byte save area. *)
420            |   X86_32 =>
421                let
422                    (* GCC likes to keep the stack on a 16-byte alignment. *)
423                    val argSpace = 4 (* One "PolyWord" value. *)
424                    val align = argSpace mod 16
425                in
426                    (* Add sufficient space so that esp will be 16-byte aligned *)
427                    if align = 0
428                    then memRegSize
429                    else memRegSize + 16 - align
430                end
431
432        (* Constants for a box for a float *)
433        val fpBoxSize = 8 div Word.toInt wordSize
434        val fpBoxLengthWord32 = IntInf.orb(IntInf.fromInt fpBoxSize, IntInf.<<(Word8.toLargeInt F_bytes, 0w24))
435
436        val code =
437            [
438                MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *)
439                loadMemory(entryPtrReg, entryPtrReg, 0),(* Load its value. *)
440                moveRR{source=esp, output=saveMLStackPtrReg}, (* Save ML stack and switch to C stack. *)
441                loadMemory(esp, ebp, memRegCStackPtr),
442                (* Set the stack pointer past the data on the stack.  For Windows/64 add in a 32 byte save area *)
443                ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize}
444            ] @
445            (
446                case abi of
447                    X64Unix => [ moveRR{source=eax, output=edi} ]
448                |   X64Win => [ moveRR{source=eax, output=ecx} ]
449                |   X86_32 => [ pushR eax ]
450            ) @
451            [
452                CallFunction(DirectReg entryPtrReg), (* Call the function *)
453                moveRR{source=saveMLStackPtrReg, output=esp} (* Restore the ML stack pointer *)
454            ] @
455            (
456                (* Put the floating point result into a box. *)
457                case abi of
458                   X86_32 =>
459                    [
460                        AllocStore{size=fpBoxSize, output=eax, saveRegs=[]},
461                        StoreConstToMemory{toStore=fpBoxLengthWord32,
462                                address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}},
463                        FPStoreToMemory{ address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true },
464                        StoreInitialised
465                    ]
466                |   _ => (* X64 The result is in xmm0 *)
467                    [
468                        AllocStore{size=fpBoxSize, output=eax, saveRegs=[]},
469                        StoreConstToMemory{toStore=LargeInt.fromInt fpBoxSize,
470                            address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}},
471                        StoreNonWordConst{size=Size8Bit, toStore=Word8.toLargeInt F_bytes, address={offset= ~1, base=eax, index=NoIndex}},
472                        XMMStoreToMemory { address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, toStore=xmm0 },
473                        StoreInitialised
474                    ]                    
475            ) @
476            [
477                (* Remove any arguments that have been passed on the stack. *)
478                ReturnFromFunction 0
479            ]
480 
481        val profileObject = createProfileObject functionName
482        val newCode = codeCreate (functionName, profileObject, debugSwitches)
483        val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code}
484        (* Have to create a closure for this *)
485        open Address
486        val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0)
487    in
488        assignWord(closure, 0w0, toMachineWord createdCode);
489        lock closure;
490        closure
491    end
492
493end;
494