1(* 2 Copyright (c) 2016-18 David C.J. Matthews 3 4 This library is free software; you can redistribute it and/or 5 modify it under the terms of the GNU Lesser General Public 6 License version 2.1 as published by the Free Software Foundation. 7 8 This library is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 Lesser General Public License for more details. 12 13 You should have received a copy of the GNU Lesser General Public 14 License along with this library; if not, write to the Free Software 15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16*) 17 18functor X86FOREIGNCALL( 19 20 structure X86CODE: X86CODESIG 21 22 structure X86OPTIMISE: 23 sig 24 type operation 25 type code 26 type operations = operation list 27 type address = Address.address 28 29 (* Optimise and code-generate. *) 30 val generateCode: {code: code, ops: operations, labelCount: int} -> address 31 32 structure Sharing: 33 sig 34 type operation = operation 35 type code = code 36 end 37 end 38 39 structure DEBUG: DEBUGSIG 40 41 sharing X86CODE.Sharing = X86OPTIMISE.Sharing 42): FOREIGNCALLSIG 43= 44struct 45 open X86CODE 46 open Address 47 48 exception InternalError = Misc.InternalError 49 50 val pushR = PushToStack o RegisterArg 51 52 fun moveRR{source, output} = MoveToRegister{source=RegisterArg source, output=output} 53 54 fun loadMemory(reg, base, offset) = 55 MoveToRegister{source=MemoryArg{base=base, offset=offset, index=NoIndex}, output=reg} 56 and storeMemory(reg, base, offset) = 57 StoreRegToMemory{toStore=reg, address={base=base, offset=offset, index=NoIndex}} 58 59 fun createProfileObject _ (*functionName*) = 60 let 61 (* The profile object is a single mutable with the F_bytes bit set. *) 62 open Address 63 val profileObject = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes)))) 64 fun clear 0w0 = () 65 | clear i = (assignByte(profileObject, i-0w1, 0w0); clear (i-0w1)) 66 val () = clear wordSize 67 in 68 toMachineWord profileObject 69 end 70 71 val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject" 72 73 datatype abi = X86_32 | X64Win | X64Unix 74 75 local 76 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 77 val getABICall: unit -> int = RunCall.rtsCallFast0 "PolyGetABI" 78 in 79 fun getABI() = 80 case getABICall() of 81 0 => X86_32 82 | 1 => X64Unix 83 | 2 => X64Win 84 | n => raise InternalError ("Unknown ABI type " ^ Int.toString n) 85 end 86 87 val noException = 1 88 89 (* Full RTS call version. An extra argument is passed that contains the thread ID. 90 This allows the taskData object to be found which is needed if the code allocates 91 any ML memory or raises an exception. It also saves the stack and heap pointers 92 in case of a GC. *) 93 fun rtsCallFull (functionName, nArgs (* Not counting the thread ID *), debugSwitches) = 94 let 95 val entryPointAddr = makeEntryPoint functionName 96 97 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 98 val abi = getABI() 99 100 (* Branch to check for exception. *) 101 val exLabel = Label{labelNo=0} (* There's just one label in this function. *) 102 103 (* Unix X64. The first six arguments are in rdi, rsi, rdx, rcx, r8, r9. 104 The rest are on the stack. 105 Windows X64. The first four arguments are in rcx, rdx, r8 and r9. The rest are 106 on the stack. The caller must ensure the stack is aligned on 16-byte boundary 107 and must allocate 32-byte save area for the register args. 108 rbx, rbp, rdi, rsi, rsp, r12-r15 are saved by the called function. 109 X86/32. Arguments are pushed to the stack. 110 ebx, edi, esi, ebp and esp are saved by the called function. 111 We use esi to hold the argument data pointer and edi to save the ML stack pointer 112 Our ML conventions use eax, ebx for the first two arguments in X86/32 and 113 rax, ebx, r8, r9, r10 for the first five arguments in X86/64. 114 *) 115 116 (* Previously the ML stack pointer was saved in a callee-save register. This works 117 in almost all circumstances except when a call to the FFI code results in a callback 118 and the callback moves the ML stack. Instead the RTS callback handler adjusts the value 119 in memRegStackPtr and we reload the ML stack pointer from there. *) 120 val entryPtrReg = if isX64 then r11 else ecx 121 122 val stackSpace = 123 case abi of 124 X64Unix => memRegSize 125 | X64Win => memRegSize + 32 (* Requires 32-byte save area. *) 126 | X86_32 => 127 let 128 (* GCC likes to keep the stack on a 16-byte alignment. *) 129 val argSpace = (nArgs+1)*4 130 val align = argSpace mod 16 131 in 132 (* Add sufficient space so that esp will be 16-byte aligned *) 133 if align = 0 134 then memRegSize 135 else memRegSize + 16 - align 136 end 137 138 val code = 139 [ 140 MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *) 141 loadMemory(entryPtrReg, entryPtrReg, 0)(* Load its value. *) 142 ] @ 143 ( 144 (* Save heap ptr. This is in r15 in X86/64 *) 145 if isX64 then [storeMemory(r15, ebp, memRegLocalMPointer)] (* Save heap ptr *) 146 else [] 147 ) @ 148 ( 149 if abi = X86_32 andalso nArgs >= 3 150 then [moveRR{source=esp, output=edi}] (* Needed if we have to load from the stack. *) 151 else [] 152 ) @ 153 154 [ 155 (* Have to save the stack pointer to the arg structure in case we need to scan the stack for a GC. *) 156 storeMemory(esp, ebp, memRegStackPtr), (* Save ML stack and switch to C stack. *) 157 loadMemory(esp, ebp, memRegCStackPtr), (*moveRR{source=ebp, output=esp},*) (* Load the saved C stack pointer. *) 158 (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) 159 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} 160 ] @ 161 ( 162 case (abi, nArgs) of (* Set the argument registers. *) 163 (X64Unix, 0) => [ loadMemory(edi, ebp, memRegThreadSelf) ] 164 | (X64Unix, 1) => [ loadMemory(edi, ebp, memRegThreadSelf), moveRR{source=eax, output=esi} ] 165 | (X64Unix, 2) => 166 [ loadMemory(edi, ebp, memRegThreadSelf), moveRR{source=eax, output=esi}, moveRR{source=ebx, output=edx} ] 167 | (X64Unix, 3) => 168 [ loadMemory(edi, ebp, memRegThreadSelf), moveRR{source=eax, output=esi}, moveRR{source=ebx, output=edx}, moveRR{source=r8, output=ecx} ] 169 | (X64Win, 0) => [ loadMemory(ecx, ebp, memRegThreadSelf) ] 170 | (X64Win, 1) => [ loadMemory(ecx, ebp, memRegThreadSelf), moveRR{source=eax, output=edx} ] 171 | (X64Win, 2) => 172 [ loadMemory(ecx, ebp, memRegThreadSelf), moveRR{source=eax, output=edx}, moveRR{source=ebx, output=r8} ] 173 | (X64Win, 3) => 174 [ loadMemory(ecx, ebp, memRegThreadSelf), moveRR{source=eax, output=edx}, moveRR{source=r8, output=r9}, moveRR{source=ebx, output=r8} ] 175 | (X86_32, 0) => [ PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] 176 | (X86_32, 1) => [ pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] 177 | (X86_32, 2) => [ pushR ebx, pushR eax, PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) ] 178 | (X86_32, 3) => 179 [ 180 (* We need to move an argument from the ML stack. *) 181 PushToStack(MemoryArg{base=edi, offset=4, index=NoIndex}), pushR ebx, pushR eax, 182 PushToStack(MemoryArg{base=ebp, offset=memRegThreadSelf, index=NoIndex}) 183 ] 184 | _ => raise InternalError "rtsCall: Abi/argument count not implemented" 185 ) @ 186 [ 187 CallFunction(DirectReg entryPtrReg), (* Call the function *) 188 loadMemory(esp, ebp, memRegStackPtr) (* Restore the ML stack pointer. *) 189 ] @ 190 ( 191 if isX64 then [loadMemory(r15, ebp, memRegLocalMPointer) ] (* Copy back the heap ptr *) 192 else [] 193 ) @ 194 [ 195 ArithMemConst{opc=CMP, address={offset=memRegExceptionPacket, base=ebp, index=NoIndex}, source=noException, opSize=polyWordOpSize}, 196 ConditionalBranch{test=JNE, label=exLabel}, 197 (* Remove any arguments that have been passed on the stack. *) 198 ReturnFromFunction(Int.max(case abi of X86_32 => nArgs-2 | _ => nArgs-5, 0)), 199 JumpLabel exLabel, (* else raise the exception *) 200 loadMemory(eax, ebp, memRegExceptionPacket), 201 RaiseException { workReg=ecx } 202 ] 203 204 val profileObject = createProfileObject functionName 205 val newCode = codeCreate (functionName, profileObject, debugSwitches) 206 val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=1(*One label.*), ops=code} 207 (* Have to create a closure for this *) 208 open Address 209 val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0) 210 in 211 assignWord(closure, 0w0, toMachineWord createdCode); 212 lock closure; 213 closure 214 end 215 216 (* This is a quicker version but can only be used if the RTS entry does 217 not allocated ML memory, raise an exception or need to suspend the thread. *) 218 fun rtsCallFast (functionName, nArgs, debugSwitches) = 219 let 220 val entryPointAddr = makeEntryPoint functionName 221 222 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 223 val abi = getABI() 224 225 val (entryPtrReg, saveMLStackPtrReg) = 226 if isX64 then (r11, r13) else (ecx, edi) 227 228 val stackSpace = 229 case abi of 230 X64Unix => memRegSize 231 | X64Win => memRegSize + 32 (* Requires 32-byte save area. *) 232 | X86_32 => 233 let 234 (* GCC likes to keep the stack on a 16-byte alignment. *) 235 val argSpace = nArgs*4 236 val align = argSpace mod 16 237 in 238 (* Add sufficient space so that esp will be 16-byte aligned *) 239 if align = 0 240 then memRegSize 241 else memRegSize + 16 - align 242 end 243 244 val code = 245 [ 246 MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *) 247 loadMemory(entryPtrReg, entryPtrReg, 0),(* Load its value. *) 248 moveRR{source=esp, output=saveMLStackPtrReg}, (* Save ML stack and switch to C stack. *) 249 loadMemory(esp, ebp, memRegCStackPtr), 250 (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) 251 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} 252 ] @ 253 ( 254 case (abi, nArgs) of (* Set the argument registers. *) 255 (_, 0) => [] 256 | (X64Unix, 1) => [ moveRR{source=eax, output=edi} ] 257 | (X64Unix, 2) => 258 [ moveRR{source=eax, output=edi}, moveRR{source=ebx, output=esi} ] 259 | (X64Unix, 3) => 260 [ moveRR{source=eax, output=edi}, moveRR{source=ebx, output=esi}, moveRR{source=r8, output=edx} ] 261 | (X64Unix, 4) => 262 [ moveRR{source=eax, output=edi}, moveRR{source=ebx, output=esi}, moveRR{source=r8, output=edx}, moveRR{source=r9, output=ecx} ] 263 | (X64Win, 1) => [ moveRR{source=eax, output=ecx} ] 264 | (X64Win, 2) => [ moveRR{source=eax, output=ecx}, moveRR{source=ebx, output=edx} ] 265 | (X64Win, 3) => 266 [ moveRR{source=eax, output=ecx}, moveRR{source=ebx, output=edx} (* Arg3 is already in r8. *) ] 267 | (X64Win, 4) => 268 [ moveRR{source=eax, output=ecx}, moveRR{source=ebx, output=edx} (* Arg3 is already in r8 and arg4 in r9. *) ] 269 | (X86_32, 1) => [ pushR eax ] 270 | (X86_32, 2) => [ pushR ebx, pushR eax ] 271 | (X86_32, 3) => 272 [ 273 (* We need to move an argument from the ML stack. *) 274 loadMemory(edx, saveMLStackPtrReg, 4), pushR edx, pushR ebx, pushR eax 275 ] 276 | (X86_32, 4) => 277 [ 278 (* We need to move an arguments from the ML stack. *) 279 loadMemory(edx, saveMLStackPtrReg, 4), pushR edx, 280 loadMemory(edx, saveMLStackPtrReg, 8), pushR edx, 281 pushR ebx, pushR eax 282 ] 283 | _ => raise InternalError "rtsCall: Abi/argument count not implemented" 284 ) @ 285 [ 286 CallFunction(DirectReg entryPtrReg), (* Call the function *) 287 moveRR{source=saveMLStackPtrReg, output=esp}, (* Restore the ML stack pointer *) 288 (* Remove any arguments that have been passed on the stack. *) 289 ReturnFromFunction(Int.max(case abi of X86_32 => nArgs-2 | _ => nArgs-5, 0)) 290 ] 291 292 val profileObject = createProfileObject functionName 293 val newCode = codeCreate (functionName, profileObject, debugSwitches) 294 val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code} 295 (* Have to create a closure for this *) 296 open Address 297 val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0) 298 in 299 assignWord(closure, 0w0, toMachineWord createdCode); 300 lock closure; 301 closure 302 end 303 304 (* RTS call with one double-precision floating point argument and a floating point result. 305 First version. This will probably be merged into the above code in due 306 course. 307 Currently ML always uses boxed values for floats. *) 308 fun rtsCallFastFloattoFloat (functionName, debugSwitches) = 309 let 310 val entryPointAddr = makeEntryPoint functionName 311 312 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 313 val abi = getABI() 314 315 val (entryPtrReg, saveMLStackPtrReg) = 316 if isX64 then (r11, r13) else (ecx, edi) 317 318 val stackSpace = 319 case abi of 320 X64Unix => memRegSize 321 | X64Win => memRegSize + 32 (* Requires 32-byte save area. *) 322 | X86_32 => 323 let 324 (* GCC likes to keep the stack on a 16-byte alignment. *) 325 val argSpace = 8 (*nArgs*4*) (* One "double" value. *) 326 val align = argSpace mod 16 327 in 328 (* Add sufficient space so that esp will be 16-byte aligned *) 329 if align = 0 330 then memRegSize 331 else memRegSize + 16 - align 332 end 333 334 (* Constants for a box for a float *) 335 val fpBoxSize = 8 div Word.toInt wordSize 336 val fpBoxLengthWord32 = IntInf.orb(IntInf.fromInt fpBoxSize, IntInf.<<(Word8.toLargeInt F_bytes, 0w24)) 337 338 val code = 339 [ 340 MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *) 341 loadMemory(entryPtrReg, entryPtrReg, 0),(* Load its value. *) 342 moveRR{source=esp, output=saveMLStackPtrReg}, (* Save ML stack and switch to C stack. *) 343 loadMemory(esp, ebp, memRegCStackPtr), 344 (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) 345 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} 346 ] @ 347 ( 348 case abi of 349 (* X64 on both Windows and Unix take the first arg in xmm0. We need to 350 unbox the value pointed at by rax. *) 351 X64Unix => [ XMMArith { opc= SSE2Move, source=MemoryArg{base=eax, offset=0, index=NoIndex}, output=xmm0 } ] 352 | X64Win => [ XMMArith { opc= SSE2Move, source=MemoryArg{base=eax, offset=0, index=NoIndex}, output=xmm0 } ] 353 | X86_32 => 354 (* eax contains the address of the value. This must be unboxed onto the stack. *) 355 [ 356 FPLoadFromMemory{address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision}, 357 ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize}, 358 FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true } 359 ] 360 ) @ 361 [ 362 CallFunction(DirectReg entryPtrReg), (* Call the function *) 363 moveRR{source=saveMLStackPtrReg, output=esp} (* Restore the ML stack pointer *) 364 ] @ 365 ( 366 (* Put the floating point result into a box. *) 367 case abi of 368 X86_32 => 369 [ 370 AllocStore{size=fpBoxSize, output=eax, saveRegs=[]}, 371 StoreConstToMemory{toStore=fpBoxLengthWord32, 372 address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}}, 373 FPStoreToMemory{ address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true }, 374 StoreInitialised 375 ] 376 | _ => (* X64 The result is in xmm0 *) 377 [ 378 AllocStore{size=fpBoxSize, output=eax, saveRegs=[]}, 379 StoreConstToMemory{toStore=LargeInt.fromInt fpBoxSize, 380 address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}}, 381 StoreNonWordConst{size=Size8Bit, toStore=Word8.toLargeInt F_bytes, address={offset= ~1, base=eax, index=NoIndex}}, 382 XMMStoreToMemory { address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, toStore=xmm0 }, 383 StoreInitialised 384 ] 385 ) @ 386 [ 387 (* Remove any arguments that have been passed on the stack. *) 388 ReturnFromFunction 0 389 ] 390 391 val profileObject = createProfileObject functionName 392 val newCode = codeCreate (functionName, profileObject, debugSwitches) 393 val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code} 394 (* Have to create a closure for this *) 395 open Address 396 val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0) 397 in 398 assignWord(closure, 0w0, toMachineWord createdCode); 399 lock closure; 400 closure 401 end 402 403 (* RTS call with one general (i.e. ML word) argument and a floating point result. 404 This is used only to convert arbitrary precision values to floats. 405 In due course this will be merged into the other functions. *) 406 fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) = 407 let 408 val entryPointAddr = makeEntryPoint functionName 409 410 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 411 val abi = getABI() 412 413 val (entryPtrReg, saveMLStackPtrReg) = 414 if isX64 then (r11, r13) else (ecx, edi) 415 416 val stackSpace = 417 case abi of 418 X64Unix => memRegSize 419 | X64Win => memRegSize + 32 (* Requires 32-byte save area. *) 420 | X86_32 => 421 let 422 (* GCC likes to keep the stack on a 16-byte alignment. *) 423 val argSpace = 4 (* One "PolyWord" value. *) 424 val align = argSpace mod 16 425 in 426 (* Add sufficient space so that esp will be 16-byte aligned *) 427 if align = 0 428 then memRegSize 429 else memRegSize + 16 - align 430 end 431 432 (* Constants for a box for a float *) 433 val fpBoxSize = 8 div Word.toInt wordSize 434 val fpBoxLengthWord32 = IntInf.orb(IntInf.fromInt fpBoxSize, IntInf.<<(Word8.toLargeInt F_bytes, 0w24)) 435 436 val code = 437 [ 438 MoveToRegister{source=AddressConstArg entryPointAddr, output=entryPtrReg}, (* Load the entry point ref. *) 439 loadMemory(entryPtrReg, entryPtrReg, 0),(* Load its value. *) 440 moveRR{source=esp, output=saveMLStackPtrReg}, (* Save ML stack and switch to C stack. *) 441 loadMemory(esp, ebp, memRegCStackPtr), 442 (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) 443 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} 444 ] @ 445 ( 446 case abi of 447 X64Unix => [ moveRR{source=eax, output=edi} ] 448 | X64Win => [ moveRR{source=eax, output=ecx} ] 449 | X86_32 => [ pushR eax ] 450 ) @ 451 [ 452 CallFunction(DirectReg entryPtrReg), (* Call the function *) 453 moveRR{source=saveMLStackPtrReg, output=esp} (* Restore the ML stack pointer *) 454 ] @ 455 ( 456 (* Put the floating point result into a box. *) 457 case abi of 458 X86_32 => 459 [ 460 AllocStore{size=fpBoxSize, output=eax, saveRegs=[]}, 461 StoreConstToMemory{toStore=fpBoxLengthWord32, 462 address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}}, 463 FPStoreToMemory{ address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true }, 464 StoreInitialised 465 ] 466 | _ => (* X64 The result is in xmm0 *) 467 [ 468 AllocStore{size=fpBoxSize, output=eax, saveRegs=[]}, 469 StoreConstToMemory{toStore=LargeInt.fromInt fpBoxSize, 470 address={offset= ~ (Word.toInt wordSize), base=eax, index=NoIndex}}, 471 StoreNonWordConst{size=Size8Bit, toStore=Word8.toLargeInt F_bytes, address={offset= ~1, base=eax, index=NoIndex}}, 472 XMMStoreToMemory { address={base=eax, offset=0, index=NoIndex}, precision=DoublePrecision, toStore=xmm0 }, 473 StoreInitialised 474 ] 475 ) @ 476 [ 477 (* Remove any arguments that have been passed on the stack. *) 478 ReturnFromFunction 0 479 ] 480 481 val profileObject = createProfileObject functionName 482 val newCode = codeCreate (functionName, profileObject, debugSwitches) 483 val createdCode = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code} 484 (* Have to create a closure for this *) 485 open Address 486 val closure = allocWordData(0w1, Word8.orb (F_mutable, F_words), toMachineWord 0w0) 487 in 488 assignWord(closure, 0w0, toMachineWord createdCode); 489 lock closure; 490 closure 491 end 492 493end; 494