1(* 2 Copyright (c) 2016-20 David C.J. Matthews 3 4 This library is free software; you can redistribute it and/or 5 modify it under the terms of the GNU Lesser General Public 6 License version 2.1 as published by the Free Software Foundation. 7 8 This library is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 Lesser General Public License for more details. 12 13 You should have received a copy of the GNU Lesser General Public 14 License along with this library; if not, write to the Free Software 15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16*) 17 18functor X86FOREIGNCALL( 19 20 structure X86CODE: X86CODESIG 21 22 structure X86OPTIMISE: 23 sig 24 type operation 25 type code 26 type operations = operation list 27 type closureRef 28 29 (* Optimise and code-generate. *) 30 val generateCode: {code: code, ops: operations, labelCount: int, resultClosure: closureRef} -> unit 31 32 structure Sharing: 33 sig 34 type operation = operation 35 type code = code 36 type closureRef = closureRef 37 end 38 end 39 40 structure DEBUG: DEBUG 41 42 structure CODE_ARRAY: CODEARRAYSIG 43 44 sharing X86CODE.Sharing = X86OPTIMISE.Sharing = CODE_ARRAY.Sharing 45): FOREIGNCALLSIG 46= 47struct 48 open X86CODE 49 open Address 50 open CODE_ARRAY 51 52 (* Unix X64. The first six arguments are in rdi, rsi, rdx, rcx, r8, r9. 53 The rest are on the stack. 54 Windows X64. The first four arguments are in rcx, rdx, r8 and r9. The rest are 55 on the stack. The caller must ensure the stack is aligned on 16-byte boundary 56 and must allocate 32-byte save area for the register args. 57 rbx, rbp, rdi, rsi, rsp, r12-r15 are saved by the called function. 58 X86/32. Arguments are pushed to the stack. 59 ebx, edi, esi, ebp and esp are saved by the called function. 60 We use esi to hold the argument data pointer and edi to save the ML stack pointer 61 Our ML conventions use eax, ebx for the first two arguments in X86/32, 62 rax, ebx, r8, r9, r10 for the first five arguments in X86/64 and 63 rax, rsi, r8, r9 and r10 for the first five arguments in X86/64-32 bit. 64 *) 65 66 val memRegSize = 0 67 68 val (polyWordOpSize, nativeWordOpSize) = 69 case targetArch of 70 Native32Bit => (OpSize32, OpSize32) 71 | Native64Bit => (OpSize64, OpSize64) 72 | ObjectId32Bit => (OpSize32, OpSize64) 73 74 (* Ebx/Rbx is used for the second argument on the native architectures but 75 is replaced by esi on the object ID arch because ebx is used as the 76 global base register. *) 77 val mlArg2Reg = case targetArch of ObjectId32Bit => esi | _ => ebx 78 79 exception InternalError = Misc.InternalError 80 81 fun opSizeToMove OpSize32 = Move32 | opSizeToMove OpSize64 = Move64 82 83 val pushR = PushToStack o RegisterArg 84 85 fun moveRR{source, output, opSize} = 86 Move{source=RegisterArg source, destination=RegisterArg output, moveSize=opSizeToMove opSize} 87 88 fun loadMemory(reg, base, offset, opSize) = 89 Move{source=MemoryArg{base=base, offset=offset, index=NoIndex}, destination=RegisterArg reg, moveSize=opSizeToMove opSize} 90 and storeMemory(reg, base, offset, opSize) = 91 Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset, index=NoIndex}, moveSize=opSizeToMove opSize} 92 93 val loadHeapMemory = 94 case targetArch of 95 ObjectId32Bit => 96 ( 97 fn (reg, base, offset, opSize) => 98 Move{source=MemoryArg{base=ebx, offset=offset, index=Index4 base}, 99 destination=RegisterArg reg, moveSize=opSizeToMove opSize} 100 ) 101 | _ => loadMemory 102 103 104 fun loadAddress{source=(srcReg, 0), destination} = 105 Move{source=RegisterArg srcReg, destination=RegisterArg destination, moveSize=opSizeToMove nativeWordOpSize} 106 | loadAddress{source=(srcReg, srcOffset), destination} = 107 LoadAddress{offset=srcOffset, base=SOME srcReg, index=NoIndex, output=destination, opSize=nativeWordOpSize } 108 109 (* Sequence of operations to move memory. *) 110 fun moveMemory{source, destination, length} = 111 [ 112 loadAddress{source=source, destination=rsi}, 113 loadAddress{source=destination, destination=rdi}, 114 (* N.B. When moving a struct in a Win64 callback the source could be rcx so only move this 115 after copying the source to rsi. *) 116 Move{source=NonAddressConstArg(LargeInt.fromInt length), destination=RegisterArg rcx, 117 moveSize=opSizeToMove nativeWordOpSize}, 118 RepeatOperation MOVS8 119 ] 120 121 fun createProfileObject _ (*functionName*) = 122 let 123 (* The profile object is a single mutable with the F_bytes bit set. *) 124 open Address 125 val profileObject = RunCall.allocateByteMemory(0w1, Word.fromLargeWord(Word8.toLargeWord(Word8.orb(F_mutable, F_bytes)))) 126 fun clear 0w0 = () 127 | clear i = (assignByte(profileObject, i-0w1, 0w0); clear (i-0w1)) 128 val () = clear wordSize 129 in 130 toMachineWord profileObject 131 end 132 133 val makeEntryPoint: string -> machineWord = RunCall.rtsCallFull1 "PolyCreateEntryPointObject" 134 135 datatype abi = X86_32 | X64Win | X64Unix 136 137 local 138 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 139 val getABICall: unit -> int = RunCall.rtsCallFast0 "PolyGetABI" 140 in 141 fun getABI() = 142 case getABICall() of 143 0 => X86_32 144 | 1 => X64Unix 145 | 2 => X64Win 146 | n => raise InternalError ("Unknown ABI type " ^ Int.toString n) 147 end 148 149 (* This is now the standard entry call code. *) 150 datatype fastArgs = FastArgFixed | FastArgDouble | FastArgFloat 151 152 153 fun rtsCallFastGeneral (functionName, argFormats, (*resultFormat*) _, debugSwitches) = 154 let 155 val entryPointAddr = makeEntryPoint functionName 156 157 (* Get the ABI. On 64-bit Windows and Unix use different calling conventions. *) 158 val abi = getABI() 159 160 val entryPtrReg = if targetArch <> Native32Bit then r11 else ecx 161 162 val nArgs = List.length argFormats 163 164 local 165 (* Compute stack space. The actual number of args passed is nArgs. *) 166 val argSpace = 167 case abi of 168 X64Unix => Int.max(0, nArgs-6)*8 169 | X64Win => Int.max(0, nArgs-4)*8 170 | X86_32 => List.foldl(fn (FastArgDouble, n) => n+8 | (_, n) => n+4) 0 argFormats 171 val align = argSpace mod 16 172 in 173 (* Add sufficient space so that esp will be 16-byte aligned after we 174 have pushed any arguments we need to push. *) 175 val stackSpace = 176 if align = 0 177 then memRegSize 178 else memRegSize + 16 - align 179 end 180 181 (* The number of ML arguments passed on the stack. *) 182 val mlArgsOnStack = Int.max(case abi of X86_32 => nArgs - 2 | _ => nArgs - 5, 0) 183 184 val code = 185 [ 186 Move{source=AddressConstArg entryPointAddr, destination=RegisterArg entryPtrReg, moveSize=opSizeToMove polyWordOpSize}, (* Load the entry point ref. *) 187 loadHeapMemory(entryPtrReg, entryPtrReg, 0, nativeWordOpSize)(* Load its value. *) 188 ] @ 189 ( 190 (* Save heap ptr. This is in r15 in X86/64 *) 191 if targetArch <> Native32Bit then [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] (* Save heap ptr *) 192 else [] 193 ) @ 194 ( 195 if (case abi of X86_32 => nArgs >= 3 | _ => nArgs >= 6) 196 then [moveRR{source=esp, output=edi, opSize=nativeWordOpSize}] (* Needed if we have to load from the stack. *) 197 else [] 198 ) @ 199 [ 200 storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) 201 loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize), 202 (* Set the stack pointer past the data on the stack. For Windows/64 add in a 32 byte save area *) 203 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackSpace), opSize=nativeWordOpSize} 204 ] @ 205 ( 206 case abi of (* Set the argument registers. *) 207 X86_32 => 208 let 209 fun pushReg(reg, FastArgFixed) = [pushR reg] 210 | pushReg(reg, FastArgDouble) = 211 (* reg contains the address of the value. This must be unboxed onto the stack. *) 212 [ 213 FPLoadFromMemory{address={base=reg, offset=0, index=NoIndex}, precision=DoublePrecision}, 214 ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 8, opSize=nativeWordOpSize}, 215 FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=DoublePrecision, andPop=true } 216 ] 217 | pushReg(reg, FastArgFloat) = 218 (* reg contains the address of the value. This must be unboxed onto the stack. *) 219 [ 220 FPLoadFromMemory{address={base=reg, offset=0, index=NoIndex}, precision=SinglePrecision}, 221 ArithToGenReg{ opc=SUB, output=esp, source=NonAddressConstArg 4, opSize=nativeWordOpSize}, 222 FPStoreToMemory{ address={base=esp, offset=0, index=NoIndex}, precision=SinglePrecision, andPop=true } 223 ] 224 225 (* The stack arguments have to be copied first followed by the ebx and finally eax. *) 226 fun pushArgs (_, []) = [] 227 | pushArgs (_, [argType]) = pushReg(eax, argType) 228 | pushArgs (_, [arg2Type, arg1Type]) = pushReg(ebx, arg2Type) @ pushReg(eax, arg1Type) 229 | pushArgs (n, FastArgFixed :: argTypes) = 230 PushToStack(MemoryArg{base=edi, offset=(nArgs-n+1)* 4, index=NoIndex}) :: pushArgs(n-1, argTypes) 231 | pushArgs (n, argType :: argTypes) = 232 (* Use esi as a temporary register. *) 233 loadMemory(esi, edi, (nArgs-n+1)* 4, polyWordOpSize) :: pushReg(esi, argType) @ pushArgs(n-1, argTypes) 234 in 235 pushArgs(nArgs, List.rev argFormats) 236 end 237 238 | X64Unix => 239 ( 240 if List.all (fn FastArgFixed => true | _ => false) argFormats 241 then 242 let 243 fun pushArgs 0 = [] 244 | pushArgs 1 = [moveRR{source=eax, output=edi, opSize=polyWordOpSize}] 245 | pushArgs 2 = moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize} :: pushArgs 1 246 | pushArgs 3 = moveRR{source=r8, output=edx, opSize=polyWordOpSize} :: pushArgs 2 247 | pushArgs 4 = moveRR{source=r9, output=ecx, opSize=polyWordOpSize} :: pushArgs 3 248 | pushArgs 5 = 249 (* We have to move r8 into edx before we can move r10 into r8 *) 250 moveRR{source=r8, output=edx, opSize=polyWordOpSize} :: 251 moveRR{source=r9, output=ecx, opSize=polyWordOpSize} :: 252 moveRR{source=r10, output=r8, opSize=polyWordOpSize} :: pushArgs 2 253 | pushArgs 6 = 254 (* We have to move r9 into edi before we can load r9 from the stack. *) 255 moveRR{source=r8, output=edx, opSize=polyWordOpSize} :: 256 moveRR{source=r9, output=ecx, opSize=polyWordOpSize} :: 257 loadMemory(r9, edi, 8, polyWordOpSize) :: 258 moveRR{source=r10, output=r8, opSize=polyWordOpSize} :: pushArgs 2 259 | pushArgs _ = raise InternalError "rtsCall: Abi/argument count not implemented" 260 in 261 pushArgs nArgs 262 end 263 else case argFormats of 264 [] => [] 265 | [FastArgFixed] => [ moveRR{source=eax, output=edi, opSize=polyWordOpSize} ] 266 | [FastArgFixed, FastArgFixed] => 267 (* Since mlArgs2Reg is esi on 32-in-64 this is redundant. *) 268 [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize} ] 269 | [FastArgFixed, FastArgFixed, FastArgFixed] => 270 [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize}, 271 moveRR{source=r8, output=edx, opSize=polyWordOpSize} ] 272 | [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed] => 273 [ moveRR{source=mlArg2Reg, output=esi, opSize=polyWordOpSize}, moveRR{source=eax, output=edi, opSize=polyWordOpSize}, 274 moveRR{source=r8, output=edx, opSize=polyWordOpSize}, moveRR{source=r9, output=ecx, opSize=polyWordOpSize} ] 275 (* One "double" argument. The value needs to be unboxed. *) 276 | [FastArgDouble] => [] (* Already in xmm0 *) 277 (* X64 on both Windows and Unix take the first arg in xmm0 and the second in xmm1. They are already there. *) 278 | [FastArgDouble, FastArgDouble] => [] 279 | [FastArgDouble, FastArgFixed] => [ moveRR{source=eax, output=edi, opSize=nativeWordOpSize} ] 280 | [FastArgFloat] => [] (* Already in xmm0 *) 281 | [FastArgFloat, FastArgFloat] => [] (* Already in xmm0 and xmm1 *) 282 (* One float argument and one fixed. *) 283 | [FastArgFloat, FastArgFixed] => [moveRR{source=mlArg2Reg, output=edi, opSize=polyWordOpSize} ] 284 285 | _ => raise InternalError "rtsCall: Abi/argument count not implemented" 286 287 ) 288 289 | X64Win => 290 ( 291 if List.all (fn FastArgFixed => true | _ => false) argFormats 292 then 293 let 294 fun pushArgs 0 = [] 295 | pushArgs 1 = [moveRR{source=eax, output=ecx, opSize=polyWordOpSize}] 296 | pushArgs 2 = moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} :: pushArgs 1 297 | pushArgs 3 = (* Already in r8 *) pushArgs 2 298 | pushArgs 4 = (* Already in r9, and r8 *) pushArgs 2 299 | pushArgs 5 = pushR r10 :: pushArgs 2 300 | pushArgs 6 = PushToStack(MemoryArg{base=edi, offset=8, index=NoIndex}) :: pushArgs 5 301 | pushArgs _ = raise InternalError "rtsCall: Abi/argument count not implemented" 302 in 303 pushArgs nArgs 304 end 305 306 else case argFormats of 307 [FastArgFixed] => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize} ] 308 | [FastArgFixed, FastArgFixed] => [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} ] 309 | [FastArgFixed, FastArgFixed, FastArgFixed] => 310 [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} (* Arg3 is already in r8. *) ] 311 | [FastArgFixed, FastArgFixed, FastArgFixed, FastArgFixed] => 312 [ moveRR{source=eax, output=ecx, opSize=polyWordOpSize}, moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize} (* Arg3 is already in r8 and arg4 in r9. *) ] 313 | [FastArgDouble] => [ (* Already in xmm0 *) ] 314 (* X64 on both Windows and Unix take the first arg in xmm0 and the second in xmm1. They are already there. *) 315 | [FastArgDouble, FastArgDouble] => [ ] 316 (* X64 on both Windows and Unix take the first arg in xmm0. On Unix the integer argument is treated 317 as the first argument and goes into edi. On Windows it's treated as the second and goes into edx. 318 N.B. It's also the first argument in ML so is in rax. *) 319 | [FastArgDouble, FastArgFixed] => [ moveRR{source=eax, output=edx, opSize=nativeWordOpSize} ] 320 | [FastArgFloat] => [] 321 | [FastArgFloat, FastArgFloat] => [] (* Already in xmm0 and xmm1 *) 322 | [FastArgFloat, FastArgFixed] => [moveRR{source=mlArg2Reg, output=edx, opSize=polyWordOpSize}] 323 324 | _ => raise InternalError "rtsCall: Abi/argument count not implemented" 325 ) 326 ) @ 327 (* For Windows/64 add in a 32 byte save area ater we've pushed any arguments. *) 328 (case abi of X64Win => [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg 32, opSize=nativeWordOpSize}] | _ => []) @ 329 [ 330 CallAddress(RegisterArg entryPtrReg), (* Call the function *) 331 loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize) (* Restore the ML stack pointer. *) 332 ] @ 333 ( 334 if targetArch <> Native32Bit then [loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize) ] (* Copy back the heap ptr *) 335 else [] 336 ) @ 337 [ 338 (* Since this is an ML function we need to remove any ML stack arguments. *) 339 ReturnFromFunction mlArgsOnStack 340 ] 341 342 val profileObject = createProfileObject functionName 343 val newCode = codeCreate (functionName, profileObject, debugSwitches) 344 val closure = makeConstantClosure() 345 val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure} 346 in 347 closureAsAddress closure 348 end 349 350 351 fun rtsCallFast (functionName, nArgs, debugSwitches) = 352 rtsCallFastGeneral (functionName, List.tabulate(nArgs, fn _ => FastArgFixed), FastArgFixed, debugSwitches) 353 354 (* RTS call with one double-precision floating point argument and a floating point result. *) 355 fun rtsCallFastRealtoReal (functionName, debugSwitches) = 356 rtsCallFastGeneral (functionName, [FastArgDouble], FastArgDouble, debugSwitches) 357 358 (* RTS call with two double-precision floating point arguments and a floating point result. *) 359 fun rtsCallFastRealRealtoReal (functionName, debugSwitches) = 360 rtsCallFastGeneral (functionName, [FastArgDouble, FastArgDouble], FastArgDouble, debugSwitches) 361 362 (* RTS call with one double-precision floating point argument, one fixed point argument and a 363 floating point result. *) 364 fun rtsCallFastRealGeneraltoReal (functionName, debugSwitches) = 365 rtsCallFastGeneral (functionName, [FastArgDouble, FastArgFixed], FastArgDouble, debugSwitches) 366 367 (* RTS call with one general (i.e. ML word) argument and a floating point result. 368 This is used only to convert arbitrary precision values to floats. *) 369 fun rtsCallFastGeneraltoReal (functionName, debugSwitches) = 370 rtsCallFastGeneral (functionName, [FastArgFixed], FastArgDouble, debugSwitches) 371 372 (* Operations on Real32.real values. *) 373 374 fun rtsCallFastFloattoFloat (functionName, debugSwitches) = 375 rtsCallFastGeneral (functionName, [FastArgFloat], FastArgFloat, debugSwitches) 376 377 fun rtsCallFastFloatFloattoFloat (functionName, debugSwitches) = 378 rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFloat], FastArgFloat, debugSwitches) 379 380 (* RTS call with one double-precision floating point argument, one fixed point argument and a 381 floating point result. *) 382 fun rtsCallFastFloatGeneraltoFloat (functionName, debugSwitches) = 383 rtsCallFastGeneral (functionName, [FastArgFloat, FastArgFixed], FastArgFloat, debugSwitches) 384 385 (* RTS call with one general (i.e. ML word) argument and a floating point result. 386 This is used only to convert arbitrary precision values to floats. *) 387 fun rtsCallFastGeneraltoFloat (functionName, debugSwitches) = 388 rtsCallFastGeneral (functionName, [FastArgFixed], FastArgFloat, debugSwitches) 389 390 datatype ffiABI = 391 FFI_SYSV (* Unix 32 bit and Windows GCC 32-bit *) 392 | FFI_STDCALL (* Windows 32-bit system ABI. Callee clears the stack. *) 393 | FFI_MS_CDECL (* VS 32-bit. Same as SYSV except when returning a struct. Default on Windows including GCC in Mingw. *) 394 | FFI_WIN64 (* Windows 64 bit *) 395 | FFI_UNIX64 (* Unix 64 bit. libffi also implements this on X86/32. *) 396 (* We don't include various other 32-bit Windows ABIs. *) 397 398 local 399 val getOSType: unit -> int = RunCall.rtsCallFast0 "PolyGetOSType" 400 in 401 (* This actually a constant since each exported saved state has 402 a distinct ABI. However for compatibility with the interpreted 403 version we make this a function. *) 404 fun abiList () = 405 case getABI() of 406 X86_32 => 407 [("sysv", FFI_SYSV), ("stdcall", FFI_STDCALL), ("ms_cdecl", FFI_MS_CDECL), 408 (* Default to MS_CDECL on Windows otherwise SYSV. *) 409 ("default", if getOSType() = 1 then FFI_MS_CDECL else FFI_SYSV)] 410 | X64Win => [("win64", FFI_WIN64), ("default", FFI_WIN64)] 411 | X64Unix => [("unix64", FFI_UNIX64), ("default", FFI_UNIX64)] 412 413 type abi = ffiABI 414 end 415 416 fun alignUp(s, align) = Word.andb(s + align-0w1, ~ align) 417 fun intAlignUp(s, align) = Word.toInt(alignUp(Word.fromInt s, align)) 418 419 val getThreadDataCall = makeEntryPoint "PolyX86GetThreadData" 420 421 local 422 val sysWordSize = Word.toInt(nativeWordSize div wordSize) 423 in 424 (* Code to box an address as a SysWord.word value *) 425 fun boxRegAsSysWord(boxReg, outputReg, saveRegs) = 426 AllocStore{ size=sysWordSize, output=outputReg, saveRegs=saveRegs } :: 427 ( 428 if targetArch = Native64Bit 429 then 430 [ 431 Move{source=NonAddressConstArg(LargeInt.fromInt sysWordSize), 432 destination=MemoryArg {offset= ~ (Word.toInt wordSize), base=outputReg, index=NoIndex}, 433 moveSize=opSizeToMove polyWordOpSize}, 434 Move{moveSize=Move8, source=NonAddressConstArg 1 (* byte *), destination=MemoryArg {offset= ~1, base=outputReg, index=NoIndex}} 435 ] 436 else 437 let 438 val lengthWord = IntInf.orb(IntInf.fromInt sysWordSize, IntInf.<<(1, 0w24)) 439 in 440 [Move{source=NonAddressConstArg lengthWord, destination=MemoryArg {offset= ~ (Word.toInt wordSize), base=outputReg, index=NoIndex}, 441 moveSize=opSizeToMove polyWordOpSize}] 442 end 443 ) @ 444 Move{source=RegisterArg boxReg, destination=MemoryArg {offset=0, base=outputReg, index=NoIndex}, moveSize=opSizeToMove nativeWordOpSize} :: 445 ( 446 if targetArch = ObjectId32Bit 447 then 448 [ 449 ArithToGenReg{ opc=SUB, output=outputReg, source=RegisterArg rbx, opSize=nativeWordOpSize }, 450 ShiftConstant{ shiftType=SHR, output=outputReg, shift=0w2, opSize=OpSize64 } 451 ] 452 else [] 453 ) @ [StoreInitialised] 454 end 455 456 (* Build a foreign call function. The arguments are the abi, the list of argument types and the result type. 457 The result is the code of the ML function that takes three arguments: the C function to call, the arguments 458 as a vector of C values and the address of the memory for the result. *) 459 460 (* This must match the type in Foreign.LowLevel. Once this is bootstrapped we could use that 461 type but note that this is the type we use within the compiler and we build Foreign.LowLevel 462 AFTER compiling this. *) 463 datatype cTypeForm = 464 CTypeFloatingPt | CTypePointer | CTypeSignedInt | CTypeUnsignedInt 465 | CTypeStruct of cType list | CTypeVoid 466 withtype cType = { typeForm: cTypeForm, align: word, size: word } 467 468 fun call32Bits(abi, args, result) = 469 let 470 (* 32-bit arguments. These all go to the stack so we can simply push them. The arguments go on the 471 stack in reverse order. *) 472 fun loadArgs32([], stackOffset, argOffset, code, continue) = continue(stackOffset, argOffset, code) 473 474 | loadArgs32(arg::args, stackOffset, argOffset, code, continue) = 475 let 476 val {size, align, typeForm} = arg 477 val newArgOffset = alignUp(argOffset, align) 478 val baseAddr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex} 479 in 480 case (typeForm, size) of 481 (CTypeStruct elements, _) => (* structs passed as values are recursively unpacked. *) 482 loadArgs32(elements, stackOffset, newArgOffset (* Struct is aligned. *), code, 483 fn (so, ao, code) => loadArgs32(args, so, ao, code, continue)) 484 | (CTypeVoid, _) => raise Foreign.Foreign "Void cannot be used for a function argument" 485 | (CTypeUnsignedInt, 0w1) => (* Unsigned char. *) 486 loadArgs32(args, stackOffset+4, newArgOffset+size, 487 Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move8 } 488 :: PushToStack(RegisterArg edx) :: code, continue) 489 | (CTypeSignedInt, 0w1) => (* Signed char. *) 490 loadArgs32(args, stackOffset+4, newArgOffset+size, 491 Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move8X32 } 492 :: PushToStack(RegisterArg edx) :: code, continue) 493 | (CTypeUnsignedInt, 0w2) => (* Unsigned 16-bits. *) 494 loadArgs32(args, stackOffset+4, newArgOffset+size, 495 Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move16 } 496 :: PushToStack(RegisterArg edx) :: code, continue) 497 | (CTypeSignedInt, 0w2) => (* Signed 16-bits. *) 498 loadArgs32(args, stackOffset+4, newArgOffset+size, 499 Move{source=MemoryArg baseAddr, destination=RegisterArg edx, moveSize=Move16X32 } 500 :: PushToStack(RegisterArg edx) :: code, continue) 501 | (_, 0w4) => (* 32-bits. *) 502 loadArgs32(args, stackOffset+4, newArgOffset+size, PushToStack(MemoryArg baseAddr) :: code, continue) 503 | (CTypeFloatingPt, 0w8) =>(* Double: push the two words. High-order word first, then low-order. *) 504 loadArgs32(args, stackOffset+8, newArgOffset+size, 505 PushToStack(MemoryArg{base=mlArg2Reg, offset=Word.toInt newArgOffset+4, index=NoIndex}) :: 506 PushToStack(MemoryArg{base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex}) :: code, continue) 507 | _ => raise Foreign.Foreign "argument type not supported" 508 end 509 510 val {typeForm, size, ...} = result 511 512 val resultMemory = {base=ecx, offset=0, index=NoIndex} 513 (* Structures are passed by reference by storing the address of the result as 514 the first argument except that in MS_CDECL (and STDCALL?) structures of 515 size 1, 2, 4 and 8 are returned in EAX, and for 8, EDX. *) 516 val (getResult, needResultAddress) = 517 if (case typeForm of CTypeStruct _ => true | _ => false) andalso 518 (abi = FFI_SYSV orelse (size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8)) 519 (* TODO: We have to get the address of the destination area. *) 520 then ([], true) 521 else if typeForm = CTypeVoid 522 then ([], false) 523 else 524 (loadMemory(ecx, esp, 4, nativeWordOpSize) :: 525 loadHeapMemory(ecx, ecx, 0, nativeWordOpSize) :: 526 (if size = 0w1 527 then (* Single byte *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move8}] 528 else if size = 0w2 529 then (* 16-bits *) [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move16}] 530 else if typeForm = CTypeFloatingPt andalso size = 0w4 531 then [FPStoreToMemory{address=resultMemory, precision=SinglePrecision, andPop=true }] 532 else if size = 0w4 533 then [Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32}] 534 else if typeForm = CTypeFloatingPt andalso size = 0w8 535 then [FPStoreToMemory{address=resultMemory, precision=DoublePrecision, andPop=true }] 536 else if size = 0w8 537 then 538 [ 539 Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=Move32}, 540 Move{source=RegisterArg edx, destination=MemoryArg {base=ecx, offset=4, index=NoIndex}, moveSize=Move32} 541 ] 542 else raise Foreign.Foreign "Unrecognised result type"), false) 543 544 local 545 (* Load the arguments. If we need to pass the return address for a struct that is the first arg. *) 546 val (startStack, startCode) = 547 if needResultAddress 548 then (4, [PushToStack(MemoryArg{base=ecx, offset=0, index=NoIndex})]) 549 else (0, []) 550 in 551 val (argCode, argStack) = 552 loadArgs32(args, startStack, 0w0, startCode, fn (stackOffset, _, code) => (code, stackOffset)) 553 end 554 555 local 556 val align = argStack mod 16 557 in 558 (* Always align the stack. It's not always necessary on 32-bits but GCC prefers it. *) 559 val preArgAlign = if align = 0 then 0 else 16-align 560 (* Adjustment to be made when the function returns. Stdcall resets the stack in the callee. *) 561 val postCallStackReset = 562 preArgAlign + (if abi = FFI_STDCALL then 0 else argStack) 563 end 564 565 in 566 ( 567 (* If we're returning a struct we need the result address before we call. *) 568 if needResultAddress then [loadMemory(ecx, esp, 4, nativeWordOpSize)] else [] 569 ) @ 570 [ 571 (* Save the stack pointer. *) 572 storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) 573 loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize) (* Load the saved C stack pointer. *) 574 ] @ 575 ( 576 if preArgAlign = 0 577 then [] 578 else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}] 579 ) @ 580 ( 581 (* The second argument is a SysWord containing the address of a malloced area of memory 582 with the actual arguments in it. *) 583 if null args 584 then [] 585 else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)] 586 ) @ argCode @ 587 CallAddress(MemoryArg{base=eax, offset=0, index=NoIndex}) :: 588 (* Restore the C stack. This is really only necessary if we've called a callback 589 since that will store its esp value. *) 590 ( 591 if postCallStackReset = 0 592 then [] 593 else [ArithToGenReg{opc=ADD, output=esp, source=NonAddressConstArg(LargeInt.fromInt postCallStackReset), opSize=nativeWordOpSize}] 594 ) @ 595 [ 596 storeMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize), 597 loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize) (* Restore the ML stack pointer. *) 598 ] @ getResult @ (* Store the result in the destination. *) [ ReturnFromFunction 1 ] 599 end 600 601 fun closure32Bits(abi, args, result) = 602 let 603 (* Arguments are copied from the stack into a struct that is then passed to the 604 ML function. *) 605 fun copyArgs([], nArgs, argOffset, code, continue) = continue(nArgs, argOffset, code) 606 607 | copyArgs(arg::args, nArgs, argOffset, code, continue) = 608 let 609 val {size, align, typeForm} = arg 610 val newArgOffset = alignUp(argOffset, align) 611 val sourceAddr = {base=ebx, offset=nArgs*4, index=NoIndex} 612 val destAddr = {base=esp, offset=Word.toInt newArgOffset, index=NoIndex} 613 in 614 case (typeForm, size) of 615 (CTypeStruct elements, _) => 616 (* structs passed as values are recursively unpacked. *) 617 copyArgs(elements, nArgs, newArgOffset (* Struct is aligned. *), code, 618 fn (na, ao, c) => copyArgs(args, na, ao, c, continue)) 619 | (CTypeVoid, _) => 620 raise Foreign.Foreign "Void cannot be used for a function argument" 621 | (CTypeFloatingPt, 0w8) => 622 (* Double: copy the two words. High-order word first, then low-order. *) 623 copyArgs(args, nArgs+2, argOffset+size, 624 Move{source=MemoryArg sourceAddr, destination=RegisterArg eax, moveSize=Move32} :: 625 Move{source=RegisterArg eax, destination=MemoryArg destAddr, moveSize=Move32} :: 626 Move{source=MemoryArg {base=ebx, offset=nArgs*4+4, index=NoIndex}, destination=RegisterArg eax, moveSize=Move32} :: 627 Move{source=RegisterArg eax, destination=MemoryArg{base=esp, offset=Word.toInt newArgOffset + 4, index=NoIndex}, moveSize=Move32} :: 628 code, continue) 629 | _ => (* Everything else is a single word on the stack. *) 630 let 631 val moveOp = 632 case size of 633 0w1 => Move8 634 | 0w2 => Move16 635 | 0w4 => Move32 636 | _ => raise Foreign.Foreign "copyArgs: Invalid size" 637 in 638 copyArgs(args, nArgs+1, argOffset+size, 639 Move{source=MemoryArg sourceAddr, destination=RegisterArg eax, moveSize=Move32} :: 640 Move{source=RegisterArg eax, destination=MemoryArg destAddr, moveSize=moveOp} :: code, continue) 641 end 642 end 643 644 val {typeForm, size, align, ...} = result 645 646 (* Struct results are normally passed by reference. *) 647 val resultStructByRef = 648 (case typeForm of CTypeStruct _ => true | _ => false) andalso 649 (abi = FFI_SYSV orelse (size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8)) 650 651 val (argCount, argumentSpace, copyArgsFromStack) = 652 copyArgs(args, if resultStructByRef then 1 else 0, 0w0, [], fn result => result) 653 654 val resultOffset = alignUp(argumentSpace, align) (* Offset of result area *) 655 656 val (loadResults, resultSize) = 657 if typeForm = CTypeVoid orelse resultStructByRef 658 then ([], 0w0) 659 else 660 let 661 val resultMem = {base=esp, offset=Word.toInt resultOffset, index=NoIndex} 662 val resultCode = 663 case (typeForm, size) of 664 (CTypeSignedInt, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move8X32 }] 665 | (_, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move8 }] 666 | (CTypeSignedInt, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move16X32 }] 667 | (_, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move16 }] 668 | (CTypeFloatingPt, 0w4) => [FPLoadFromMemory{ address=resultMem, precision=SinglePrecision }] 669 | (_, 0w4) => [Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move32 }] 670 | (CTypeFloatingPt, 0w8) => [FPLoadFromMemory{ address=resultMem, precision=DoublePrecision }] 671 | (_, 0w8) => (* MSC only. Struct returned in eax/edx. *) 672 [ 673 Move{source=MemoryArg resultMem, destination=RegisterArg eax, moveSize=Move32 }, 674 Move{source=MemoryArg {base=esp, offset=Word.toInt resultOffset + 4, index=NoIndex}, 675 destination=RegisterArg edx, moveSize=Move32 } 676 ] 677 | _ => raise Foreign.Foreign "Unrecognised result type" 678 in 679 (resultCode, size) 680 end 681 682 val stackSpace = Word.toInt(resultOffset + resultSize) 683 684 local 685 val align = stackSpace mod 16 686 in 687 (* Stack space. In order to align the stack correctly for GCC we need the value in memRegCStackPtr 688 to be a multiple of 16 bytes + 8. esp would have been on a 16 byte boundary before the return address 689 was pushed so after pushing the return address and four registers we need a further 4 bytes 690 to get the alignment back again. The effect of this is that the argument and result area is 691 on an 8-byte boundary. *) 692 val stackToAllocate = stackSpace + (if align = 0 then 0 else 16-align) + 4 693 end 694 in 695 [ 696 (* Push callee-save registers. *) 697 PushToStack(RegisterArg ebp), PushToStack(RegisterArg ebx), PushToStack(RegisterArg edi), PushToStack(RegisterArg esi), 698 (* Set ebx to point to the original args. *) 699 LoadAddress{ output=ebx, offset=20, base=SOME esp, index=NoIndex, opSize=OpSize32}, 700 (* Allocate stack space. *) 701 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=OpSize32}, 702 (* Move the function address in eax into esi since that's a callee-save register. *) 703 Move{source=RegisterArg eax, destination=RegisterArg esi, moveSize=Move32} 704 ] @ copyArgsFromStack @ 705 [ 706 (* Get the value for ebp. *) 707 Move{source=AddressConstArg getThreadDataCall, destination=RegisterArg ecx, moveSize=Move32}, 708 CallAddress(MemoryArg{base=ecx, offset=0, index=NoIndex}), (* Get the address - N.B. Heap addr in 32-in-64. *) 709 moveRR{source=eax, output=ebp, opSize=OpSize32}, 710 (* Save the address of the argument and result area. *) 711 moveRR{source=esp, output=ecx, opSize=OpSize32}, 712 (* Switch to the ML stack. *) 713 storeMemory(esp, ebp, memRegCStackPtr, OpSize32), 714 loadMemory(esp, ebp, memRegStackPtr, OpSize32), 715 (* Move esi into the closure register edx *) 716 Move{source=RegisterArg esi, destination=RegisterArg edx, moveSize=Move32} 717 ] @ boxRegAsSysWord(ecx, eax, []) @ 718 ( 719 (* If we're returning a struct the address for the result will have been passed in the 720 first argument. We use that as the result area. Otherwise point to the result 721 area on the stack. *) 722 if resultStructByRef 723 then Move{source=MemoryArg {offset=0, base=ebx, index=NoIndex}, destination=RegisterArg ecx, moveSize=Move32} 724 else ArithToGenReg{opc=ADD, output=ecx, source=NonAddressConstArg(Word.toLargeInt resultOffset), opSize=OpSize32} 725 ) :: boxRegAsSysWord(ecx, ebx, [eax]) @ 726 [ 727 (* Call the ML function using the full closure call. *) 728 CallAddress(MemoryArg{offset=0, base=edx, index=NoIndex}), 729 (* Save the ML stack pointer because we may have grown the stack. Switch to the C stack. *) 730 storeMemory(esp, ebp, memRegStackPtr, OpSize32), 731 loadMemory(esp, ebp, memRegCStackPtr, OpSize32) 732 ] @ loadResults @ 733 [ 734 (* Remove the stack space. *) 735 ArithToGenReg{opc=ADD, output=esp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=OpSize32}, 736 PopR esi, PopR edi, PopR ebx, PopR ebp (* Restore callee-save registers. *) 737 ] @ 738 ( 739 (* If we've passed in the address of the area for the result structure 740 we're supposed to pass that back in eax. *) 741 if resultStructByRef 742 then [loadMemory(eax, esp, 4, OpSize32)] 743 else [] 744 ) @ 745 [ 746 (* Callee removes arguments in StdCall. *) 747 ReturnFromFunction (if abi = FFI_STDCALL then argCount else 0) 748 ] 749 end 750 751 local (* Windows on X64. *) 752 val win64ArgRegs = [ (rcx, xmm0), (rdx, xmm1), (r8, xmm2), (r9, xmm3) ] 753 in 754 fun callWindows64Bits(args, result) = 755 let 756 val extraStackReg = r10 (* Not used for any arguments. *) 757 758 fun loadWin64Args([], stackOffset, _, _, code, extraStack, preCode) = (code, stackOffset, preCode, extraStack) 759 760 | loadWin64Args(arg::args, stackOffset, argOffset, regs, code, extraStack, preCode) = 761 let 762 val {size, align, typeForm, ...} = arg 763 val newArgOffset = alignUp(argOffset, align) 764 val baseAddr = {base=mlArg2Reg, offset=Word.toInt newArgOffset, index=NoIndex} 765 val workReg = rcx (* rcx: the last to be loaded. *) 766 767 (* Integer arguments. *) 768 fun loadIntArg moveOp = 769 case regs of 770 (areg, _) :: regs' => 771 loadWin64Args(args, stackOffset, newArgOffset+size, regs', 772 Move{source=MemoryArg baseAddr, destination=RegisterArg areg, moveSize=moveOp } :: code, 773 extraStack, preCode) 774 | [] => 775 loadWin64Args(args, stackOffset+8, newArgOffset+size, [], 776 if size = 0w8 777 then PushToStack(MemoryArg baseAddr) :: code 778 else (* Need to load it into a register first. *) 779 Move{source=MemoryArg baseAddr, 780 destination=RegisterArg workReg, moveSize=moveOp } :: PushToStack(RegisterArg workReg) :: code, 781 extraStack, preCode) 782 in 783 (* Structs of 1, 2, 4 and 8 bytes are passed as the corresponding int. It may not 784 be necessary to sign-extend 1, 2 or 4-byte values. 785 2, 4 or 8-byte structs may not be aligned onto the appropriate boundary but 786 it should still work. *) 787 case (size, typeForm) of 788 (0w1, CTypeSignedInt) => (* Signed char. *) loadIntArg Move8X64 789 | (0w1, _) => (* Unsigned char or single byte struct *) loadIntArg Move8 790 791 | (0w2, CTypeSignedInt) =>(* Signed 16-bits. *) loadIntArg Move16X64 792 | (0w2, _) => (* Unsigned 16-bits. *) loadIntArg Move16 793 794 | (0w4, CTypeFloatingPt) => 795 ( 796 case regs of 797 (_, fpReg) :: regs' => 798 loadWin64Args(args, stackOffset, newArgOffset+size, regs', 799 XMMArith{opc=SSE2MoveFloat, source=MemoryArg baseAddr, output=fpReg } :: code, extraStack, preCode) 800 | [] => 801 loadWin64Args(args, stackOffset+8, newArgOffset+size, [], 802 Move{source=MemoryArg baseAddr, 803 destination=RegisterArg workReg, moveSize=Move32 } :: PushToStack(RegisterArg workReg) :: code, 804 extraStack, preCode) 805 ) 806 | (0w4, CTypeSignedInt) => (* Signed 32-bits. *) loadIntArg Move32X64 807 | (0w4, _) => (* Unsigned 32-bits. *) loadIntArg Move32 808 809 | (0w8, CTypeFloatingPt) => 810 ( 811 case regs of 812 (_, fpReg) :: regs' => 813 loadWin64Args(args, stackOffset, newArgOffset+size, regs', 814 XMMArith{opc=SSE2MoveDouble, source=MemoryArg baseAddr, output=fpReg } :: code, extraStack, preCode) 815 | [] => 816 loadWin64Args(args, stackOffset+8, newArgOffset+size, [], 817 Move{source=MemoryArg baseAddr, 818 destination=RegisterArg workReg, moveSize=Move64 } :: PushToStack(RegisterArg workReg) :: code, 819 extraStack, preCode) 820 ) 821 | (0w8, _) => (* 64-bits. *) loadIntArg Move64 822 823 | (_, CTypeStruct _) => 824 let 825 (* Structures of other sizes are passed by reference. They are first 826 copied into new areas on the stack. This ensures that the called function 827 can update the structure without changing the original values. *) 828 val newExtra = intAlignUp(extraStack + Word.toInt size, 0w16) 829 val newPreCode = 830 moveMemory{source=(mlArg2Reg, Word.toInt newArgOffset), destination=(extraStackReg, extraStack), 831 length=Word.toInt size} @ preCode 832 in 833 case regs of 834 (areg, _) :: regs' => 835 loadWin64Args(args, stackOffset, newArgOffset+size, regs', 836 loadAddress{source=(extraStackReg, extraStack), destination=areg} :: code, 837 newExtra, newPreCode) 838 | [] => 839 loadWin64Args(args, stackOffset+8, newArgOffset+size, [], 840 loadAddress{source=(extraStackReg, extraStack), destination=workReg} :: 841 PushToStack(RegisterArg workReg) :: code, newExtra, newPreCode) 842 end 843 844 | _ => raise Foreign.Foreign "Unrecognised type for function argument" 845 end 846 847 val {typeForm, size, ...} = result 848 849 val resultAreaPtr = r12 (* Saved value of r8 - This is callee save. *) 850 val resultMemory = {base=resultAreaPtr, offset=0, index=NoIndex} 851 fun storeIntValue moveOp = 852 ([Move{source=RegisterArg eax, destination=MemoryArg resultMemory, moveSize=moveOp}], false) 853 and storeFloatValue precision = 854 ([XMMStoreToMemory{toStore=xmm0, address=resultMemory, precision=precision}], false) 855 856 val (getResult, passStructAddress) = 857 case (typeForm, size) of 858 (CTypeVoid, _) => ([], false) 859 | (_, 0w1) (* Includes structs *) => (* Single byte *) storeIntValue Move8 860 | (_, 0w2) => (* 16-bits *) storeIntValue Move16 861 | (CTypeFloatingPt, 0w4) => storeFloatValue SinglePrecision 862 | (_, 0w4) => storeIntValue Move32 863 | (CTypeFloatingPt, 0w8) => storeFloatValue DoublePrecision 864 | (_, 0w8) => storeIntValue Move64 865 | (CTypeStruct _, _) => ([], true) 866 | _ => raise Foreign.Foreign "Unrecognised result type" 867 868 (* argCode is the code to load and push the arguments. argStack is the amount of stack space 869 the arguments will take. It's only used to ensure that the stack is aligned onto a 16-byte 870 boundary. preArgCode is any code that is needed to copy the arguments before they are 871 actually loaded. Because it is done before the argument registers are loaded it can 872 use rcx, rdi and rsi. extraStack is local stack space needed. It is usually zero but 873 if it is non-zero it must be a multiple of 16 bytes. The address of this area is loaded 874 into r10 before preArgCode is called. *) 875 val (argCode, argStack, preArgCode, extraStack) = 876 if passStructAddress 877 then (* The address of the result structure goes in the first argument register: rcx *) 878 loadWin64Args(args, 0, 0w0, tl win64ArgRegs, 879 [moveRR{source=resultAreaPtr, output=rcx, opSize=nativeWordOpSize}], 0, []) 880 else loadWin64Args(args, 0, 0w0, win64ArgRegs, [], 0, []) 881 882 local 883 val align = argStack mod 16 884 in 885 (* Always align the stack. *) 886 val preArgAlign = if align = 0 then 0 else 16-align 887 (* The total space on the stack that needs to be removed at the end. *) 888 val postCallStackReset = argStack + preArgAlign + extraStack + 32 889 end 890 891 in 892 (* Save heap ptr. Needed in case we have a callback. *) 893 [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] @ 894 ( 895 (* Put the destination address into a callee save resgister. 896 We have to put the C address in there now because an ML address wouldn't be updated 897 by a possible GC in a callback. *) 898 if #typeForm( result) <> CTypeVoid 899 then [loadHeapMemory(resultAreaPtr, r8, 0, nativeWordOpSize)] 900 else [] 901 ) @ 902 [ 903 (* Save the stack pointer. *) 904 storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) 905 loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize) (* Load the saved C stack pointer. *) 906 ] @ 907 ( 908 if extraStack = 0 909 then [] 910 else 911 [ 912 ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt extraStack), opSize=nativeWordOpSize}, 913 Move{source=RegisterArg rsp, destination=RegisterArg extraStackReg, moveSize=Move64} 914 ] 915 ) @ 916 ( 917 if preArgAlign = 0 918 then [] 919 else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}] 920 ) @ 921 ( 922 (* The second argument is a SysWord containing the address of a malloced area of memory 923 with the actual arguments in it. *) 924 if null args 925 then [] 926 else [loadHeapMemory(mlArg2Reg, mlArg2Reg, 0, nativeWordOpSize)] 927 ) @ preArgCode @ argCode @ 928 [ (* Reserve a 32-byte area after the arguments. This is specific to the Windows ABI. *) 929 ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt 32), opSize=nativeWordOpSize}, 930 let 931 (* The entry point is in a SysWord.word value in RAX. *) 932 val entryPoint = 933 case targetArch of 934 ObjectId32Bit => MemoryArg{base=ebx, offset=0, index=Index4 eax} 935 | _ => MemoryArg{base=eax, offset=0, index=NoIndex} 936 in 937 (* Call the function. We're discarding the value in rsp so no need to remove args. *) 938 CallAddress entryPoint 939 end, 940 (* Restore the C stack value in case it's been changed by a callback. *) 941 ArithToGenReg{opc=ADD, output=rsp, source=NonAddressConstArg(LargeInt.fromInt postCallStackReset), opSize=nativeWordOpSize}, 942 storeMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize), 943 loadMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize), (* Restore the ML stack pointer. *) 944 (* Reload the heap pointer. If we've called back to ML this could well have changed. *) 945 loadMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize) 946 ] @ (* Store the result in the destination. *) getResult @ [ReturnFromFunction 0 ] 947 end (* callWindows64Bits *) 948 949 fun closureWindows64Bits(args, result) = 950 let 951 val {typeForm, size, align, ...} = result 952 953 (* Struct results are normally passed by reference. *) 954 val resultStructByRef = (* If true we've copied rcx (the first arg) into r9 *) 955 (case typeForm of CTypeStruct _ => true | _ => false) andalso 956 size <> 0w1 andalso size <> 0w2 andalso size <> 0w4 andalso size <> 0w8 957 958 (* Store the register arguments and copy everything else into the argument structure on the stack. 959 The code is ordered so that the early arguments are stored first. *) 960 fun copyWin64Args([], _, _, _) = [] 961 962 | copyWin64Args(arg::args, nStackArgs, argOffset, regs) = 963 let 964 val {size, align, typeForm, ...} = arg 965 val newArgOffset = alignUp(argOffset, align) 966 val destAddr = {base=rsp, offset=Word.toInt newArgOffset, index=NoIndex} 967 968 (* Integer arguments. *) 969 fun moveIntArg moveOp = 970 case regs of 971 (areg, _) :: regs' => 972 Move{source=RegisterArg areg, destination=MemoryArg destAddr, moveSize=moveOp } :: 973 copyWin64Args(args, nStackArgs, newArgOffset+size, regs') 974 | [] => 975 Move{source=MemoryArg {base=r10, offset=nStackArgs*8, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64} :: 976 Move{source=RegisterArg rax, destination=MemoryArg destAddr, moveSize=moveOp} :: 977 copyWin64Args(args, nStackArgs+1, newArgOffset+size, []) 978 in 979 (* Structs of 1, 2, 4 and 8 bytes are passed as the corresponding int. *) 980 case (typeForm, size) of 981 (_, 0w1) => moveIntArg Move8 982 | (_, 0w2) => moveIntArg Move16 983 984 | (CTypeFloatingPt, 0w4) => 985 ( 986 case regs of 987 (_, fpReg) :: regs' => 988 XMMStoreToMemory{ toStore=fpReg, address=destAddr, precision=SinglePrecision} :: 989 copyWin64Args(args, nStackArgs, newArgOffset+size, regs') 990 | [] => moveIntArg Move32 991 ) 992 | (_, 0w4) => (* 32-bits *) moveIntArg Move32 993 994 | (CTypeFloatingPt, 0w8) => 995 ( 996 case regs of 997 (_, fpReg) :: regs' => 998 XMMStoreToMemory{ toStore=fpReg, address=destAddr, precision=DoublePrecision} :: 999 copyWin64Args(args, nStackArgs, newArgOffset+size, regs') 1000 | [] => moveIntArg Move64 1001 ) 1002 | (_, 0w8) => (* 64-bits. *) moveIntArg Move64 1003 1004 | (CTypeStruct _, _) => 1005 (* Structures of other size are passed by reference. We need to copy the source 1006 structure into our stack area. Since rsi and rdi aren't used as args and 1007 rcx is only used for the first argument we can copy the argument now. *) 1008 ( 1009 case regs of 1010 (arg, _) :: regs' => 1011 moveMemory{source=(arg, 0), destination=(rsp, Word.toInt newArgOffset), length=Word.toInt size} @ 1012 copyWin64Args(args, nStackArgs, newArgOffset+size, regs') 1013 | [] => 1014 moveMemory{source=(r10, nStackArgs*8), destination=(rsp, Word.toInt newArgOffset), length=Word.toInt size} @ 1015 copyWin64Args(args, nStackArgs+1, newArgOffset+size, []) 1016 ) 1017 1018 | _ => raise Foreign.Foreign "Unrecognised type for function argument" 1019 end 1020 1021 val copyArgsFromRegsAndStack = 1022 if resultStructByRef 1023 then (* If we're returning a struct by reference we copy the address into r9 and pass that 1024 as the result address. *) 1025 Move{source=RegisterArg rcx, destination=RegisterArg r9, moveSize=Move64} :: 1026 copyWin64Args(args, 0, 0w0, tl win64ArgRegs) 1027 else copyWin64Args(args, 0, 0w0, win64ArgRegs) 1028 1029 local 1030 fun getNextSize (arg, argOffset) = 1031 let val {size, align, ...} = arg in alignUp(argOffset, align) + size end 1032 in 1033 val argumentSpace = List.foldl getNextSize 0w0 args 1034 end 1035 1036 val resultOffset = alignUp(argumentSpace, align) (* Offset of result area *) 1037 1038 val (loadResults, resultSize) = 1039 if typeForm = CTypeVoid orelse resultStructByRef 1040 then ([], 0w0) 1041 else 1042 let 1043 val resultMem = {base=rsp, offset=Word.toInt resultOffset, index=NoIndex} 1044 val resultCode = 1045 case (typeForm, size) of 1046 (CTypeSignedInt, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move8X64}] 1047 | (_, 0w1) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move8}] 1048 | (CTypeSignedInt, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move16X64}] 1049 | (_, 0w2) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move16}] 1050 | (CTypeFloatingPt, 0w4) => [XMMArith{opc=SSE2MoveFloat, source=MemoryArg resultMem, output=xmm0}] 1051 | (CTypeSignedInt, 0w4) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move32X64}] 1052 | (_, 0w4) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move32}] 1053 | (CTypeFloatingPt, 0w8) => [XMMArith{opc=SSE2MoveDouble, source=MemoryArg resultMem, output=xmm0}] 1054 | (_, 0w8) => [Move{source=MemoryArg resultMem, destination=RegisterArg rax, moveSize=Move64}] 1055 | _ => raise Foreign.Foreign "Unrecognised result type" 1056 in 1057 (resultCode, size) 1058 end 1059 1060 (* Stack space. The stack must be 16 byte aligned. We've pushed 8 regs and a return address 1061 so add a further 8 bytes to bring it back into alignment. If we're returning a struct 1062 by reference, though, we've pushed 9 regs so don't add 8. *) 1063 val stackToAllocate = 1064 Word.toInt(alignUp(resultOffset + resultSize, 0w16)) + (if resultStructByRef then 0 else 8) 1065 in 1066 [ 1067 (* Push callee-save registers. *) 1068 PushToStack(RegisterArg rbp), PushToStack(RegisterArg rbx), PushToStack(RegisterArg r12), PushToStack(RegisterArg r13), 1069 PushToStack(RegisterArg r14), PushToStack(RegisterArg r15), PushToStack(RegisterArg rdi), PushToStack(RegisterArg rsi) 1070 ] @ 1071 ( 1072 (* If we're returning a struct by reference we have to return the address in rax even though 1073 it's been set by the caller. Save this address. *) 1074 if resultStructByRef 1075 then [PushToStack(RegisterArg rcx)] 1076 else [] 1077 ) @ 1078 [ 1079 (* Set r10 to point to the original stack args if any. This is beyond the pushed regs and also the 32-byte area. *) 1080 LoadAddress{ output=r10, offset=if resultStructByRef then 112 else 104, base=SOME rsp, index=NoIndex, opSize=nativeWordOpSize}, 1081 (* Allocate stack space. *) 1082 ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize}, 1083 (* Move the function we're calling, in rax, into r13, a callee-save register *) 1084 moveRR{source=rax, output=r13, opSize=polyWordOpSize} 1085 ] 1086 @ copyArgsFromRegsAndStack @ 1087 [ 1088 (* Get the value for rbp. *) 1089 (* This is a problem for 32-in-64. The value of getThreadDataCall is an object ID but rbx may well no 1090 longer hold the heap base address. We use a special inline constant to hold the full 64-bit address. *) 1091 LoadAbsolute{value=getThreadDataCall, destination=rcx}, 1092 CallAddress(MemoryArg{base=rcx, offset=0, index=NoIndex}), 1093 moveRR{source=rax, output=rbp, opSize=nativeWordOpSize}, 1094 (* Save the address of the argument and result area. *) 1095 moveRR{source=rsp, output=rcx, opSize=nativeWordOpSize}, 1096 (* Switch to the ML stack. *) 1097 storeMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize), 1098 loadMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize), 1099 (* Load the ML heap pointer. *) 1100 loadMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize), 1101 (* Now move the function closure into the closure register ready for the call. *) 1102 moveRR{source=r13, output=rdx, opSize=polyWordOpSize} 1103 ] @ 1104 (* Reload the heap base address in 32-in-64. *) 1105 ( if targetArch = ObjectId32Bit then [loadMemory(rbx, rbp, memRegSavedRbx, nativeWordOpSize)] else [] ) 1106 @ boxRegAsSysWord(rcx, rax, []) @ 1107 ( 1108 (* If we're returning a struct by reference the address for the result will have been passed in the 1109 first argument. We use that as the result area. Otherwise point to the result area on the stack. *) 1110 if resultStructByRef 1111 then loadMemory(rcx, r10, ~112, nativeWordOpSize) 1112 else ArithToGenReg{opc=ADD, output=rcx, source=NonAddressConstArg(Word.toLargeInt resultOffset), opSize=nativeWordOpSize} 1113 ) :: boxRegAsSysWord(rcx, mlArg2Reg, [rax]) @ 1114 [ 1115 (* Call the ML function using the full closure call. *) 1116 CallAddress( 1117 if targetArch = ObjectId32Bit 1118 then MemoryArg{base=rbx, index=Index4 rdx, offset=0} 1119 else MemoryArg{base=rdx, index=NoIndex, offset=0}), 1120 (* Save the ML stack pointer because we may have grown the stack. Switch to the C stack. *) 1121 storeMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize), 1122 loadMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize), 1123 storeMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize) 1124 ] @ loadResults @ 1125 [ 1126 (* Remove the stack space. *) 1127 ArithToGenReg{opc=ADD, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize} 1128 ] @ 1129 ( if resultStructByRef then [PopR rax] else [] ) @ 1130 [ 1131 PopR rsi, PopR rdi, PopR r15, PopR r14, PopR r13, PopR r12, PopR rbx, PopR rbp, (* Restore callee-save registers. *) 1132 ReturnFromFunction 0 (* Caller removes any stack arguments. *) 1133 ] 1134 end 1135 end 1136 1137 local 1138 (* The rules for passing structs in SysV on X86/64 are complicated but most of the special 1139 cases don't apply. We don't support floating point larger than 8 bytes, packed structures 1140 or C++ constructors. It then reduces to the following: 1141 Structures of up to 8 bytes are passed in a single register and of 8-16 bytes in two 1142 registers. Larger structures are passed on the stack. The question is whether to use 1143 general registers or SSE2 registers. Each 8 byte chunk is considered independently after 1144 any internal structs have been unwrapped. Each chunk will consist of either a single 1145 8-byte value (i.e.. a pointer, int64_t or a double) or one or more smaller values and 1146 possibly some padding. An SSE2 register is used if the value is a double, two floats 1147 or a single float and padding. Otherwise it must have at least one shorter int-like 1148 type (e.g. int, char, short etc) in which case a general register is used. That 1149 applies even if it also contains a float. If, having selected the kind of 1150 registers to be used, there are not enough for the whole struct it is passed 1151 on the stack. 1152 1153 We don't really need this for simple arguments but it's easier to consider 1154 them all together. *) 1155 datatype argClass = ArgInMemory | ArgInRegs of { firstInSSE: bool, secondInSSE: bool } 1156 1157 fun classifyArg arg = 1158 let 1159 val {size, ...} = arg 1160 1161 (* Unwrap the struct and any internal structs. *) 1162 fun getFields([], _) = [] 1163 1164 | getFields(field::fields, offset) = 1165 let 1166 val {size, align, typeForm} = field 1167 val alignedOffset = alignUp(offset, align) (* Align this even if it's a sub-struct *) 1168 in 1169 case typeForm of 1170 CTypeVoid => raise Foreign.Foreign "Void cannot be used for a function argument" 1171 | CTypeStruct elements => 1172 getFields(elements, alignedOffset) @ getFields(fields, alignedOffset+size) 1173 | _ => (typeForm, alignedOffset) :: getFields(fields, alignedOffset+size) 1174 end 1175 1176 val isSSE = 1177 List.all (fn (CTypeFloatingPt, _) => true | _ => false) 1178 in 1179 if size > 0w16 1180 then ArgInMemory 1181 else 1182 let 1183 val fieldsAndOffsets = getFields([arg], 0w0) 1184 in 1185 if size <= 0w8 (* Only the first register will be used. *) 1186 then ArgInRegs{firstInSSE=isSSE fieldsAndOffsets, secondInSSE=false} 1187 else 1188 let 1189 val (first8Bytes, second8Bytes) = 1190 List.partition (fn (_, off) => off <= 0w8) fieldsAndOffsets 1191 in 1192 ArgInRegs{firstInSSE=isSSE first8Bytes, secondInSSE=isSSE second8Bytes} 1193 end 1194 end 1195 end 1196 1197 val sysVGenRegs = [rdi, rsi, rdx, rcx, r8, r9] 1198 and sysVFPRegs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7] 1199 1200 (* Store a register into upto 8 bytes. Most values will involve a single store but odd-sized 1201 structs can require shifts and multiple stores. N.B. May modify the source register. *) 1202 fun storeUpTo8(reg, base, offset, size) = 1203 let 1204 val moveOp = 1205 if size = 0w8 then Move64 else if size >= 0w4 then Move32 else if size >= 0w2 then Move16 else Move8 1206 in 1207 [Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset, index=NoIndex}, moveSize=moveOp}] 1208 end @ 1209 ( 1210 if size = 0w6 orelse size = 0w7 1211 then 1212 [ 1213 ShiftConstant{ shiftType=SHR, output=reg, shift=0w32, opSize=OpSize64 }, 1214 Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset+4, index=NoIndex}, moveSize=Move16} 1215 ] 1216 else [] 1217 ) @ 1218 ( 1219 if size = 0w3 orelse size = 0w5 orelse size = 0w7 1220 then 1221 [ 1222 ShiftConstant{ shiftType=SHR, output=reg, shift=Word8.fromLargeWord(Word.toLargeWord((size-0w1)*0w8)), opSize=OpSize64 }, 1223 Move{source=RegisterArg reg, destination=MemoryArg {base=base, offset=offset+Word.toInt(size-0w1), index=NoIndex}, moveSize=Move8} 1224 ] 1225 else [] 1226 ) 1227 1228 in 1229 fun callUnix64Bits(args, result) = 1230 let 1231 val argWorkReg = r10 (* Not used for any arguments. *) 1232 val resultAreaPtr = r12 (* Saved value of r8 - This is callee save. *) 1233 val argPtrReg = r11 (* Pointer to argument area - Can't use mlArg2Reg since that's RSI on 32-in-64. *) 1234 1235 fun loadSysV64Args([], stackOffset, _, _, _, code, preCode) = (code, stackOffset, preCode) 1236 1237 | loadSysV64Args(arg::args, stackOffset, argOffset, gRegs, fpRegs, code, preCode) = 1238 let 1239 val {size, align, typeForm, ...} = arg 1240 1241 (* Load a value into a register. Normally the size will be 1, 2, 4 or 8 bytes and 1242 this will just involve a simple load. Structs, though, can be of any size up 1243 to 8 bytes. *) 1244 fun loadRegister(reg, offset, size) = 1245 let 1246 (* We don't necessarily have to sign-extend. There's a comment in libffi that 1247 suggests that LVM expects it even though the SysV ABI doesn't require it. *) 1248 val moveOp = 1249 if size = 0w8 1250 then Move64 1251 else if typeForm = CTypeSignedInt andalso size = 0w4 1252 then Move32X64 1253 else if size >= 0w4 1254 then Move32 1255 else if typeForm = CTypeSignedInt andalso size = 0w2 1256 then Move16X64 1257 else if size >= 0w2 1258 then Move16 1259 else if typeForm = CTypeSignedInt andalso size = 0w1 1260 then Move8X64 else Move8 1261 in 1262 [Move{source=MemoryArg{base=argPtrReg, offset=Word.toInt offset, index=NoIndex}, destination=RegisterArg reg, moveSize=moveOp}] 1263 end @ 1264 ( 1265 if size = 0w6 orelse size = 0w7 1266 then 1267 [ 1268 Move{source=MemoryArg{base=argPtrReg, offset=Word.toInt offset + 4, index=NoIndex}, 1269 destination=RegisterArg argWorkReg, moveSize=Move16}, 1270 ShiftConstant{ shiftType=SHL, output=argWorkReg, shift=0w32, opSize=OpSize64 }, 1271 ArithToGenReg{ opc=OR, output=reg, source=RegisterArg argWorkReg, opSize=OpSize64 } 1272 ] 1273 else [] 1274 ) @ 1275 ( 1276 if size = 0w3 orelse size = 0w5 orelse size = 0w7 1277 then 1278 [ 1279 Move{source=MemoryArg{base=argPtrReg, offset=Word.toInt offset + Word.toInt(size-0w1), index=NoIndex}, 1280 destination=RegisterArg argWorkReg, moveSize=Move8}, 1281 ShiftConstant{ shiftType=SHL, output=argWorkReg, shift=Word8.fromLargeWord(Word.toLargeWord((size-0w1)*0w8)), opSize=OpSize64 }, 1282 ArithToGenReg{ opc=OR, output=reg, source=RegisterArg argWorkReg, opSize=OpSize64 } 1283 ] 1284 else [] 1285 ) 1286 1287 val newArgOffset = alignUp(argOffset, align) 1288 val word1Addr = {base=argPtrReg, offset=Word.toInt newArgOffset, index=NoIndex} 1289 val word2Addr = {base=argPtrReg, offset=Word.toInt newArgOffset + 8, index=NoIndex} 1290 in 1291 case (classifyArg arg, size > 0w8, gRegs, fpRegs) of 1292 (* 8 bytes or smaller - single general reg. This is the usual case. *) 1293 (ArgInRegs{firstInSSE=false, ...}, false, gReg :: gRegs', fpRegs') => 1294 loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', 1295 loadRegister(gReg, newArgOffset, size) @ code, preCode) 1296 1297 (* 8 bytes or smaller - single SSE reg. Usual case for real arguments. *) 1298 | (ArgInRegs{firstInSSE=true, ...}, false, gRegs', fpReg :: fpRegs') => 1299 loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', 1300 XMMArith{opc=if size = 0w4 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg } :: code, 1301 preCode) 1302 1303 (* 9-16 bytes - both values in general regs. *) 1304 | (ArgInRegs{firstInSSE=false, secondInSSE=false}, true, gReg1 :: gReg2 :: gRegs', fpRegs') => 1305 loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', 1306 Move{source=MemoryArg word1Addr, destination=RegisterArg gReg1, moveSize=Move64} :: 1307 loadRegister(gReg2, newArgOffset+0w8, size-0w8) @ code, preCode) 1308 1309 (* 9-16 bytes - first in general, second in SSE. *) 1310 | (ArgInRegs{firstInSSE=false, secondInSSE=true}, true, gReg :: gRegs', fpReg :: fpRegs') => 1311 loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', 1312 Move{source=MemoryArg word1Addr, destination=RegisterArg gReg, moveSize=Move64} :: 1313 XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble, source=MemoryArg word2Addr, output=fpReg } :: code, 1314 preCode) 1315 1316 (* 9-16 bytes - first in SSE, second in general. *) 1317 | (ArgInRegs{firstInSSE=true, secondInSSE=false}, true, gReg :: gRegs', fpReg :: fpRegs') => 1318 loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', 1319 XMMArith{opc=SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg } :: 1320 loadRegister(gReg, newArgOffset+0w8, size-0w8) @ code, 1321 preCode) 1322 1323 | (* 9-16 bytes - both values in SSE regs. *) 1324 (ArgInRegs{firstInSSE=true, secondInSSE=true}, true, gRegs', fpReg1 :: fpReg2 :: fpRegs') => 1325 loadSysV64Args(args, stackOffset, newArgOffset+size, gRegs', fpRegs', 1326 XMMArith{opc=SSE2MoveDouble, source=MemoryArg word1Addr, output=fpReg1 } :: 1327 XMMArith{opc=if size = 0w12 then SSE2MoveFloat else SSE2MoveDouble, 1328 source=MemoryArg word2Addr, output=fpReg2 } :: code, 1329 preCode) 1330 1331 | (_, _, gRegs', fpRegs') => (* Either larger than 16 bytes or we've run out of the right kind of registers. *) 1332 (* Move the argument in the preCode. It's possible a large struct could be the first argument 1333 and if we left it until the end RDI and RSI would already have been loaded. 1334 Structs are passed by value on the stack not, as in Win64, by reference. *) 1335 let 1336 val space = intAlignUp(Word.toInt size, 0w8) 1337 in 1338 loadSysV64Args(args, stackOffset+space, newArgOffset+size, gRegs', fpRegs', code, 1339 ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt space), opSize=nativeWordOpSize} :: 1340 moveMemory{source=(argPtrReg, Word.toInt newArgOffset), destination=(rsp, 0), length=Word.toInt size} @ preCode) 1341 end 1342 end 1343 1344 (* The rules for returning structs are similar to those for parameters. *) 1345 local 1346 (* Store a result register into the result area. In almost all cases 1347 this is very simple: the only complication is with structs of odd sizes. *) 1348 fun storeResult(reg, offset, size) = storeUpTo8(reg, resultAreaPtr, offset, size) 1349 1350 val {size, typeForm, ...} = result 1351 in 1352 val (getResult, passArgAddress) = 1353 if typeForm = CTypeVoid 1354 then ([], false) 1355 else case (classifyArg result, size > 0w8) of 1356 (* 8 bytes or smaller - returned in RAX - Normal case for int-like results. *) 1357 (ArgInRegs{firstInSSE=false, ...}, false) => (storeResult(rax, 0, size), false) 1358 1359 (* 8 bytes or smaller - returned in XMM0 - Normal case for real results. *) 1360 | (ArgInRegs{firstInSSE=true, ...}, false) => 1361 ([XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=0, index=NoIndex}, 1362 precision=if size = 0w4 then SinglePrecision else DoublePrecision}], false) 1363 1364 (* 9-16 bytes - returned in RAX/RDX. *) 1365 | (ArgInRegs{firstInSSE=false, secondInSSE=false}, true) => 1366 (storeResult(rax, 0, 0w8) @ storeResult(rdx, 0, size-0w8), false) 1367 1368 (* 9-16 bytes - first in RAX, second in XMM0. *) 1369 | (ArgInRegs{firstInSSE=false, secondInSSE=true}, true) => 1370 (XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=8, index=NoIndex}, 1371 precision=if size = 0w12 then SinglePrecision else DoublePrecision} :: 1372 storeResult(rax, 0, 0w8), false) 1373 1374 (* 9-16 bytes - first in XMM0, second in RAX. *) 1375 | (ArgInRegs{firstInSSE=true, secondInSSE=false}, true) => 1376 (XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=0, index=NoIndex}, precision=DoublePrecision} :: 1377 storeResult(rax, 8, size-0w8), false) 1378 1379 (* 9-16 bytes - both values in SSE regs.*) 1380 | (ArgInRegs{firstInSSE=true, secondInSSE=true}, true) => 1381 ([XMMStoreToMemory{toStore=xmm0, address={base=resultAreaPtr, offset=0, index=NoIndex}, precision=DoublePrecision}, 1382 XMMStoreToMemory{toStore=xmm1, address={base=resultAreaPtr, offset=8, index=NoIndex}, 1383 precision=if size = 0w12 then SinglePrecision else DoublePrecision}], false) 1384 1385 | _ => ([], true) (* Have to pass the address of the area in memory *) 1386 end 1387 1388 val (argCode, argStack, preArgCode) = 1389 if passArgAddress (* If we have to pass the address of the result struct it goes in rdi. *) 1390 then loadSysV64Args(args, 0, 0w0, tl sysVGenRegs, sysVFPRegs, 1391 [moveRR{source=resultAreaPtr, output=rdi, opSize=nativeWordOpSize}], []) 1392 else loadSysV64Args(args, 0, 0w0, sysVGenRegs, sysVFPRegs, [], []) 1393 1394 local 1395 val align = argStack mod 16 1396 in 1397 (* Always align the stack. *) 1398 val preArgAlign = if align = 0 then 0 else 16-align 1399 end 1400 1401 in 1402 (* Save heap ptr. Needed in case we have a callback. *) 1403 [storeMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize)] @ 1404 ( 1405 (* Put the destination address into a callee save resgister. 1406 We have to put the C address in there now because an ML address wouldn't be updated 1407 by a possible GC in a callback. *) 1408 if #typeForm( result) <> CTypeVoid 1409 then [loadHeapMemory(resultAreaPtr, r8, 0, nativeWordOpSize)] 1410 else [] 1411 ) @ 1412 [ 1413 (* Save the stack pointer. *) 1414 storeMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Save ML stack and switch to C stack. *) 1415 loadMemory(esp, ebp, memRegCStackPtr, nativeWordOpSize) (* Load the saved C stack pointer. *) 1416 ] @ 1417 ( 1418 if preArgAlign = 0 1419 then [] 1420 else [ArithToGenReg{opc=SUB, output=esp, source=NonAddressConstArg(LargeInt.fromInt preArgAlign), opSize=nativeWordOpSize}] 1421 ) @ 1422 ( 1423 (* The second argument is a SysWord containing the address of a malloced area of memory 1424 with the actual arguments in it. *) 1425 if null args 1426 then [] 1427 else [loadHeapMemory(argPtrReg, mlArg2Reg, 0, nativeWordOpSize)] 1428 ) @ preArgCode @ argCode @ 1429 [ 1430 let 1431 (* The entry point is in a SysWord.word value in RAX. *) 1432 val entryPoint = 1433 case targetArch of 1434 ObjectId32Bit => MemoryArg{base=ebx, offset=0, index=Index4 eax} 1435 | _ => MemoryArg{base=eax, offset=0, index=NoIndex} 1436 in 1437 (* Call the function. We're discarding the value in rsp so no need to remove args. *) 1438 CallAddress entryPoint 1439 end, 1440 loadMemory(esp, ebp, memRegStackPtr, nativeWordOpSize), (* Restore the ML stack pointer. *) 1441 (* Reload the heap pointer. If we've called back to ML this could well have changed. *) 1442 loadMemory(r15, ebp, memRegLocalMPointer, nativeWordOpSize) 1443 ] @ (* Store the result in the destination. *) getResult @ [ ReturnFromFunction 0 ] 1444 end (* callUnix64Bits *) 1445 1446 fun closureUnix64Bits(args, result) = 1447 let 1448 fun moveSysV64Args([], _, _, _, _, moveFromStack) = moveFromStack 1449 1450 | moveSysV64Args(arg::args, stackSpace, argOffset, gRegs, fpRegs, moveFromStack) = 1451 let 1452 val {size, align, ...} = arg 1453 fun storeRegister(reg, offset, size) = storeUpTo8(reg, rsp, offset, size) 1454 val newArgOffset = alignUp(argOffset, align) 1455 val word1Addr = {base=rsp, offset=Word.toInt newArgOffset, index=NoIndex} 1456 val word2Addr = {base=rsp, offset=Word.toInt newArgOffset + 8, index=NoIndex} 1457 in 1458 case (classifyArg arg, size > 0w8, gRegs, fpRegs) of 1459 (* 8 bytes or smaller - single general reg. This is the usual case. *) 1460 (ArgInRegs{firstInSSE=false, ...}, false, gReg :: gRegs', fpRegs') => 1461 storeRegister(gReg, Word.toInt newArgOffset, size) @ 1462 moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack) 1463 1464 (* 8 bytes or smaller - single SSE reg. Usual case for real arguments. *) 1465 | (ArgInRegs{firstInSSE=true, ...}, false, gRegs', fpReg :: fpRegs') => 1466 XMMStoreToMemory{precision=if size = 0w4 then SinglePrecision else DoublePrecision, address=word1Addr, toStore=fpReg } :: 1467 moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack) 1468 1469 (* 9-16 bytes - both values in general regs. *) 1470 | (ArgInRegs{firstInSSE=false, secondInSSE=false}, true, gReg1 :: gReg2 :: gRegs', fpRegs') => 1471 Move{source=MemoryArg word1Addr, destination=RegisterArg gReg1, moveSize=Move64} :: 1472 storeRegister(gReg2, Word.toInt newArgOffset+8, size-0w8) @ 1473 moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack) 1474 1475 (* 9-16 bytes - first in general, second in SSE. *) 1476 | (ArgInRegs{firstInSSE=false, secondInSSE=true}, true, gReg :: gRegs', fpReg :: fpRegs') => 1477 Move{source=MemoryArg word1Addr, destination=RegisterArg gReg, moveSize=Move64} :: 1478 XMMStoreToMemory{precision=if size = 0w12 then SinglePrecision else DoublePrecision, address=word2Addr, toStore=fpReg } :: 1479 moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack) 1480 1481 (* 9-16 bytes - first in SSE, second in general. *) 1482 | (ArgInRegs{firstInSSE=true, secondInSSE=false}, true, gReg :: gRegs', fpReg :: fpRegs') => 1483 XMMStoreToMemory{precision=DoublePrecision, address=word1Addr, toStore=fpReg } :: 1484 storeRegister(gReg, Word.toInt newArgOffset+8, size-0w8) @ 1485 moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack) 1486 1487 | (* 9-16 bytes - both values in SSE regs. *) 1488 (ArgInRegs{firstInSSE=true, secondInSSE=true}, true, gRegs', fpReg1 :: fpReg2 :: fpRegs') => 1489 XMMStoreToMemory{precision=DoublePrecision, address=word1Addr, toStore=fpReg1 } :: 1490 XMMStoreToMemory{precision=if size = 0w12 then SinglePrecision else DoublePrecision, 1491 address=word2Addr, toStore=fpReg2 } :: 1492 moveSysV64Args(args, stackSpace, newArgOffset+size, gRegs', fpRegs', moveFromStack) 1493 1494 | (_, _, gRegs', fpRegs') => 1495 (* Either larger than 16 bytes or we've run out of the right kind of register. 1496 Structs larger than 16 bytes are passed by value on the stack. Move the 1497 argument after we've stored all the registers in particular rsi and rdi. *) 1498 let 1499 val space = intAlignUp(Word.toInt size, 0w8) 1500 in 1501 moveSysV64Args(args, stackSpace+space, newArgOffset+size, gRegs', fpRegs', 1502 moveMemory{source=(r10, stackSpace), destination=(rsp, Word.toInt newArgOffset), length=Word.toInt size} @ 1503 moveFromStack) 1504 end 1505 end 1506 1507 (* Result structs larger than 16 bytes are returned by reference. *) 1508 val resultStructByRef = #size ( result) > 0w16 1509 1510 val copyArgsFromRegsAndStack = 1511 if resultStructByRef (* rdi contains the address for the result. *) 1512 then moveSysV64Args(args, 0, 0w0, tl sysVGenRegs, sysVFPRegs, []) 1513 else moveSysV64Args(args, 0, 0w0, sysVGenRegs, sysVFPRegs, []) 1514 1515 local 1516 fun getNextSize (arg, argOffset) = 1517 let val {size, align, ...} = arg in alignUp(argOffset, align) + size end 1518 in 1519 val argumentSpace = List.foldl getNextSize 0w0 args 1520 end 1521 1522 (* Allocate a 16-byte area for any results returned in registers. This will not be used 1523 if the result is a structure larger than 16-bytes. *) 1524 val resultOffset = alignUp(argumentSpace, 0w8) 1525 (* Ensure the stack is 16 bytes aligned. We've pushed 6 regs and a return address 1526 so add a further 8 bytes to bring it back into alignment. If we're returning a struct 1527 by reference, though, we've pushed 7 regs so don't add 8. *) 1528 val stackToAllocate = 1529 Word.toInt(alignUp(resultOffset + 0w16, 0w16)) + (if resultStructByRef then 0 else 8) 1530 1531 (* The rules for returning structs are similar to those for parameters. *) 1532 local 1533 (* The result area is always 16 bytes wide so we can load the result without risking reading outside. 1534 At least at the moment we ignore any sign extension. *) 1535 val {size, typeForm, ...} = result 1536 val resultOffset = Word.toInt resultOffset 1537 in 1538 val loadResults = 1539 if typeForm = CTypeVoid 1540 then [] 1541 else case (classifyArg result, size > 0w8) of 1542 (* 8 bytes or smaller - returned in RAX - Normal case for int-like results. *) 1543 (ArgInRegs{firstInSSE=false, ...}, false) => 1544 [Move{source=MemoryArg {base=rsp, offset=resultOffset, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64}] 1545 1546 (* 8 bytes or smaller - returned in XMM0 - Normal case for real results. *) 1547 | (ArgInRegs{firstInSSE=true, ...}, false) => 1548 [XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset, index=NoIndex}, 1549 precision=if size = 0w4 then SinglePrecision else DoublePrecision}] 1550 1551 (* 9-16 bytes - returned in RAX/RDX. *) 1552 | (ArgInRegs{firstInSSE=false, secondInSSE=false}, true) => 1553 [Move{source=MemoryArg {base=rsp, offset=resultOffset, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64}, 1554 Move{source=MemoryArg {base=rsp, offset=resultOffset+8, index=NoIndex}, destination=RegisterArg rdx, moveSize=Move64}] 1555 1556 (* 9-16 bytes - first in RAX, second in XMM0. *) 1557 | (ArgInRegs{firstInSSE=false, secondInSSE=true}, true) => 1558 [Move{source=MemoryArg {base=rsp, offset=resultOffset, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64}, 1559 XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset+8, index=NoIndex}, 1560 precision=if size = 0w12 then SinglePrecision else DoublePrecision}] 1561 1562 (* 9-16 bytes - first in XMM0, second in RAX. *) 1563 | (ArgInRegs{firstInSSE=true, secondInSSE=false}, true) => 1564 [XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset, index=NoIndex}, precision=DoublePrecision}, 1565 Move{source=MemoryArg {base=rsp, offset=resultOffset+8, index=NoIndex}, destination=RegisterArg rax, moveSize=Move64}] 1566 1567 (* 9-16 bytes - both values in SSE regs.*) 1568 | (ArgInRegs{firstInSSE=true, secondInSSE=true}, true) => 1569 [XMMStoreToMemory{toStore=xmm0, address={base=rsp, offset=resultOffset, index=NoIndex}, precision=DoublePrecision}, 1570 XMMStoreToMemory{toStore=xmm1, address={base=rsp, offset=resultOffset+8, index=NoIndex}, 1571 precision=if size = 0w12 then SinglePrecision else DoublePrecision}] 1572 1573 | _ => [] (* Have to pass the address of the area in memory *) 1574 end 1575 in 1576 [ 1577 (* Push callee-save registers. *) 1578 PushToStack(RegisterArg rbp), PushToStack(RegisterArg rbx), PushToStack(RegisterArg r12), 1579 PushToStack(RegisterArg r13), PushToStack(RegisterArg r14), PushToStack(RegisterArg r15) 1580 ] @ 1581 ( 1582 (* If we're returning a struct by reference we have to return the address in rax even though 1583 it's been set by the caller. Save this address. *) 1584 if resultStructByRef 1585 then [PushToStack(RegisterArg rdi)] 1586 else [] 1587 ) @ 1588 [ 1589 (* Set r10 to point to the original stack args if any. *) 1590 LoadAddress{ output=r10, offset=if resultStructByRef then 64 else 56, base=SOME rsp, index=NoIndex, opSize=nativeWordOpSize}, 1591 (* Allocate stack space. *) 1592 ArithToGenReg{opc=SUB, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize}, 1593 (* Move the function we're calling, in rax, into r13, a callee-save register *) 1594 moveRR{source=rax, output=r13, opSize=polyWordOpSize} 1595 ] 1596 @ copyArgsFromRegsAndStack @ 1597 [ 1598 (* Get the value for rbp. This has to be an absolute address in 32-in-64. *) 1599 LoadAbsolute{value=getThreadDataCall, destination=rcx}, 1600 CallAddress(MemoryArg{base=rcx, offset=0, index=NoIndex}), 1601 moveRR{source=rax, output=rbp, opSize=nativeWordOpSize}, 1602 (* Save the address of the argument and result area. *) 1603 moveRR{source=rsp, output=rcx, opSize=nativeWordOpSize}, 1604 (* Switch to the ML stack. *) 1605 storeMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize), 1606 loadMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize), 1607 (* Load the ML heap pointer. *) 1608 loadMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize), 1609 (* Now move the function closure into the closure register ready for the call. *) 1610 moveRR{source=r13, output=rdx, opSize=polyWordOpSize} 1611 ] @ 1612 (* Reload the heap base address in 32-in-64. *) 1613 ( if targetArch = ObjectId32Bit then [loadMemory(rbx, rbp, memRegSavedRbx, nativeWordOpSize)] else [] ) 1614 @ boxRegAsSysWord(rcx, rax, []) @ 1615 ( 1616 (* If we're returning a struct by reference the address for the result will have been passed in the 1617 first argument. We use that as the result area. Otherwise point to the result area on the stack. *) 1618 if resultStructByRef 1619 then loadMemory(rcx, r10, ~64, nativeWordOpSize) 1620 else ArithToGenReg{opc=ADD, output=rcx, source=NonAddressConstArg(Word.toLargeInt resultOffset), opSize=nativeWordOpSize} 1621 ) :: boxRegAsSysWord(rcx, mlArg2Reg, [rax]) @ 1622 [ 1623 (* Call the ML function using the full closure call. *) 1624 CallAddress( 1625 if targetArch = ObjectId32Bit 1626 then MemoryArg{base=rbx, index=Index4 rdx, offset=0} 1627 else MemoryArg{base=rdx, index=NoIndex, offset=0}), 1628 (* Save the ML stack pointer because we may have grown the stack. Switch to the C stack. *) 1629 storeMemory(rsp, rbp, memRegStackPtr, nativeWordOpSize), 1630 loadMemory(rsp, rbp, memRegCStackPtr, nativeWordOpSize), 1631 storeMemory(r15, rbp, memRegLocalMPointer, nativeWordOpSize) 1632 ] @ loadResults @ 1633 [ 1634 (* Remove the stack space. *) 1635 ArithToGenReg{opc=ADD, output=rsp, source=NonAddressConstArg(LargeInt.fromInt stackToAllocate), opSize=nativeWordOpSize} 1636 ] @ 1637 ( if resultStructByRef then [PopR rax] else [] ) @ 1638 [ 1639 PopR r15, PopR r14, PopR r13, PopR r12, PopR rbx, PopR rbp, (* Restore callee-save registers. *) 1640 ReturnFromFunction 0 (* Caller removes any stack arguments. *) 1641 ] 1642 end 1643 end 1644 1645 fun foreignCall(abi: ffiABI, args: cType list, result: cType): Address.machineWord = 1646 let 1647 val code = 1648 case abi of 1649 FFI_UNIX64 => callUnix64Bits(args, result) 1650 | FFI_WIN64 => callWindows64Bits(args, result) 1651 | abi => call32Bits(abi, args, result) 1652 1653 val functionName = "foreignCall" 1654 val debugSwitches = 1655 [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), 1656 Universal.tagInject DEBUG.assemblyCodeTag true*)] 1657 val profileObject = createProfileObject functionName 1658 val newCode = codeCreate (functionName, profileObject, debugSwitches) 1659 val closure = makeConstantClosure() 1660 val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure} 1661 in 1662 closureAsAddress closure 1663 end 1664 1665 (* Build a callback function. The arguments are the abi, the list of argument types and the result type. 1666 The result is an ML function that takes an ML function, f, as its argument, registers it as a callback and 1667 returns the C function as its result. When the C function is called the arguments are copied into 1668 temporary memory and the vector passed to f along with the address of the memory for the result. 1669 "f" stores the result in it when it returns and the result is then passed back as the result of the 1670 callback. 1671 N.B. This returns a closure cell which contains the address of the code. It can be used as a 1672 SysWord.word value except that while it exists the code will not be GCed. *) 1673 fun buildCallBack(abi: ffiABI, args: cType list, result: cType): Address.machineWord = 1674 let 1675 val code = 1676 case abi of 1677 FFI_UNIX64 => closureUnix64Bits(args, result) 1678 | FFI_WIN64 => closureWindows64Bits(args, result) 1679 | abi => closure32Bits(abi, args, result) 1680 1681 val functionName = "foreignCallBack(2)" 1682 val debugSwitches = 1683 [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), 1684 Universal.tagInject DEBUG.assemblyCodeTag true*)] 1685 val profileObject = createProfileObject functionName 1686 val newCode = codeCreate (functionName, profileObject, debugSwitches) 1687 val closure = makeConstantClosure() 1688 val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure} 1689 val stage2Code = closureAsAddress closure 1690 1691 fun resultFunction f = 1692 let 1693 (* Generate a small function to load the address of f into rax/eax and then jump to stage2. 1694 The idea is that it should be possible to generate this eventually in a single RTS call. 1695 That could be done by using a version of this as a model. *) 1696 val codeAddress = 1697 (* In the native code versions we extract the code address from the closure. 1698 We don't do that in 32-in-64 and instead the RTS does it. *) 1699 if targetArch = ObjectId32Bit 1700 then stage2Code 1701 else Address.loadWord(Address.toAddress stage2Code, 0w0) 1702 val code = 1703 [ 1704 Move{source=AddressConstArg(Address.toMachineWord f), destination=RegisterArg rax, moveSize=opSizeToMove polyWordOpSize}, 1705 JumpAddress(AddressConstArg codeAddress) 1706 ] 1707 val functionName = "foreignCallBack(1)" 1708 val debugSwitches = 1709 [(*Universal.tagInject Pretty.compilerOutputTag (Pretty.prettyPrint(print, 70)), 1710 Universal.tagInject DEBUG.assemblyCodeTag true*)] 1711 val profileObject = createProfileObject functionName 1712 val newCode = codeCreate (functionName, profileObject, debugSwitches) 1713 val closure = makeConstantClosure() 1714 val () = X86OPTIMISE.generateCode{code=newCode, labelCount=0, ops=code, resultClosure=closure} 1715 val res = closureAsAddress closure 1716 (*val _ = print("Address is " ^ (LargeWord.toString(RunCall.unsafeCast res)) ^ "\n")*) 1717 in 1718 res 1719 end 1720 in 1721 Address.toMachineWord resultFunction 1722 end 1723 1724end; 1725