(*
    Copyright David C. J. Matthews 1989, 2000, 2009-10, 2012-13, 2015-20

    Based on original code:
        Copyright (c) 2000
        Cambridge University Technical Services Limited

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License version 2.1 as published by the Free Software Foundation.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*)

(*
    Title:      Code Generator Routines.
    Author:     Dave Matthews, Cambridge University Computer Laboratory
    Copyright   Cambridge University 1989
*)

(* This module contains the code vector and operations to insert code into
   it.  Each procedure is compiled into a separate segment.  Initially it is
   compiled into a fixed size segment, and then copied into a segment of the
   correct size at the end.
   This module contains all the definitions of the X86 opCodes and registers.
   It uses "codeseg" to create and operate on the segment itself. *)

functor X86OUTPUTCODE (
structure DEBUG: DEBUG
structure PRETTY: PRETTYSIG (* for compilerOutTag *)
structure CODE_ARRAY: CODEARRAYSIG

) : X86CODESIG =

struct
    open CODE_ARRAY
    open DEBUG
    open Address
    open Misc

    (* The compiler may be targeted at native 32-bit, native 64-bit or at
       X86/64 with 32-bit words and addresses encoded as object Ids. *)
    datatype targetArch = Native32Bit | Native64Bit | ObjectId32Bit

    val targetArch =
        case PolyML.architecture() of
            "I386"      => Native32Bit
        |   "X86_64"    => Native64Bit
        |   "X86_64_32" => ObjectId32Bit
        |   _           => raise InternalError "Unknown target architecture"

    (* Sanity check: the word sizes must agree with the chosen target. *)
    val () =
        case (targetArch, wordSize, nativeWordSize) of
            (Native32Bit, 0w4, 0w4) => ()
        |   (Native64Bit, 0w8, 0w8) => ()
        |   (ObjectId32Bit, 0w4, 0w8) => ()
        |   _ => raise InternalError "Mismatch of architecture and word-length"

    val hostIsX64 = targetArch <> Native32Bit

    infix 5 << <<+ <<- >> >>+ >>- ~>> ~>>+ ~>>- (* Shift operators *)
    infix 3 andb orb xorb andbL orbL xorbL andb8 orb8 xorb8

    (* Shifts at the three widths used here: Word, LargeWord and Word8. *)
    val op << = Word.<< and op >> = Word.>>
    val (*op <<+ = LargeWord.<< and *) op >>+ = LargeWord.>>
    val op <<- = Word8.<< and op >>- = Word8.>>

    val op orb8 = Word8.orb
    val op andb8 = Word8.andb

    val op andb = Word.andb (* and op andbL = LargeWord.andb *)
    and op orb = Word.orb

    val wordToWord8 = Word8.fromLargeWord o Word.toLargeWord
    (*and word8ToWord = Word.fromLargeWord o Word8.toLargeWord*)

    val exp2_16 = 0x10000
    val exp2_31 = 0x80000000: LargeInt.int

    (* Returns true if this is a 32-bit machine or if the constant is within
       32-bits.  This is exported to the higher levels.  N.B. The test for
       not isX64 avoids a significant overhead with arbitrary precision
       arithmetic on X86/32.
*) 93 fun is32bit v = not hostIsX64 orelse ~exp2_31 <= v andalso v < exp2_31 94 95 (* tag a short constant *) 96 fun tag c = 2 * c + 1; 97 98 fun is8BitL (n: LargeInt.int) = ~ 0x80 <= n andalso n < 0x80 99 100 local 101 val shift = 102 if wordSize = 0w4 103 then 0w2 104 else if wordSize = 0w8 105 then 0w3 106 else raise InternalError "Invalid word size for x86_32 or x86+64" 107 in 108 fun wordsToBytes n = n << shift 109 and bytesToWords n = n >> shift 110 end 111 112 infix 6 addrPlus addrMinus; 113 114 (* All indexes into the code vector have type "addrs". This is really a legacy. *) 115 type addrs = Word.word 116 117 val addrZero = 0w0 118 119 (* This is the external label type used when constructing operations. *) 120 datatype label = Label of { labelNo: int } 121 122 (* Constants which are too large to go inline in the code are put in 123 a list and put at the end of the code. They are arranged so that 124 the garbage collector can find them and change them as necessary. 125 A reference to a constant is treated like a forward reference to a 126 label. *) 127 128 datatype code = 129 Code of 130 { 131 procName: string, (* Name of the procedure. *) 132 printAssemblyCode:bool, (* Whether to print the code when we finish. *) 133 printStream: string->unit, (* The stream to use *) 134 lowLevelOptimise: bool, (* Whether to do the low-level optimisation pass *) 135 profileObject : machineWord (* The profile object for this code. *) 136 } 137 138 (* Exported functions *) 139 fun lowLevelOptimise(Code{lowLevelOptimise, ...}) = lowLevelOptimise 140 141 (* EBP/RBP points to a structure that interfaces to the RTS. These are 142 offsets into that structure. 
*) 143 val memRegLocalMPointer = 0 (* Not used in 64-bit *) 144 and memRegHandlerRegister = Word.toInt nativeWordSize 145 and memRegLocalMbottom = 2 * Word.toInt nativeWordSize 146 and memRegStackLimit = 3 * Word.toInt nativeWordSize 147 and memRegExceptionPacket = 4 * Word.toInt nativeWordSize 148 and memRegCStackPtr = 6 * Word.toInt nativeWordSize 149 and memRegThreadSelf = 7 * Word.toInt nativeWordSize 150 and memRegStackPtr = 8 * Word.toInt nativeWordSize 151 and memRegHeapOverflowCall = 10 * Word.toInt nativeWordSize 152 and memRegStackOverflowCall = 11 * Word.toInt nativeWordSize 153 and memRegStackOverflowCallEx = 12 * Word.toInt nativeWordSize 154 and memRegSavedRbx = 14 * Word.toInt nativeWordSize (* Heap base in 32-in-64. *) 155 156 (* create and initialise a code segment *) 157 fun codeCreate (name : string, profObj, parameters) : code = 158 let 159 val printStream = PRETTY.getSimplePrinter(parameters, []) 160 in 161 Code 162 { 163 procName = name, 164 printAssemblyCode = DEBUG.getParameter DEBUG.assemblyCodeTag parameters, 165 printStream = printStream, 166 lowLevelOptimise = DEBUG.getParameter DEBUG.lowlevelOptimiseTag parameters, 167 profileObject = profObj 168 } 169 end 170 171 (* Put 1 unsigned byte at a given offset in the segment. *) 172 fun set8u (b, addr, seg) = byteVecSet (seg, addr, b) 173 174 (* Put 4 bytes at a given offset in the segment. *) 175 (* b0 is the least significant byte. *) 176 fun set4Bytes (b3, b2, b1, b0, addr, seg) = 177 let 178 val a = addr; 179 in 180 (* Little-endian *) 181 byteVecSet (seg, a, b0); 182 byteVecSet (seg, a + 0w1, b1); 183 byteVecSet (seg, a + 0w2, b2); 184 byteVecSet (seg, a + 0w3, b3) 185 end; 186 187 (* Put 1 unsigned word at a given offset in the segment. 
*) 188 fun set32u (ival: LargeWord.word, addr, seg) : unit = 189 let 190 val b3 = Word8.fromLargeWord (ival >>+ 0w24) 191 val b2 = Word8.fromLargeWord (ival >>+ 0w16) 192 val b1 = Word8.fromLargeWord (ival >>+ 0w8) 193 val b0 = Word8.fromLargeWord ival 194 in 195 set4Bytes (b3, b2, b1, b0, addr, seg) 196 end 197 198 (* Put 1 signed word at a given offset in the segment. *) 199 fun set32s (ival: LargeInt.int, addr, seg) = set32u(LargeWord.fromLargeInt ival, addr, seg) 200 201 fun byteSigned ival = 202 if ~0x80 <= ival andalso ival < 0x80 203 then Word8.fromInt ival 204 else raise InternalError "byteSigned: invalid byte" 205 206 (* Convert a large-word value to a little-endian byte sequence. *) 207 fun largeWordToBytes(_, 0) = [] 208 | largeWordToBytes(ival: LargeWord.word, n) = 209 Word8.fromLargeWord ival :: largeWordToBytes(ival >>+ 0w8, n-1) 210 211 fun word32Unsigned(ival: LargeWord.word) = largeWordToBytes(ival, 4) 212 213 fun int32Signed(ival: LargeInt.int) = 214 if is32bit ival 215 then word32Unsigned(LargeWord.fromLargeInt ival) 216 else raise InternalError "int32Signed: invalid word" 217 218 (* Registers. *) 219 datatype genReg = GeneralReg of Word8.word * bool 220 and fpReg = FloatingPtReg of Word8.word 221 and xmmReg = SSE2Reg of Word8.word 222 223 datatype reg = 224 GenReg of genReg 225 | FPReg of fpReg 226 | XMMReg of xmmReg 227 228 (* These are the real registers we have. The AMD extension encodes the 229 additional registers through the REX prefix. 
*) 230 val rax = GeneralReg (0w0, false) 231 val rcx = GeneralReg (0w1, false) 232 val rdx = GeneralReg (0w2, false) 233 val rbx = GeneralReg (0w3, false) 234 val rsp = GeneralReg (0w4, false) 235 val rbp = GeneralReg (0w5, false) 236 val rsi = GeneralReg (0w6, false) 237 val rdi = GeneralReg (0w7, false) 238 val eax = rax and ecx = rcx and edx = rdx and ebx = rbx 239 and esp = rsp and ebp = rbp and esi = rsi and edi = rdi 240 val r8 = GeneralReg (0w0, true) 241 val r9 = GeneralReg (0w1, true) 242 val r10 = GeneralReg (0w2, true) 243 val r11 = GeneralReg (0w3, true) 244 val r12 = GeneralReg (0w4, true) 245 val r13 = GeneralReg (0w5, true) 246 val r14 = GeneralReg (0w6, true) 247 val r15 = GeneralReg (0w7, true) 248 249 (* Floating point "registers". Actually entries on the floating point stack. 250 The X86 has a floating point stack with eight entries. *) 251 val fp0 = FloatingPtReg 0w0 252 and fp1 = FloatingPtReg 0w1 253 and fp2 = FloatingPtReg 0w2 254 and fp3 = FloatingPtReg 0w3 255 and fp4 = FloatingPtReg 0w4 256 and fp5 = FloatingPtReg 0w5 257 and fp6 = FloatingPtReg 0w6 258 and fp7 = FloatingPtReg 0w7 259 260 (* SSE2 Registers. These are used for floating point in 64-bity mode. 261 We only use XMM0-6 because the others are callee save and we don't 262 currently save them. *) 263 val xmm0 = SSE2Reg 0w0 264 and xmm1 = SSE2Reg 0w1 265 and xmm2 = SSE2Reg 0w2 266 and xmm3 = SSE2Reg 0w3 267 and xmm4 = SSE2Reg 0w4 268 and xmm5 = SSE2Reg 0w5 269 and xmm6 = SSE2Reg 0w6 270 and xmm7 = SSE2Reg 0w7 271 272 fun getReg (GeneralReg r) = r 273 fun mkReg n = GeneralReg n (* reg.up *) 274 275 (* The maximum size of the register vectors and masks. Although the 276 X86/32 has a floating point stack with eight entries it's much simpler 277 to treat it as having seven "real" registers. Items are pushed to the 278 stack and then stored and popped into the current location. It may be 279 possible to improve the code by some peephole optimisation. 
*) 280 val regs = 30 (* Include the X86/64 registers even if this is 32-bit. *) 281 282 (* The nth register (counting from 0). *) 283 (* Profiling shows that applying the constructors here creates a lot of 284 garbage. Create the entries once and then use vector indexing instead. *) 285 local 286 fun regN i = 287 if i < 8 288 then GenReg(GeneralReg(Word8.fromInt i, false)) 289 else if i < 16 290 then GenReg(GeneralReg(Word8.fromInt(i-8), true)) 291 else if i < 23 292 then FPReg(FloatingPtReg(Word8.fromInt(i-16))) 293 else XMMReg(SSE2Reg(Word8.fromInt(i-23))) 294 val regVec = Vector.tabulate(regs, regN) 295 in 296 fun regN i = Vector.sub(regVec, i) handle Subscript => raise InternalError "Bad register number" 297 end 298 299 (* The number of the register. *) 300 fun nReg(GenReg(GeneralReg(r, false))) = Word8.toInt r 301 | nReg(GenReg(GeneralReg(r, true))) = Word8.toInt r + 8 302 | nReg(FPReg(FloatingPtReg r)) = Word8.toInt r + 16 303 | nReg(XMMReg(SSE2Reg r)) = Word8.toInt r + 23 304 305 datatype opsize = SZByte | SZWord | SZDWord | SZQWord 306 307 (* Default size when printing regs. 
*) 308 val sz32_64 = if hostIsX64 then SZQWord else SZDWord 309 310 fun genRegRepr(GeneralReg (0w0, false), SZByte) = "al" 311 | genRegRepr(GeneralReg (0w1, false), SZByte) = "cl" 312 | genRegRepr(GeneralReg (0w2, false), SZByte) = "dl" 313 | genRegRepr(GeneralReg (0w3, false), SZByte) = "bl" 314 | genRegRepr(GeneralReg (0w4, false), SZByte) = "ah" 315 | genRegRepr(GeneralReg (0w5, false), SZByte) = "ch" 316 | genRegRepr(GeneralReg (0w6, false), SZByte) = "sil" (* Assume there's a Rex code that forces low-order reg *) 317 | genRegRepr(GeneralReg (0w7, false), SZByte) = "dil" 318 | genRegRepr(GeneralReg (reg, true), SZByte) = "r" ^ Int.toString(Word8.toInt reg +8) ^ "b" 319 | genRegRepr(GeneralReg (0w0, false), SZDWord) = "eax" 320 | genRegRepr(GeneralReg (0w1, false), SZDWord) = "ecx" 321 | genRegRepr(GeneralReg (0w2, false), SZDWord) = "edx" 322 | genRegRepr(GeneralReg (0w3, false), SZDWord) = "ebx" 323 | genRegRepr(GeneralReg (0w4, false), SZDWord) = "esp" 324 | genRegRepr(GeneralReg (0w5, false), SZDWord) = "ebp" 325 | genRegRepr(GeneralReg (0w6, false), SZDWord) = "esi" 326 | genRegRepr(GeneralReg (0w7, false), SZDWord) = "edi" 327 | genRegRepr(GeneralReg (reg, true), SZDWord) = "r" ^ Int.toString(Word8.toInt reg +8) ^ "d" 328 | genRegRepr(GeneralReg (0w0, false), SZQWord) = "rax" 329 | genRegRepr(GeneralReg (0w1, false), SZQWord) = "rcx" 330 | genRegRepr(GeneralReg (0w2, false), SZQWord) = "rdx" 331 | genRegRepr(GeneralReg (0w3, false), SZQWord) = "rbx" 332 | genRegRepr(GeneralReg (0w4, false), SZQWord) = "rsp" 333 | genRegRepr(GeneralReg (0w5, false), SZQWord) = "rbp" 334 | genRegRepr(GeneralReg (0w6, false), SZQWord) = "rsi" 335 | genRegRepr(GeneralReg (0w7, false), SZQWord) = "rdi" 336 | genRegRepr(GeneralReg (reg, true), SZQWord) = "r" ^ Int.toString(Word8.toInt reg +8) 337 | genRegRepr(GeneralReg (0w0, false), SZWord) = "ax" 338 | genRegRepr(GeneralReg (0w1, false), SZWord) = "cx" 339 | genRegRepr(GeneralReg (0w2, false), SZWord) = "dx" 340 | 
genRegRepr(GeneralReg (0w3, false), SZWord) = "bx" 341 | genRegRepr(GeneralReg (0w4, false), SZWord) = "sp" 342 | genRegRepr(GeneralReg (0w5, false), SZWord) = "bp" 343 | genRegRepr(GeneralReg (0w6, false), SZWord) = "si" 344 | genRegRepr(GeneralReg (0w7, false), SZWord) = "di" 345 | genRegRepr(GeneralReg (reg, true), SZWord) = "r" ^ Int.toString(Word8.toInt reg +8) ^ "w" 346 | genRegRepr _ = "unknown" (* Suppress warning because word values are not exhaustive. *) 347 348 and fpRegRepr(FloatingPtReg n) = "fp" ^ Word8.toString n 349 350 and xmmRegRepr(SSE2Reg n) = "xmm" ^ Word8.toString n 351 352 fun regRepr(GenReg r) = genRegRepr (r, sz32_64) 353 | regRepr(FPReg r) = fpRegRepr r 354 | regRepr(XMMReg r) = xmmRegRepr r 355 356 (* Install a pretty printer. This is simply for when this code is being 357 run under the debugger. N.B. We need PolyML.PrettyString here. *) 358 val () = PolyML.addPrettyPrinter(fn _ => fn _ => fn r => PolyML.PrettyString(regRepr r)) 359 360 datatype argType = ArgGeneral | ArgFP 361 362 (* Size of operand. OpSize64 is only valid in 64-bit mode. *) 363 datatype opSize = OpSize32 | OpSize64 364 365 structure RegSet = 366 struct 367 (* Implement a register set as a bit mask. 
*) 368 datatype regSet = RegSet of word 369 fun singleton r = RegSet(0w1 << Word.fromInt(nReg r)) 370 fun regSetUnion(RegSet r1, RegSet r2) = RegSet(Word.orb(r1, r2)) 371 fun regSetIntersect(RegSet r1, RegSet r2) = RegSet(Word.andb(r1, r2)) 372 373 local 374 fun addReg(acc, n) = 375 if n = regs then acc else addReg(regSetUnion(acc, singleton(regN n)), n+1) 376 in 377 val allRegisters = addReg(RegSet 0w0, 0) 378 end 379 380 val noRegisters = RegSet 0w0 381 382 fun inSet(r, rs) = regSetIntersect(singleton r, rs) <> noRegisters 383 384 fun regSetMinus(RegSet s1, RegSet s2) = RegSet(Word.andb(s1, Word.notb s2)) 385 386 val listToSet = List.foldl (fn(r, rs) => regSetUnion(singleton r, rs)) noRegisters 387 388 local 389 val regs = 390 case targetArch of 391 Native32Bit => [eax, ecx, edx, ebx, esi, edi] 392 | Native64Bit => [eax, ecx, edx, ebx, esi, edi, r8, r9, r10, r11, r12, r13, r14] 393 | ObjectId32Bit => [eax, ecx, edx, esi, edi, r8, r9, r10, r11, r12, r13, r14] 394 in 395 val generalRegisters = listToSet(map GenReg regs) 396 end 397 398 (* The floating point stack. Note that this excludes one item so it is always 399 possible to load a value onto the top of the FP stack. *) 400 val floatingPtRegisters = 401 listToSet(map FPReg [fp0, fp1, fp2, fp3, fp4, fp5, fp6(*, fp7*)]) 402 403 val sse2Registers = 404 listToSet(map XMMReg [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6]) 405 406 fun isAllRegs rs = rs = allRegisters 407 408 fun setToList (RegSet regSet)= 409 let 410 fun testBit (n, bit, res) = 411 if n = regs 412 then res 413 else testBit(n+1, bit << 0w1, 414 if (regSet andb bit) <> 0w0 415 then regN n :: res else res) 416 in 417 testBit(0, 0w1, []) 418 end 419 420 val cardinality = List.length o setToList 421 422 (* Choose one of the set. This chooses the least value which means that 423 the ordering of the registers is significant. This is a hot-spot 424 so is coded directly with the word operations. 
*) 425 fun oneOf(RegSet regSet) = 426 let 427 fun find(n, bit) = 428 if n = Word.fromInt regs then raise InternalError "oneOf: empty" 429 else if Word.andb(bit, regSet) <> 0w0 then n 430 else find(n+0w1, Word.<<(bit, 0w1)) 431 in 432 regN(Word.toInt(find(0w0, 0w1))) 433 end 434 435 fun regSetRepr regSet = 436 let 437 val regs = setToList regSet 438 in 439 "[" ^ String.concatWith "," (List.map regRepr regs) ^ "]" 440 end 441 442 (* Install a pretty printer for when this code is being debugged. *) 443 val () = PolyML.addPrettyPrinter(fn _ => fn _ => fn r => PolyML.PrettyString(regSetRepr r)) 444 end 445 446 open RegSet 447 448 datatype arithOp = ADD | OR (*|ADC | SBB*) | AND | SUB | XOR | CMP 449 450 fun arithOpToWord ADD = 0w0: Word8.word 451 | arithOpToWord OR = 0w1 452 | arithOpToWord AND = 0w4 453 | arithOpToWord SUB = 0w5 454 | arithOpToWord XOR = 0w6 455 | arithOpToWord CMP = 0w7 456 457 fun arithOpRepr ADD = "Add" 458 | arithOpRepr OR = "Or" 459 | arithOpRepr AND = "And" 460 | arithOpRepr SUB = "Sub" 461 | arithOpRepr XOR = "Xor" 462 | arithOpRepr CMP = "Cmp" 463 464 datatype shiftType = SHL | SHR | SAR 465 466 fun shiftTypeToWord SHL = 0w4: Word8.word 467 | shiftTypeToWord SHR = 0w5 468 | shiftTypeToWord SAR = 0w7 469 470 fun shiftTypeRepr SHL = "Shift Left Logical" 471 | shiftTypeRepr SHR = "Shift Right Logical" 472 | shiftTypeRepr SAR = "Shift Right Arithemetic" 473 474 datatype repOps = CMPS8 | MOVS8 | MOVS32 | STOS8 | STOS32 | MOVS64 | STOS64 475 476 fun repOpsToWord CMPS8 = 0wxa6: Word8.word 477 | repOpsToWord MOVS8 = 0wxa4 478 | repOpsToWord MOVS32 = 0wxa5 479 | repOpsToWord MOVS64 = 0wxa5 (* Plus Rex.w *) 480 | repOpsToWord STOS8 = 0wxaa 481 | repOpsToWord STOS32 = 0wxab 482 | repOpsToWord STOS64 = 0wxab (* Plus Rex.w *) 483 484 fun repOpsRepr CMPS8 = "CompareBytes" 485 | repOpsRepr MOVS8 = "MoveBytes" 486 | repOpsRepr MOVS32 = "MoveWords32" 487 | repOpsRepr MOVS64 = "MoveWords64" 488 | repOpsRepr STOS8 = "StoreBytes" 489 | repOpsRepr STOS32 = 
"StoreWords32" 490 | repOpsRepr STOS64 = "StoreWords64" 491 492 datatype fpOps = FADD | FMUL | FCOM | FCOMP | FSUB | FSUBR | FDIV | FDIVR 493 494 fun fpOpToWord FADD = 0w0: Word8.word 495 | fpOpToWord FMUL = 0w1 496 | fpOpToWord FCOM = 0w2 497 | fpOpToWord FCOMP = 0w3 498 | fpOpToWord FSUB = 0w4 499 | fpOpToWord FSUBR = 0w5 500 | fpOpToWord FDIV = 0w6 501 | fpOpToWord FDIVR = 0w7 502 503 fun fpOpRepr FADD = "FPAdd" 504 | fpOpRepr FMUL = "FPMultiply" 505 | fpOpRepr FCOM = "FPCompare" 506 | fpOpRepr FCOMP = "FPCompareAndPop" 507 | fpOpRepr FSUB = "FPSubtract" 508 | fpOpRepr FSUBR = "FPReverseSubtract" 509 | fpOpRepr FDIV = "FPDivide" 510 | fpOpRepr FDIVR = "FPReverseDivide" 511 512 datatype fpUnaryOps = FCHS | FABS | FLD1 | FLDZ 513 514 fun fpUnaryToWords FCHS = {rm=0w0:Word8.word, nnn=0w4: Word8.word} 515 | fpUnaryToWords FABS = {rm=0w1, nnn=0w4} 516 | fpUnaryToWords FLD1 = {rm=0w0, nnn=0w5} 517 | fpUnaryToWords FLDZ = {rm=0w6, nnn=0w5} 518 519 fun fpUnaryRepr FCHS = "FPChangeSign" 520 | fpUnaryRepr FABS = "FPAbs" 521 | fpUnaryRepr FLD1 = "FPLoadOne" 522 | fpUnaryRepr FLDZ = "FPLoadZero" 523 524 datatype branchOps = JO | JNO | JE | JNE | JL | JGE | JLE | JG | JB | JNB | JNA | JA | JP | JNP 525 526 fun branchOpToWord JO = 0wx0: Word8.word 527 | branchOpToWord JNO = 0wx1 528 | branchOpToWord JB = 0wx2 529 | branchOpToWord JNB = 0wx3 530 | branchOpToWord JE = 0wx4 531 | branchOpToWord JNE = 0wx5 532 | branchOpToWord JNA = 0wx6 533 | branchOpToWord JA = 0wx7 534 | branchOpToWord JP = 0wxa 535 | branchOpToWord JNP = 0wxb 536 | branchOpToWord JL = 0wxc 537 | branchOpToWord JGE = 0wxd 538 | branchOpToWord JLE = 0wxe 539 | branchOpToWord JG = 0wxf 540 541 fun branchOpRepr JO = "Overflow" 542 | branchOpRepr JNO = "NotOverflow" 543 | branchOpRepr JE = "Equal" 544 | branchOpRepr JNE = "NotEqual" 545 | branchOpRepr JL = "Less" 546 | branchOpRepr JGE = "GreaterOrEqual" 547 | branchOpRepr JLE = "LessOrEqual" 548 | branchOpRepr JG = "Greater" 549 | branchOpRepr JB = "Before" 550 | 
branchOpRepr JNB= "NotBefore" 551 | branchOpRepr JNA = "NotAfter" 552 | branchOpRepr JA = "After" 553 | branchOpRepr JP = "Parity" 554 | branchOpRepr JNP = "NoParity" 555 556 (* Invert a test. This is used if we want to change the 557 sense of a test from jumping if the condition is true to 558 jumping if it is false. *) 559 fun invertTest JE = JNE 560 | invertTest JNE = JE 561 | invertTest JA = JNA 562 | invertTest JB = JNB 563 | invertTest JNA = JA 564 | invertTest JNB = JB 565 | invertTest JL = JGE 566 | invertTest JG = JLE 567 | invertTest JLE = JG 568 | invertTest JGE = JL 569 | invertTest JO = JNO 570 | invertTest JNO = JO 571 | invertTest JP = JNP 572 | invertTest JNP = JP 573 574 datatype sse2Operations = 575 SSE2MoveDouble | SSE2MoveFloat | SSE2CompDouble | SSE2AddDouble | 576 SSE2SubDouble | SSE2MulDouble | SSE2DivDouble | 577 SSE2Xor | SSE2And | SSE2FloatToDouble | SSE2DoubleToFloat | 578 SSE2CompSingle | SSE2AddSingle | SSE2SubSingle | SSE2MulSingle | SSE2DivSingle 579 580 fun sse2OpRepr SSE2MoveDouble = "SSE2MoveDouble" 581 | sse2OpRepr SSE2MoveFloat = "SSE2MoveFloat" 582 | sse2OpRepr SSE2CompDouble = "SSE2CompDouble" 583 | sse2OpRepr SSE2AddDouble = "SSE2AddDouble" 584 | sse2OpRepr SSE2SubDouble = "SSE2SubDouble" 585 | sse2OpRepr SSE2MulDouble = "SSE2MulDouble" 586 | sse2OpRepr SSE2DivDouble = "SSE2DivDouble" 587 | sse2OpRepr SSE2Xor = "SSE2Xor" 588 | sse2OpRepr SSE2And = "SSE2And" 589 | sse2OpRepr SSE2CompSingle = "SSE2CompSingle" 590 | sse2OpRepr SSE2AddSingle = "SSE2AddSingle" 591 | sse2OpRepr SSE2SubSingle = "SSE2SubSingle" 592 | sse2OpRepr SSE2MulSingle = "SSE2MulSingle" 593 | sse2OpRepr SSE2DivSingle = "SSE2DivSingle" 594 | sse2OpRepr SSE2FloatToDouble = "SSE2FloatToDouble" 595 | sse2OpRepr SSE2DoubleToFloat = "SSE2DoubleToFloat" 596 597 (* Primary opCodes. N.B. only opCodes actually used are listed here. 598 If new instruction are added check they will be handled by the 599 run-time system in the event of trap. 
*) 600 datatype opCode = 601 Group1_8_A32 602 | Group1_8_A64 603 | Group1_32_A32 604 | Group1_32_A64 605 | Group1_8_a 606 | JMP_8 607 | JMP_32 608 | CALL_32 609 | MOVL_A_R32 610 | MOVL_A_R64 611 | MOVL_R_A32 612 | MOVL_R_A64 613 | MOVL_R_A16 614 | MOVB_R_A32 615 | MOVB_R_A64 of {forceRex: bool} 616 | PUSH_R of Word8.word 617 | POP_R of Word8.word 618 | Group5 619 | NOP 620 | LEAL32 621 | LEAL64 622 | MOVL_32_R of Word8.word 623 | MOVL_64_R of Word8.word 624 | MOVL_32_A32 625 | MOVL_32_A64 626 | MOVB_8_A 627 | POP_A 628 | RET 629 | RET_16 630 | CondJump of branchOps 631 | CondJump32 of branchOps 632 | SetCC of branchOps 633 | Arith32 of arithOp * Word8.word 634 | Arith64 of arithOp * Word8.word 635 | Group3_A32 636 | Group3_A64 637 | Group3_a 638 | Group2_8_A32 639 | Group2_8_A64 640 | Group2_CL_A32 641 | Group2_CL_A64 642 | Group2_1_A32 643 | Group2_1_A64 644 | PUSH_8 645 | PUSH_32 646 | TEST_ACC8 647 | LOCK_XADD32 648 | LOCK_XADD64 649 | FPESC of Word8.word 650 | XCHNG32 651 | XCHNG64 652 | REP (* Rep prefix *) 653 | MOVZB (* Needs escape code. *) 654 | MOVZW (* Needs escape code. *) 655 | MOVSXB32 (* Needs escape code. *) 656 | MOVSXW32 (* Needs escape code. *) 657 | MOVSXB64 (* Needs escape code. *) 658 | MOVSXW64 (* Needs escape code. *) 659 | IMUL32 (* Needs escape code. *) 660 | IMUL64 (* Needs escape code. *) 661 | SSE2StoreSingle (* movss with memory destination - needs escape sequence. *) 662 | SSE2StoreDouble (* movsd with memory destination - needs escape sequence. *) 663 | CQO_CDQ32 (* Sign extend before divide.. *) 664 | CQO_CDQ64 (* Sign extend before divide.. *) 665 | SSE2Ops of sse2Operations (* SSE2 instructions. *) 666 | CVTSI2SD32 667 | CVTSI2SD64 668 | HLT (* End of code marker. *) 669 | IMUL_C8_32 670 | IMUL_C8_64 671 | IMUL_C32_32 672 | IMUL_C32_64 673 | MOVDFromXMM (* move 32 bit value from XMM to general reg. 
*) 674 | MOVQToXMM (* move 64 bit value from general reg.to XMM *) 675 | PSRLDQ (* Shift XMM register *) 676 | LDSTMXCSR 677 | CVTSD2SI32 (* Double to 32-bit int *) 678 | CVTSD2SI64 (* Double to 64-bit int *) 679 | CVTSS2SI32 (* Single to 32-bit int *) 680 | CVTSS2SI64 (* Single to 64-bit int *) 681 | CVTTSD2SI32 (* Double to 32-bit int - truncate towards zero *) 682 | CVTTSD2SI64 (* Double to 64-bit int - truncate towards zero *) 683 | CVTTSS2SI32 (* Single to 32-bit int - truncate towards zero *) 684 | CVTTSS2SI64 (* Single to 64-bit int - truncate towards zero *) 685 | MOVSXD 686 | CMOV32 of branchOps 687 | CMOV64 of branchOps 688 689 690 fun opToInt Group1_8_A32 = 0wx83 691 | opToInt Group1_8_A64 = 0wx83 692 | opToInt Group1_32_A32 = 0wx81 693 | opToInt Group1_32_A64 = 0wx81 694 | opToInt Group1_8_a = 0wx80 695 | opToInt JMP_8 = 0wxeb 696 | opToInt JMP_32 = 0wxe9 697 | opToInt CALL_32 = 0wxe8 698 | opToInt MOVL_A_R32 = 0wx8b 699 | opToInt MOVL_A_R64 = 0wx8b 700 | opToInt MOVL_R_A32 = 0wx89 701 | opToInt MOVL_R_A64 = 0wx89 702 | opToInt MOVL_R_A16 = 0wx89 (* Also has an OPSIZE prefix. 
*) 703 | opToInt MOVB_R_A32 = 0wx88 704 | opToInt (MOVB_R_A64 _) = 0wx88 705 | opToInt (PUSH_R reg) = 0wx50 + reg 706 | opToInt (POP_R reg) = 0wx58 + reg 707 | opToInt Group5 = 0wxff 708 | opToInt NOP = 0wx90 709 | opToInt LEAL32 = 0wx8d 710 | opToInt LEAL64 = 0wx8d 711 | opToInt (MOVL_32_R reg) = 0wxb8 + reg 712 | opToInt (MOVL_64_R reg) = 0wxb8 + reg 713 | opToInt MOVL_32_A32 = 0wxc7 714 | opToInt MOVL_32_A64 = 0wxc7 715 | opToInt MOVB_8_A = 0wxc6 716 | opToInt POP_A = 0wx8f 717 | opToInt RET = 0wxc3 718 | opToInt RET_16 = 0wxc2 719 | opToInt (CondJump opc) = 0wx70 + branchOpToWord opc 720 | opToInt (CondJump32 opc) = 0wx80 + branchOpToWord opc (* Needs 0F prefix *) 721 | opToInt (SetCC opc) = 0wx90 + branchOpToWord opc (* Needs 0F prefix *) 722 | opToInt (Arith32 (ao,dw)) = arithOpToWord ao * 0w8 + dw 723 | opToInt (Arith64 (ao,dw)) = arithOpToWord ao * 0w8 + dw 724 | opToInt Group3_A32 = 0wxf7 725 | opToInt Group3_A64 = 0wxf7 726 | opToInt Group3_a = 0wxf6 727 | opToInt Group2_8_A32 = 0wxc1 728 | opToInt Group2_8_A64 = 0wxc1 729 | opToInt Group2_1_A32 = 0wxd1 730 | opToInt Group2_1_A64 = 0wxd1 731 | opToInt Group2_CL_A32 = 0wxd3 732 | opToInt Group2_CL_A64 = 0wxd3 733 | opToInt PUSH_8 = 0wx6a 734 | opToInt PUSH_32 = 0wx68 735 | opToInt TEST_ACC8 = 0wxa8 736 | opToInt LOCK_XADD32 = 0wxC1 (* Needs lock and escape prefixes. *) 737 | opToInt LOCK_XADD64 = 0wxC1 (* Needs lock and escape prefixes. *) 738 | opToInt (FPESC n) = 0wxD8 orb8 n 739 | opToInt XCHNG32 = 0wx87 740 | opToInt XCHNG64 = 0wx87 741 | opToInt REP = 0wxf3 742 | opToInt MOVZB = 0wxb6 (* Needs escape code. *) 743 | opToInt MOVZW = 0wxb7 (* Needs escape code. *) 744 | opToInt MOVSXB32 = 0wxbe (* Needs escape code. *) 745 | opToInt MOVSXW32 = 0wxbf (* Needs escape code. *) 746 | opToInt MOVSXB64 = 0wxbe (* Needs escape code. *) 747 | opToInt MOVSXW64 = 0wxbf (* Needs escape code. *) 748 | opToInt IMUL32 = 0wxaf (* Needs escape code. *) 749 | opToInt IMUL64 = 0wxaf (* Needs escape code. 
*) 750 | opToInt SSE2StoreSingle = 0wx11 (* Needs F3 0F escape. *) 751 | opToInt SSE2StoreDouble = 0wx11 (* Needs F2 0F escape. *) 752 | opToInt CQO_CDQ32 = 0wx99 753 | opToInt CQO_CDQ64 = 0wx99 754 | opToInt (SSE2Ops SSE2MoveDouble) = 0wx10 (* Needs F2 0F escape. *) 755 | opToInt (SSE2Ops SSE2MoveFloat) = 0wx10 (* Needs F3 0F escape. *) 756 | opToInt (SSE2Ops SSE2CompDouble) = 0wx2E (* Needs 66 0F escape. *) 757 | opToInt (SSE2Ops SSE2AddDouble) = 0wx58 (* Needs F2 0F escape. *) 758 | opToInt (SSE2Ops SSE2SubDouble) = 0wx5c (* Needs F2 0F escape. *) 759 | opToInt (SSE2Ops SSE2MulDouble) = 0wx59 (* Needs F2 0F escape. *) 760 | opToInt (SSE2Ops SSE2DivDouble) = 0wx5e (* Needs F2 0F escape. *) 761 | opToInt (SSE2Ops SSE2CompSingle) = 0wx2E (* Needs 0F escape. *) 762 | opToInt (SSE2Ops SSE2AddSingle) = 0wx58 (* Needs F3 0F escape. *) 763 | opToInt (SSE2Ops SSE2SubSingle) = 0wx5c (* Needs F3 0F escape. *) 764 | opToInt (SSE2Ops SSE2MulSingle) = 0wx59 (* Needs F3 0F escape. *) 765 | opToInt (SSE2Ops SSE2DivSingle) = 0wx5e (* Needs F3 0F escape. *) 766 | opToInt (SSE2Ops SSE2And) = 0wx54 (* Needs 66 0F escape. *) 767 | opToInt (SSE2Ops SSE2Xor) = 0wx57 (* Needs 66 0F escape. *) 768 | opToInt (SSE2Ops SSE2FloatToDouble) = 0wx5A (* Needs F3 0F escape. *) 769 | opToInt (SSE2Ops SSE2DoubleToFloat) = 0wx5A (* Needs F2 0F escape. *) 770 | opToInt CVTSI2SD32 = 0wx2a (* Needs F2 0F escape. *) 771 | opToInt CVTSI2SD64 = 0wx2a (* Needs F2 0F escape. *) 772 | opToInt HLT = 0wxf4 773 | opToInt IMUL_C8_32 = 0wx6b 774 | opToInt IMUL_C8_64 = 0wx6b 775 | opToInt IMUL_C32_32 = 0wx69 776 | opToInt IMUL_C32_64 = 0wx69 777 | opToInt MOVDFromXMM = 0wx7e (* Needs 66 0F escape. *) 778 | opToInt MOVQToXMM = 0wx6e (* Needs 66 0F escape. *) 779 | opToInt PSRLDQ = 0wx73 (* Needs 66 0F escape. *) 780 | opToInt LDSTMXCSR = 0wxae (* Needs 0F prefix. *) 781 | opToInt CVTSD2SI32 = 0wx2d (* Needs F2 0F prefix. *) 782 | opToInt CVTSD2SI64 = 0wx2d (* Needs F2 0F prefix and rex.w. 
*) 783 | opToInt CVTSS2SI32 = 0wx2d (* Needs F3 0F prefix. *) 784 | opToInt CVTSS2SI64 = 0wx2d (* Needs F3 0F prefix and rex.w. *) 785 | opToInt CVTTSD2SI32 = 0wx2c (* Needs F2 0F prefix. *) 786 | opToInt CVTTSD2SI64 = 0wx2c (* Needs F2 0F prefix. *) 787 | opToInt CVTTSS2SI32 = 0wx2c (* Needs F3 0F prefix. *) 788 | opToInt CVTTSS2SI64 = 0wx2c (* Needs F3 0F prefix and rex.w. *) 789 | opToInt MOVSXD = 0wx63 790 | opToInt (CMOV32 opc) = 0wx40 + branchOpToWord opc (* Needs 0F prefix *) 791 | opToInt (CMOV64 opc) = 0wx40 + branchOpToWord opc (* Needs 0F prefix and rex.w *) 792 793 datatype mode = 794 Based0 (* mod = 0 *) 795 | Based8 (* mod = 1 *) 796 | Based32 (* mod = 2 *) 797 | Register (* mod = 3 *) ; 798 799 (* Put together the three fields which make up the mod r/m byte. *) 800 fun modrm (md : mode, rg: Word8.word, rm : Word8.word) : Word8.word = 801 let 802 val _ = if rg > 0w7 then raise InternalError "modrm: bad rg" else () 803 val _ = if rm > 0w7 then raise InternalError "modrm: bad rm" else () 804 val modField: Word8.word = 805 case md of 806 Based0 => 0w0 807 | Based8 => 0w1 808 | Based32 => 0w2 809 | Register => 0w3 810 in 811 (modField <<- 0w6) orb8 (rg <<- 0w3) orb8 rm 812 end 813 814 (* REX prefix *) 815 fun rex {w,r,x,b} = 816 0wx40 orb8 (if w then 0w8 else 0w0) orb8 (if r then 0w4 else 0w0) orb8 817 (if x then 0w2 else 0w0) orb8 (if b then 0w1 else 0w0) 818 819 (* The X86 has the option to include an index register and to scale it. *) 820 datatype indexType = 821 NoIndex | Index1 of genReg | Index2 of genReg | Index4 of genReg | Index8 of genReg 822 823 (* Lock, Opsize and REPNE prefixes come before the REX. *) 824 fun opcodePrefix LOCK_XADD32 = [0wxF0] (* Requires LOCK prefix. *) 825 | opcodePrefix LOCK_XADD64 = [0wxF0] (* Requires LOCK prefix. *) 826 | opcodePrefix MOVL_R_A16 = [0wx66] (* Requires OPSIZE prefix. 
*) 827 | opcodePrefix SSE2StoreSingle = [0wxf3] 828 | opcodePrefix SSE2StoreDouble = [0wxf2] 829 | opcodePrefix(SSE2Ops SSE2CompDouble) = [0wx66] 830 | opcodePrefix(SSE2Ops SSE2And) = [0wx66] 831 | opcodePrefix(SSE2Ops SSE2Xor) = [0wx66] 832 | opcodePrefix(SSE2Ops SSE2CompSingle) = [] (* No prefix *) 833 | opcodePrefix(SSE2Ops SSE2MoveDouble) = [0wxf2] 834 | opcodePrefix(SSE2Ops SSE2AddDouble) = [0wxf2] 835 | opcodePrefix(SSE2Ops SSE2SubDouble) = [0wxf2] 836 | opcodePrefix(SSE2Ops SSE2MulDouble) = [0wxf2] 837 | opcodePrefix(SSE2Ops SSE2DivDouble) = [0wxf2] 838 | opcodePrefix(SSE2Ops SSE2DoubleToFloat) = [0wxf2] 839 | opcodePrefix(SSE2Ops SSE2MoveFloat) = [0wxf3] 840 | opcodePrefix(SSE2Ops SSE2AddSingle) = [0wxf3] 841 | opcodePrefix(SSE2Ops SSE2SubSingle) = [0wxf3] 842 | opcodePrefix(SSE2Ops SSE2MulSingle) = [0wxf3] 843 | opcodePrefix(SSE2Ops SSE2DivSingle) = [0wxf3] 844 | opcodePrefix(SSE2Ops SSE2FloatToDouble) = [0wxf3] 845 | opcodePrefix CVTSI2SD32 = [0wxf2] 846 | opcodePrefix CVTSI2SD64 = [0wxf2] 847 | opcodePrefix MOVDFromXMM = [0wx66] 848 | opcodePrefix MOVQToXMM = [0wx66] 849 | opcodePrefix PSRLDQ = [0wx66] 850 | opcodePrefix CVTSD2SI32 = [0wxf2] 851 | opcodePrefix CVTSD2SI64 = [0wxf2] 852 | opcodePrefix CVTSS2SI32 = [0wxf3] 853 | opcodePrefix CVTSS2SI64 = [0wxf3] 854 | opcodePrefix CVTTSD2SI32 = [0wxf2] 855 | opcodePrefix CVTTSD2SI64 = [0wxf2] 856 | opcodePrefix CVTTSS2SI32 = [0wxf3] 857 | opcodePrefix CVTTSS2SI64 = [0wxf3] 858 | opcodePrefix _ = [] 859 860 (* A few instructions require an escape. Escapes come after the REX. 
*)
    fun escapePrefix opc =
        case opc of
            MOVZB => [0wx0f]
        |   MOVZW => [0wx0f]
        |   MOVSXB32 => [0wx0f]
        |   MOVSXW32 => [0wx0f]
        |   MOVSXB64 => [0wx0f]
        |   MOVSXW64 => [0wx0f]
        |   LOCK_XADD32 => [0wx0f]
        |   LOCK_XADD64 => [0wx0f]
        |   IMUL32 => [0wx0f]
        |   IMUL64 => [0wx0f]
        |   CondJump32 _ => [0wx0f]
        |   SetCC _ => [0wx0f]
        |   SSE2StoreSingle => [0wx0f]
        |   SSE2StoreDouble => [0wx0f]
        |   SSE2Ops _ => [0wx0f]
        |   CVTSI2SD32 => [0wx0f]
        |   CVTSI2SD64 => [0wx0f]
        |   MOVDFromXMM => [0wx0f]
        |   MOVQToXMM => [0wx0f]
        |   PSRLDQ => [0wx0f]
        |   LDSTMXCSR => [0wx0f]
        |   CVTSD2SI32 => [0wx0f]
        |   CVTSD2SI64 => [0wx0f]
        |   CVTSS2SI32 => [0wx0f]
        |   CVTSS2SI64 => [0wx0f]
        |   CVTTSD2SI32 => [0wx0f]
        |   CVTTSD2SI64 => [0wx0f]
        |   CVTTSS2SI32 => [0wx0f]
        |   CVTTSS2SI64 => [0wx0f]
        |   CMOV32 _ => [0wx0f]
        |   CMOV64 _ => [0wx0f]
        |   _ => []

    (* Emit any prefixes, an optional REX byte, any escape and finally the
       opcode byte itself.  A REX is only legal when targeting 64-bit code. *)
    fun opCodeBytes (opb: opCode, rx) =
    let
        val rexBytes =
            case rx of
                NONE => []
            |   SOME bits =>
                    if hostIsX64 then [rex bits]
                    else raise InternalError "opCodeBytes: rex prefix in 32 bit mode"
    in
        opcodePrefix opb @ rexBytes @ escapePrefix opb @ [opToInt opb]
    end

    (* Compute the REX byte, if any, given the extension bits for the reg (rrX),
       base (rbX) and index (riX) fields.  We need a REX prefix whenever the
       operand length must be forced to 64 bits. *)
    fun rexByte (opb, rrX, rbX, riX) =
    let
        val need64bit =
            case opb of
                Group1_8_A64 => true    (* Arithmetic operations - must be 64-bit *)
            |   Group1_32_A64 => true   (* Arithmetic operations - must be 64-bit *)
            |   Group2_1_A64 => true    (* 1-bit shifts - must be 64-bit *)
            |   Group2_8_A64 => true    (* n-bit shifts - must be 64-bit *)
            |   Group2_CL_A64 => true   (* Shifts by value in CL *)
            |   Group3_A64 => true      (* Test, Not, Mul etc. *)
            |   Arith64 (_, _) => true
            |   MOVL_A_R64 => true      (* Needed *)
            |   MOVL_R_A64 => true      (* Needed *)
            |   XCHNG64 => true
            |   LEAL64 => true          (* Needed to ensure the result is 64-bits *)
            |   MOVL_64_R _ => true     (* Needed *)
            |   MOVL_32_A64 => true     (* Needed *)
            |   IMUL64 => true          (* Needed to ensure the result is 64-bits *)
            |   LOCK_XADD64 => true     (* Needed to ensure the result is 64-bits *)
            |   CQO_CDQ64 => true       (* It's only CQO if there's a Rex prefix. *)
            |   CVTSI2SD64 => true      (* This affects the size of the integer source. *)
            |   IMUL_C8_64 => true
            |   IMUL_C32_64 => true
            |   MOVQToXMM => true
            |   CVTSD2SI64 => true      (* This affects the size of the integer source. *)
            |   CVTSS2SI64 => true
            |   CVTTSD2SI64 => true
            |   CVTTSS2SI64 => true
            |   MOVSXD => true
            |   CMOV64 _ => true
            |   MOVSXB64 => true
            |   MOVSXW64 => true
            (* Group5 - We only use 2/4/6 and they don't need prefix *)
            |   _ => false
        (* If we are using MOVB_R_A with SIL or DIL we need to force a REX prefix.
           That's only possible in 64-bit mode.  This also applies with Test and
           SetCC but they are dealt with elsewhere. *)
        val forceRex =
            case opb of
                MOVB_R_A64 {forceRex=true} => true (* Allowed in X86/64 but not in X86/32. *)
            |   _ => false
    in
        if need64bit orelse rrX orelse rbX orelse riX orelse forceRex
        then [rex{w=need64bit, r=rrX, b=rbX, x=riX}]
        else []
    end

    (* Register/register operation.
*)
    fun opReg(opb: opCode, (*dest*) GeneralReg(rrC, rrX), (*source*) GeneralReg(rbC, rbX)) =
        (* Prefixes, optional REX, escape, opcode and a register-mode ModR/M byte. *)
        opcodePrefix opb @ rexByte(opb, rrX, rbX, false) @ escapePrefix opb @
            [opToInt opb, modrm(Register, rrC, rbC)]

    (* Operation on a register where the second "register" field of the ModR/M
       byte actually holds an operation code.  No escape byte is needed here. *)
    fun opRegPlus2(opb: opCode, rd: genReg, op2: Word8.word) =
    let
        val (rdC, rdX) = getReg rd
    in
        opcodePrefix opb @ rexByte(opb, false, rdX, false) @
            [opToInt opb, modrm(Register, op2, rdC)]
    end

    local
        (* General instruction form with ModR/M and optional SIB bytes.  The base
           register is an option because it may be omitted: that form is used
           with LEA to tag integers. *)
        fun opIndexedGen (opb: opCode, offset: LargeInt.int, rb: genReg option, ri: indexType, (rrC, rrX)) =
        let
            (* Base encoding.  (Based0, 0w5) means "no base" so if we need ebp as
               the base we have to use Based8 at least. *)
            val (offsetCode, rbC, rbX) =
                case rb of
                    NONE => (Based0, 0w5 (* no base register *), false)
                |   SOME baseReg =>
                    let
                        val (rbC, rbX) = getReg baseReg
                        val dispMode =
                            if offset = 0 andalso rbC <> 0wx5 (* Can't use ebp with Based0 *)
                            then Based0 (* no disp field *)
                            else if is8BitL offset
                            then Based8 (* use 8-bit disp field *)
                            else Based32 (* use 32-bit disp field *)
                    in
                        (dispMode, rbC, rbX)
                    end

            (* Index coding.  esp can't be used as an index so (0w4, false) means
               "no index", but r12 (0w4, true) CAN be an index. *)
            val ((riC, riX), scaleFactor) =
                case ri of
                    NoIndex => ((0w4, false), 0w0)
                |   Index1 i => (getReg i, 0w0)
                |   Index2 i => (getReg i, 0w1)
                |   Index4 i => (getReg i, 0w2)
                |   Index8 i => (getReg i, 0w3)

            (* If the base register is esp or r12 we have to use a SIB byte even
               if there's no index, because base code 0w4 means "SIB follows". *)
            val modRmAndOptionalSib =
                if rbC = 0w4 (* Code for esp and r12 *) orelse riC <> 0w4 orelse riX
                then [modrm(offsetCode, rrC, 0w4 (* s-i-b *)),
                      (scaleFactor <<- 0w6) orb8 (riC <<- 0w3) orb8 rbC]
                else [modrm(offsetCode, rrC, rbC)]

            (* Generate the displacement field, if any. *)
            val dispField =
                case (offsetCode, rb) of
                    (Based8, _) => [Word8.fromLargeInt offset]
                |   (Based32, _) => int32Signed offset
                |   (_, NONE) => int32Signed offset (* 32 bit absolute used as base *)
                |   _ => []
        in
            opcodePrefix opb @ rexByte(opb, rrX, rbX, riX) @ escapePrefix opb @
                opToInt opb :: modRmAndOptionalSib @ dispField
        end
    in
        fun opEA(opb, offset, rb, r) = opIndexedGen(opb, offset, SOME rb, NoIndex, getReg r)

        (* Generate an opcode plus a second modrm byte, but where the "register"
           field in the modrm byte is actually an operation code. *)
        and opPlus2(opb, offset, rb, op2) = opIndexedGen(opb, offset, SOME rb, NoIndex, (op2, false))

        and opIndexedPlus2(opb, offset, rb, ri, op2) = opIndexedGen(opb, offset, SOME rb, ri, (op2, false))

        fun opIndexed (opb, offset, rb, ri, rd) = opIndexedGen(opb, offset, rb, ri, getReg rd)

        fun opAddress(opb, offset, rb, ri, rd) = opIndexedGen(opb, offset, SOME rb, ri, getReg rd)

        and mMXAddress(opb, offset, rb, ri, SSE2Reg rrC) = opIndexedGen(opb, offset, SOME rb, ri, (rrC, false))

        and opAddressPlus2(opb, offset, rb, ri, op2) = opIndexedGen(opb, offset, SOME rb, ri, (op2, false))
    end

    (* An operation with an operand that needs to go in the constant area or, in
       the case of native 32-bit, where the constant is stored in an object and
       the address of the object is inline.  This just puts in the instruction
       and a dummy address; the details of the constant are dealt with in
       putConst. *)
    fun opConstantOperand(opb, (*dest*) GeneralReg(rrC, rrX)) =
        opcodePrefix opb @ rexByte(opb, rrX, false, false) @ escapePrefix opb @
            [opToInt opb, modrm(Based0, rrC, 0w5 (* PC-relative or absolute *))] @
            int32Signed(tag 0)

    (* Arithmetic with an integer immediate: uses the shortest encoding that can
       hold the value, falling back to the non-address constant area. *)
    fun immediateOperand (opn: arithOp, rd: genReg, imm: LargeInt.int, opSize) =
        if is8BitL imm
        then (* Can use one byte immediate *)
            opRegPlus2(case opSize of OpSize64 => Group1_8_A64 | OpSize32 => Group1_8_A32,
                       rd, arithOpToWord opn) @ [Word8.fromLargeInt imm]
        else if is32bit imm
        then (* Need 32 bit immediate. *)
            opRegPlus2(case opSize of OpSize64 => Group1_32_A64 | OpSize32 => Group1_32_A32,
                       rd, arithOpToWord opn) @ int32Signed imm
        else (* It won't fit in the immediate; put it in the non-address area. *)
            opConstantOperand(
                (case opSize of OpSize64 => Arith64 | OpSize32 => Arith32) (opn, 0w3 (* r/m to reg *)),
                rd)

    fun arithOpReg(opn: arithOp, rd: genReg, rs: genReg, opIs64) =
        opReg((if opIs64 then Arith64 else Arith32) (opn, 0w3 (* r/m to reg *)), rd, rs)

    type handlerLab = addrs ref

    (* x87 escape instruction: the second byte packs md, nnn and rm like a ModR/M byte. *)
    fun floatingPtOp{escape, md, nnn, rm} =
        opCodeBytes(FPESC escape, NONE) @ [(md <<- 0w6) orb8 (nnn <<- 0w3) orb8 rm]

    datatype trapEntries =
        StackOverflowCall
    |   StackOverflowCallEx
    |   HeapOverflowCall

    (* RTS call.  We need to save any registers that may contain addresses to the
       stack.  All the registers are preserved but not seen by the GC. *)
    fun rtsCall(rtsEntry, regSet) =
    let
        val entry =
            case rtsEntry of
                StackOverflowCall => memRegStackOverflowCall
            |   StackOverflowCallEx => memRegStackOverflowCallEx
            |   HeapOverflowCall => memRegHeapOverflowCall
        (* Build a bit-mask of the registers to be saved. *)
        val regMask =
            List.foldl(fn (r, a) => (0w1 << Word.fromInt(nReg(GenReg r))) orb a) 0w0 regSet
        val callInstr = opPlus2(Group5, LargeInt.fromInt entry, ebp, 0w2 (* call *))
        (* The register set is encoded as a dummy instruction after the call. *)
        val regSetInstr =
            if regMask >= 0w256
            then [0wxca, (* This is actually a FAR RETURN *)
                  wordToWord8 regMask, (* Low byte *)
                  wordToWord8 (regMask >> 0w8) (* High byte *)]
            else if regMask <> 0w0
            then [0wxcd, (* This is actually INT n *) wordToWord8 regMask]
            else []
    in
        callInstr @ regSetInstr
    end

    (* Operations.
*)
    type cases = word * label

    type memoryAddress = { base: genReg, offset: int, index: indexType }

    (* An operand that is either a register, a memory reference or a constant. *)
    datatype 'reg regOrMemoryArg =
        RegisterArg of 'reg
    |   MemoryArg of memoryAddress
    |   NonAddressConstArg of LargeInt.int
    |   AddressConstArg of machineWord

    datatype moveSize =
        Move64 | Move32 | Move8 | Move16 | Move32X64 | Move8X32 | Move8X64 | Move16X32 | Move16X64
    and fpSize = SinglePrecision | DoublePrecision

    (* The low-level operations.  These are processed into actual instruction
       bytes by codeGenerate below. *)
    datatype operation =
        Move of { source: genReg regOrMemoryArg, destination: genReg regOrMemoryArg, moveSize: moveSize }
    |   PushToStack of genReg regOrMemoryArg
    |   PopR of genReg
    |   ArithToGenReg of { opc: arithOp, output: genReg, source: genReg regOrMemoryArg, opSize: opSize }
    |   ArithMemConst of { opc: arithOp, address: memoryAddress, source: LargeInt.int, opSize: opSize }
    |   ArithMemLongConst of { opc: arithOp, address: memoryAddress, source: machineWord }
    |   ArithByteMemConst of { opc: arithOp, address: memoryAddress, source: Word8.word }
    |   ShiftConstant of { shiftType: shiftType, output: genReg, shift: Word8.word, opSize: opSize }
    |   ShiftVariable of { shiftType: shiftType, output: genReg, opSize: opSize } (* Shift amount is in ecx *)
    |   ConditionalBranch of { test: branchOps, label: label }
    |   SetCondition of { output: genReg, test: branchOps }
    |   LoadAddress of { output: genReg, offset: int, base: genReg option, index: indexType, opSize: opSize }
    |   TestByteBits of { arg: genReg regOrMemoryArg, bits: Word8.word }
    |   CallRTS of { rtsEntry: trapEntries, saveRegs: genReg list }
    |   AllocStore of { size: int, output: genReg, saveRegs: genReg list }
    |   AllocStoreVariable of { size: genReg, output: genReg, saveRegs: genReg list }
    |   StoreInitialised
    |   CallAddress of genReg regOrMemoryArg
    |   JumpAddress of genReg regOrMemoryArg
    |   ReturnFromFunction of int
    |   RaiseException of { workReg: genReg }
    |   UncondBranch of label
    |   ResetStack of { numWords: int, preserveCC: bool }
    |   JumpLabel of label
    |   LoadLabelAddress of { label: label, output: genReg }
    |   RepeatOperation of repOps
    |   DivideAccR of { arg: genReg, isSigned: bool, opSize: opSize }
    |   DivideAccM of { base: genReg, offset: int, isSigned: bool, opSize: opSize }
    |   AtomicXAdd of { address: memoryAddress, output: genReg, opSize: opSize }
    |   FPLoadFromMemory of { address: memoryAddress, precision: fpSize }
    |   FPLoadFromFPReg of { source: fpReg, lastRef: bool }
    |   FPLoadFromConst of { constant: machineWord, precision: fpSize }
    |   FPStoreToFPReg of { output: fpReg, andPop: bool }
    |   FPStoreToMemory of { address: memoryAddress, precision: fpSize, andPop: bool }
    |   FPArithR of { opc: fpOps, source: fpReg }
    |   FPArithConst of { opc: fpOps, source: machineWord, precision: fpSize }
    |   FPArithMemory of { opc: fpOps, base: genReg, offset: int, precision: fpSize }
    |   FPUnary of fpUnaryOps
    |   FPStatusToEAX
    |   FPLoadInt of { base: genReg, offset: int, opSize: opSize }
    |   FPFree of fpReg
    |   MultiplyR of { source: genReg regOrMemoryArg, output: genReg, opSize: opSize }
    |   XMMArith of { opc: sse2Operations, source: xmmReg regOrMemoryArg, output: xmmReg }
    |   XMMStoreToMemory of { toStore: xmmReg, address: memoryAddress, precision: fpSize }
    |   XMMConvertFromInt of { source: genReg, output: xmmReg, opSize: opSize }
    |   SignExtendForDivide of opSize
    |   XChng of { reg: genReg, arg: genReg regOrMemoryArg, opSize: opSize }
    |   Negative of { output: genReg, opSize: opSize }
    |   JumpTable of { cases: label list, jumpSize: jumpSize ref }
    |   IndexedJumpCalc of { addrReg: genReg, indexReg: genReg, jumpSize: jumpSize ref }
    |   MoveXMMRegToGenReg of { source: xmmReg, output: genReg }
    |   MoveGenRegToXMMReg of { source: genReg, output: xmmReg }
    |   XMMShiftRight of { output: xmmReg, shift: Word8.word }
    |   FPLoadCtrlWord of memoryAddress (* Load FP control word. *)
    |   FPStoreCtrlWord of memoryAddress (* Store FP control word. *)
    |   XMMLoadCSR of memoryAddress (* Load combined control/status word. *)
    |   XMMStoreCSR of memoryAddress (* Store combined control/status word. *)
    |   FPStoreInt of memoryAddress
    |   XMMStoreInt of { source: xmmReg regOrMemoryArg, output: genReg, precision: fpSize, isTruncate: bool }
    |   CondMove of { test: branchOps, output: genReg, source: genReg regOrMemoryArg, opSize: opSize }
    |   LoadAbsolute of { destination: genReg, value: machineWord }

    and jumpSize = JumpSize2 | JumpSize8

    type operations = operation list

    (* Print one operation, for low-level code tracing. *)
    fun printOperation(operation, stream) =
    let
        fun printGReg r = stream(genRegRepr(r, sz32_64))
        val printFPReg = stream o fpRegRepr
        and printXMMReg = stream o xmmRegRepr

        (* Base/index/offset in the form "offset(base)[index*scale]". *)
        fun printBaseOffset(b, x, i) =
        (
            stream(Int.toString i); stream "("; printGReg b; stream ")";
            case x of
                NoIndex => ()
            |   Index1 x => (stream "["; printGReg x; stream "]")
            |   Index2 x => (stream "["; printGReg x; stream "*2]")
            |   Index4 x => (stream "["; printGReg x; stream "*4]")
            |   Index8 x => (stream "["; printGReg x; stream "*8]")
        )

        fun printMemAddress({ base, offset, index }) = printBaseOffset(base, index, offset)

        fun printRegOrMemoryArg printReg (RegisterArg r) = printReg r
        |   printRegOrMemoryArg _ (MemoryArg{ base, offset, index }) = printBaseOffset(base, index, offset)
        |   printRegOrMemoryArg _ (NonAddressConstArg c) = stream(LargeInt.toString c)
        |   printRegOrMemoryArg _ (AddressConstArg c) = stream(Address.stringOfWord c)

        fun printOpSize OpSize32 = "32"
        |   printOpSize OpSize64 = "64"
    in
        case operation of
            Move { source, destination, moveSize } =>
            (
                case moveSize of
                    Move64 => stream "Move64 "
                |   Move32 => stream "Move32 "
                |   Move8 => stream "Move8 "
                |   Move16 => stream "Move16 "
                |   Move32X64 => stream "Move32X64 "
                |   Move8X32 => stream "Move8X32 "
                |   Move8X64 => stream "Move8X64 "
                |   Move16X32 => stream "Move16X32 "
                |   Move16X64 => stream "Move16X64 ";
                printRegOrMemoryArg printGReg destination; stream " <= ";
                printRegOrMemoryArg printGReg source
            )

        |   ArithToGenReg { opc, output, source, opSize } =>
            (
                stream (arithOpRepr opc); stream "RR"; stream(printOpSize opSize); stream " ";
                printGReg output; stream " <= "; printRegOrMemoryArg printGReg source
            )

        |   ArithMemConst { opc, address, source, opSize } =>
            (
                stream (arithOpRepr opc); stream "MC"; stream(printOpSize opSize); stream " ";
                printMemAddress address; stream " "; stream(LargeInt.toString source)
            )

        |   ArithMemLongConst { opc, address, source } =>
            (
                stream (arithOpRepr opc ^ "MC "); printMemAddress address;
                stream " <= "; stream(Address.stringOfWord source)
            )

        |   ArithByteMemConst { opc, address, source } =>
            (
                stream (arithOpRepr opc); stream "MC8"; stream " ";
                printMemAddress address; stream " "; stream(Word8.toString source)
            )

        |   ShiftConstant { shiftType, output, shift, opSize } =>
            (
                stream(shiftTypeRepr shiftType); stream(printOpSize opSize); stream " ";
                printGReg output; stream " by "; stream(Word8.toString shift)
            )

        |   ShiftVariable { shiftType, output, opSize } => (* Shift amount is in ecx *)
            (
                stream(shiftTypeRepr shiftType); stream(printOpSize opSize); stream " ";
                printGReg output; stream " by ECX"
            )

        |   ConditionalBranch { test, label=Label{labelNo, ...} } =>
                (stream "Jump"; stream(branchOpRepr test); stream " L"; stream(Int.toString labelNo))

        |   SetCondition { output, test } =>
                (stream "SetCC"; stream(branchOpRepr test); stream " => "; printGReg output)

        |   PushToStack source => (stream "Push "; printRegOrMemoryArg printGReg source)

        |   PopR dest => (stream "PopR "; printGReg dest)

        |   LoadAddress{ output, offset, base, index, opSize } =>
            (
                stream "LoadAddress"; stream(printOpSize opSize); stream " ";
                case base of NONE => () | SOME r => (printGReg r; stream " + ");
                stream(Int.toString offset);
                case index of
                    NoIndex => ()
                |   Index1 x => (stream " + "; printGReg x)
                |   Index2 x => (stream " + "; printGReg x; stream "*2 ")
                |   Index4 x => (stream " + "; printGReg x; stream "*4 ")
                |   Index8 x => (stream " + "; printGReg x; stream "*8 ");
                stream " => "; printGReg output
            )

        |   TestByteBits { arg, bits } =>
                (stream "TestByteBits "; printRegOrMemoryArg printGReg arg;
                 stream " 0x"; stream(Word8.toString bits))

        |   CallRTS {rtsEntry, ...} =>
            (
                stream "CallRTS ";
                case rtsEntry of
                    StackOverflowCall => stream "StackOverflowCall"
                |   HeapOverflowCall => stream "HeapOverflow"
                |   StackOverflowCallEx => stream "StackOverflowCallEx"
            )

        |   AllocStore { size, output, ... } =>
                (stream "AllocStore "; stream(Int.toString size); stream " => "; printGReg output)

        |   AllocStoreVariable { output, size, ...} =>
                (stream "AllocStoreVariable "; printGReg size; stream " => "; printGReg output)

        |   StoreInitialised => stream "StoreInitialised"

        |   CallAddress source => (stream "CallAddress "; printRegOrMemoryArg printGReg source)

        |   JumpAddress source => (stream "JumpAddress "; printRegOrMemoryArg printGReg source)

        |   ReturnFromFunction argsToRemove =>
                (stream "ReturnFromFunction "; stream(Int.toString argsToRemove))

        |   RaiseException { workReg } => (stream "RaiseException "; printGReg workReg)

        |   UncondBranch(Label{labelNo, ...}) =>
                (stream "UncondBranch L"; stream(Int.toString labelNo))

        |   ResetStack{numWords, preserveCC} =>
                (stream "ResetStack "; stream(Int.toString numWords);
                 if preserveCC then stream " preserve CC" else ())

        |   JumpLabel(Label{labelNo, ...}) =>
                (stream "L"; stream(Int.toString labelNo); stream ":")

        |   LoadLabelAddress{ label=Label{labelNo, ...}, output } =>
                (stream "LoadLabelAddress L"; stream(Int.toString labelNo); stream "=>"; printGReg output)

        |   RepeatOperation repOp => (stream "Repeat "; stream(repOpsRepr repOp))

        |   DivideAccR{arg, isSigned, opSize} =>
                (stream(if isSigned then "DivideSigned" else "DivideUnsigned");
                 stream(printOpSize opSize); stream " "; printGReg arg)

        |   DivideAccM{base, offset, isSigned, opSize} =>
                (stream(if isSigned then "DivideSigned" else "DivideUnsigned");
                 stream(printOpSize opSize); stream " "; printBaseOffset(base, NoIndex, offset))

        |   AtomicXAdd{address, output, opSize} =>
                (stream "LockedXAdd"; stream(printOpSize opSize); printMemAddress address;
                 stream " <=> "; printGReg output)

        |   FPLoadFromMemory{address, precision=DoublePrecision} =>
                (stream "FPLoadDouble "; printMemAddress address)

        |   FPLoadFromMemory{address, precision=SinglePrecision} =>
                (stream "FPLoadSingle "; printMemAddress address)

        |   FPLoadFromFPReg {source, lastRef} =>
                (stream "FPLoad "; printFPReg source; if lastRef then stream " (LAST)" else())

        |   FPLoadFromConst{constant, precision} =>
            (
                case precision of DoublePrecision => stream "FPLoadD " | SinglePrecision => stream "FPLoadS";
                stream(Address.stringOfWord constant)
            )

        |   FPStoreToFPReg{ output, andPop } =>
                (if andPop then stream "FPStoreAndPop => " else stream "FPStore => "; printFPReg output)

        |   FPStoreToMemory{ address, precision=DoublePrecision, andPop: bool } =>
            (
                if andPop then stream "FPStoreDoubleAndPop => " else stream "FPStoreDouble => ";
                printMemAddress address
            )

        |   FPStoreToMemory{ address, precision=SinglePrecision, andPop: bool } =>
            (
                if andPop then stream "FPStoreSingleAndPop => " else stream "FPStoreSingle => ";
                printMemAddress address
            )

        |   FPArithR{ opc, source } => (stream(fpOpRepr opc); stream " "; printFPReg source)

        |   FPArithConst{ opc, source, precision } =>
                (stream(fpOpRepr opc);
                 case precision of DoublePrecision => stream "D " | SinglePrecision => stream "S ";
                 stream(Address.stringOfWord source))

        |   FPArithMemory{ opc, base, offset, precision } =>
                (stream(fpOpRepr opc);
                 case precision of DoublePrecision => stream "D " | SinglePrecision => stream "S ";
                 printBaseOffset(base, NoIndex, offset))

        |   FPUnary opc => stream(fpUnaryRepr opc)

        |   FPStatusToEAX => (stream "FPStatus "; printGReg eax)

        |   FPLoadInt { base, offset, opSize} =>
                (stream "FPLoadInt"; stream(printOpSize opSize); stream " ";
                 printBaseOffset(base, NoIndex, offset))

        |   FPFree reg => (stream "FPFree "; printFPReg reg)

        |   MultiplyR {source, output, opSize } =>
                (stream "MultiplyR"; stream(printOpSize opSize); stream " ";
                 printRegOrMemoryArg printGReg source; stream " *=>"; printGReg output)

        |   XMMArith { opc, source, output } =>
            (
                stream (sse2OpRepr opc ^ "RM ");
                printXMMReg output; stream " <= "; printRegOrMemoryArg printXMMReg source
            )

        |   XMMStoreToMemory { toStore, address, precision=DoublePrecision } =>
                (stream "MoveDouble "; printXMMReg toStore; stream " => "; printMemAddress address)

        |   XMMStoreToMemory { toStore, address, precision=SinglePrecision } =>
                (stream "MoveSingle "; printXMMReg toStore; stream " => "; printMemAddress address)

        |   XMMConvertFromInt { source, output, opSize } =>
                (stream "ConvertFromInt "; stream(printOpSize opSize); stream " ";
                 printGReg source; stream " => "; printXMMReg output)

        |   SignExtendForDivide opSize =>
                (stream "SignExtendForDivide"; stream(printOpSize opSize))

        |   XChng { reg, arg, opSize } =>
                (stream "XChng"; stream(printOpSize opSize); stream " "; printGReg reg;
                 stream " <=> "; printRegOrMemoryArg printGReg arg)

        |   Negative { output, opSize } =>
                (stream "Negative"; stream(printOpSize opSize); stream " "; printGReg output)

        |   JumpTable{cases, ...} =>
                List.app(fn(Label{labelNo, ...}) =>
                    (stream "UncondBranch L"; stream(Int.toString labelNo); stream "\n")) cases

        |   IndexedJumpCalc { addrReg, indexReg, jumpSize=ref jumpSize } =>
            (
                stream "IndexedJumpCalc "; printGReg addrReg; stream " += "; printGReg indexReg;
                stream (case jumpSize of JumpSize2 => " * 2" | JumpSize8 => " * 8 ")
            )

        |   MoveXMMRegToGenReg { source, output } =>
                (stream "MoveXMMRegToGenReg "; printXMMReg source; stream " => "; printGReg output)

        |   MoveGenRegToXMMReg { source, output } =>
                (stream "MoveGenRegToXMMReg "; printGReg source; stream " => "; printXMMReg output)

        |   XMMShiftRight { output, shift } =>
                (stream "XMMShiftRight "; printXMMReg output; stream " by "; stream(Word8.toString shift))

        |   FPLoadCtrlWord address =>
                (stream "FPLoadCtrlWord "; stream " => "; printMemAddress address)

        |   FPStoreCtrlWord address =>
                (stream "FPStoreCtrlWord "; stream " <= "; printMemAddress address)

        |   XMMLoadCSR address =>
                (stream "XMMLoadCSR "; stream " => "; printMemAddress address)

        |   XMMStoreCSR address =>
                (stream "XMMStoreCSR "; stream " <= "; printMemAddress address)

        |   FPStoreInt address =>
                (stream "FPStoreInt "; stream " <= "; printMemAddress address)

        |   XMMStoreInt{ source, output, precision, isTruncate } =>
            (
                stream "XMMStoreInt";
                case precision of SinglePrecision => stream "Single" | DoublePrecision => stream "Double";
                if isTruncate then stream "Truncate " else stream " ";
                printGReg output; stream " <= "; printRegOrMemoryArg printXMMReg source
            )

        |   CondMove { test, output, source, opSize } =>
            (
                stream "CondMove"; stream(branchOpRepr test); stream(printOpSize opSize);
                printGReg output; stream " <= "; printRegOrMemoryArg printGReg source
            )

        |   LoadAbsolute { destination, value } =>
                (stream "LoadAbsolute "; printGReg destination; stream " <= ";
                 stream(Address.stringOfWord value))
        ;

        stream "\n"
    end

    datatype implement = ImplementGeneral | ImplementLiteral of machineWord

    (* Print the whole operation list, if assembly-code tracing was requested. *)
    fun printLowLevelCode(ops, Code{printAssemblyCode, printStream, procName, ...}) =
        if printAssemblyCode
        then
        (
            if procName = "" (* No name *) then printStream "?" else printStream procName;
            printStream ":\n";
            List.app(fn i => printOperation(i, printStream)) ops;
            printStream "\n"
        )
        else ()

(*  val opLen = if isX64 then OpSize64 else OpSize32 *)

    (* Code generate a list of operations.  The list is in reverse order i.e. last instruction first. *)
    fun codeGenerate ops =
    let
        fun cgOp(Move{source=RegisterArg source, destination=RegisterArg output, moveSize=Move64 }) =
            (* Move from one general register to another. N.B.
Because we're using the 1477 "store" version of the Move the source and output are reversed. *) 1478 opReg(MOVL_R_A64, source, output) 1479 1480 | cgOp(Move{source=RegisterArg source, destination=RegisterArg output, moveSize=Move32 }) = 1481 opReg(MOVL_R_A32, source, output) 1482 1483 | cgOp(Move{ source=NonAddressConstArg source, destination=RegisterArg output, moveSize=Move64}) = 1484 if targetArch <> Native32Bit 1485 then 1486 ( 1487 (* N.B. There is related code in getConstant that deals with PC-relative values and 1488 also checks the range of constants that need to be in the constant area. *) 1489 if source >= 0 andalso source < 0x100000000 1490 then (* Unsigned 32 bits. We can use a 32-bit instruction to set the 1491 value because it will zero extend to 64-bits. 1492 This may also allow us to save a rex byte. *) 1493 let 1494 val (rc, rx) = getReg output 1495 val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE) 1496 in 1497 opb @ word32Unsigned(LargeWord.fromLargeInt source) 1498 end 1499 else if source >= ~0x80000000 andalso source < 0 1500 then (* Signed 32-bits. *) 1501 (* This is not scanned in 64-bit mode because 32-bit values aren't 1502 big enough to contain addresses. *) 1503 opRegPlus2(MOVL_32_A64, output, 0w0) @ int32Signed source 1504 else (* Too big for 32-bits; put it in the non-word area. *) 1505 opConstantOperand(MOVL_A_R64, output) 1506 ) 1507 else (* 32-bit mode. *) 1508 ( 1509 (* The RTS scans for possible addresses in MOV instructions so we 1510 can only use MOV if this is a tagged value. If it isn't we have 1511 to use something else such as XOR/ADD. In particular this is used 1512 before LOCK XADD for atomic inc/dec. 1513 We expect Move to preserve the CC so shouldn't use anything that 1514 affects it. There was a previous comment that said that using 1515 LEA wasn't a good idea. Perhaps because it takes 6 bytes. 
*) 1516 if source mod 2 = 0 1517 then opIndexed(LEAL32, source, NONE, NoIndex, output) 1518 else 1519 let 1520 val (rc, rx) = getReg output 1521 val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE) 1522 in 1523 opb @ int32Signed source 1524 end 1525 ) 1526 1527 | cgOp(Move{ source=NonAddressConstArg source, destination=RegisterArg output, moveSize=Move32}) = 1528 if targetArch <> Native32Bit 1529 then 1530 ( 1531 (* N.B. There is related code in getConstant that deals with PC-relative values and 1532 also checks the range of constants that need to be in the constant area. *) 1533 if source >= 0 andalso source < 0x100000000 1534 then (* Unsigned 32 bits. We can use a 32-bit instruction to set the 1535 value because it will zero extend to 64-bits. 1536 This may also allow us to save a rex byte. *) 1537 let 1538 val (rc, rx) = getReg output 1539 val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE) 1540 in 1541 opb @ word32Unsigned(LargeWord.fromLargeInt source) 1542 end 1543 else if source >= ~0x80000000 andalso source < 0 1544 then (* Signed 32-bits. *) 1545 (* This is not scanned in 64-bit mode because 32-bit values aren't 1546 big enough to contain addresses. *) 1547 opRegPlus2(MOVL_32_A64, output, 0w0) @ int32Signed source 1548 else (* Too big for 32-bits; put it in the non-word area. *) 1549 opConstantOperand(MOVL_A_R64, output) 1550 ) 1551 else (* 32-bit mode. *) 1552 ( 1553 (* The RTS scans for possible addresses in MOV instructions so we 1554 can only use MOV if this is a tagged value. If it isn't we have 1555 to use something else such as XOR/ADD. In particular this is used 1556 before LOCK XADD for atomic inc/dec. 1557 We expect Move to preserve the CC so shouldn't use anything that 1558 affects it. There was a previous comment that said that using 1559 LEA wasn't a good idea. Perhaps because it takes 6 bytes. 
*) 1560 if source mod 2 = 0 1561 then opIndexed(LEAL32, source, NONE, NoIndex, output) 1562 else 1563 let 1564 val (rc, rx) = getReg output 1565 val opb = opCodeBytes(MOVL_32_R rc, if rx then SOME{w=false, r=false, b=rx, x=false} else NONE) 1566 in 1567 opb @ int32Signed source 1568 end 1569 ) 1570 1571 | cgOp(Move{ source=AddressConstArg _, destination=RegisterArg output, moveSize=Move64 }) = 1572 ( 1573 (* The constant area is currently PolyWords. That means we MUST use 1574 a 32-bit load in 32-in-64. *) 1575 targetArch = Native64Bit orelse raise InternalError "Move64 in 32-bit"; 1576 (* Put address constants in the constant area. *) 1577 opConstantOperand(MOVL_A_R64, output) 1578 ) 1579 1580 | cgOp(Move{ source=AddressConstArg _, destination=RegisterArg output, moveSize=Move32 }) = 1581 ( 1582 case targetArch of 1583 Native64Bit => raise InternalError "Move32 - AddressConstArg" 1584 1585 | ObjectId32Bit => 1586 (* Put address constants in the constant area. *) 1587 (* The constant area is currently PolyWords. That means we MUST use 1588 a 32-bit load in 32-in-64. *) 1589 opConstantOperand(MOVL_A_R32, output) 1590 1591 | Native32Bit => 1592 (* Immediate constant *) 1593 let 1594 val (rc, _) = getReg output 1595 in 1596 opCodeBytes(MOVL_32_R rc, NONE) @ int32Signed(tag 0) 1597 end 1598 ) 1599 1600 | cgOp(LoadAbsolute{ destination, ... }) = 1601 ( 1602 (* Immediate address constant. This is currently only used the special case of loading 1603 the address of PolyX86GetThreadData in a callback when we don't have rbx in 32-in-64. 
*) 1604 case targetArch of 1605 Native32Bit => 1606 let 1607 val (rc, _) = getReg destination 1608 in 1609 opCodeBytes(MOVL_32_R rc, NONE) @ int32Signed(tag 0) 1610 end 1611 1612 | Native64Bit => opConstantOperand(MOVL_A_R64, destination) 1613 1614 | ObjectId32Bit => 1615 let 1616 val (rc, rx) = getReg destination 1617 in 1618 opCodeBytes(MOVL_64_R rc, SOME{w=true, r=false, b=rx, x=false}) @ largeWordToBytes(LargeWord.fromLargeInt(tag 0), 8) 1619 end 1620 1621 ) 1622 1623 | cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move32 }) = 1624 opAddress(MOVL_A_R32, LargeInt.fromInt offset, base, index, output) 1625 1626 | cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move64 }) = 1627 opAddress(MOVL_A_R64, LargeInt.fromInt offset, base, index, output) 1628 1629 | cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move8 }) = 1630 (* We don't need a REX.W bit here because the top 32-bits of a 1631 64-bit register will always be zeroed. *) 1632 opAddress(MOVZB, LargeInt.fromInt offset, base, index, output) 1633 1634 | cgOp(Move{source=RegisterArg source, destination=RegisterArg output, moveSize=Move8 }) = 1635 let 1636 (* Zero extend an 8-bit value in a register to 32/64 bits. *) 1637 val (rrC, rrX) = getReg output 1638 val (rbC, rbX) = getReg source 1639 (* We don't need a REX.W bit here because the top 32-bits of a 1640 64-bit register will always be zeroed but we may need a REX byte 1641 if we're using esi or edi. 
*) 1642 val rexByte = 1643 if rrC < 0w4 andalso not rrX andalso not rbX 1644 then NONE 1645 else if hostIsX64 1646 then SOME {w=false, r=rrX, b=rbX, x=false} 1647 else raise InternalError "Move8 with esi/edi" 1648 in 1649 opCodeBytes(MOVZB, rexByte) @ [modrm(Register, rrC, rbC)] 1650 end 1651 1652 | cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move8X32 }) = 1653 opAddress(MOVSXB32, LargeInt.fromInt offset, base, index, output) 1654 1655 | cgOp(Move{source=MemoryArg{base, offset, index}, destination=RegisterArg output, moveSize=Move8X64 }) = 1656 (* But we will need a Rex.W here. *) 1657 opAddress(MOVSXB64, LargeInt.fromInt offset, base, index, output) 1658 1659 | cgOp(Move{moveSize=Move16, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) = 1660 (* No need for Rex.W *) 1661 opAddress(MOVZW, LargeInt.fromInt offset, base, index, output) 1662 1663 | cgOp(Move{moveSize=Move16X32, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) = 1664 opAddress(MOVSXW32, LargeInt.fromInt offset, base, index, output) 1665 1666 | cgOp(Move{moveSize=Move16X64, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) = 1667 (* But we do need Rex.W here *) 1668 opAddress(MOVSXW64, LargeInt.fromInt offset, base, index, output) 1669 1670 | cgOp(Move{moveSize=Move32X64, source=RegisterArg source, destination=RegisterArg output }) = 1671 (* We should have a REX.W bit here. *) 1672 opReg(MOVSXD, output, source) 1673 1674 | cgOp(Move{moveSize=Move32X64, source=MemoryArg{base, offset, index}, destination=RegisterArg output }) = 1675 (* We should have a REX.W bit here. 
*) 1676 opAddress(MOVSXD, LargeInt.fromInt offset, base, index, output) 1677 1678 | cgOp(Move{moveSize=Move32X64, ...}) = raise InternalError "cgOp: LoadNonWord Size32Bit" 1679 1680 | cgOp(LoadAddress{ offset, base, index, output, opSize }) = 1681 (* This provides a mixture of addition and multiplication in a single 1682 instruction. *) 1683 opIndexed(case opSize of OpSize64 => LEAL64 | OpSize32 => LEAL32, LargeInt.fromInt offset, base, index, output) 1684 1685 | cgOp(ArithToGenReg{ opc, output, source=RegisterArg source, opSize }) = 1686 arithOpReg (opc, output, source, opSize=OpSize64) 1687 1688 | cgOp(ArithToGenReg{ opc, output, source=NonAddressConstArg source, opSize }) = 1689 let 1690 (* On the X86/32 we use CMP with literal sources to compare with an 1691 address and the RTS searches for them in the code. Any 1692 non-address constant must be tagged. Most will be but we 1693 might want to use this to compare with the contents of a 1694 LargeWord value. *) 1695 val _ = 1696 if hostIsX64 orelse is8BitL source orelse opc <> CMP orelse IntInf.andb(source, 1) = 1 1697 then () 1698 else raise InternalError "CMP with constant that looks like an address" 1699 in 1700 immediateOperand(opc, output, source, opSize) 1701 end 1702 1703 | cgOp(ArithToGenReg{ opc, output, source=AddressConstArg _, opSize }) = 1704 (* This is only used for opc=CMP to compare addresses for equality. *) 1705 if hostIsX64 1706 then (* We use this in 32-in-64 as well as native 64-bit. 
*) 1707 opConstantOperand( 1708 (case opSize of OpSize64 => Arith64 | OpSize32 => Arith32) (opc, 0w3), output) 1709 else 1710 let 1711 val (rc, _) = getReg output 1712 val opb = opCodeBytes(Group1_32_A32 (* group1, 32 bit immediate *), NONE) 1713 val mdrm = modrm(Register, arithOpToWord opc, rc) 1714 in 1715 opb @ [mdrm] @ int32Signed(tag 0) 1716 end 1717 1718 | cgOp(ArithToGenReg{ opc, output, source=MemoryArg{offset, base, index}, opSize }) = 1719 opAddress((case opSize of OpSize64 => Arith64 | OpSize32 => Arith32) (opc, 0w3), 1720 LargeInt.fromInt offset, base, index, output) 1721 1722 | cgOp(ArithByteMemConst{ opc, address={offset, base, index}, source }) = 1723 opIndexedPlus2(Group1_8_a (* group1, 8 bit immediate *), 1724 LargeInt.fromInt offset, base, index, arithOpToWord opc) @ [source] 1725 1726 | cgOp(ArithMemConst{ opc, address={offset, base, index}, source, opSize }) = 1727 if is8BitL source 1728 then (* Can use one byte immediate *) 1729 opIndexedPlus2(case opSize of OpSize64 => Group1_8_A64 | OpSize32 => Group1_8_A32 (* group1, 8 bit immediate *), 1730 LargeInt.fromInt offset, base, index, arithOpToWord opc) @ [Word8.fromLargeInt source] 1731 else (* Need 32 bit immediate. *) 1732 opIndexedPlus2(case opSize of OpSize64 => Group1_32_A64 | OpSize32 => Group1_32_A32(* group1, 32 bit immediate *), 1733 LargeInt.fromInt offset, base, index, arithOpToWord opc) @ int32Signed source 1734 1735 | cgOp(ArithMemLongConst{ opc, address={offset, base, index}, ... }) = 1736 (* Currently this is always a comparison. It is only valid in 32-bit mode because 1737 the constant is only 32-bits. 
*) 1738 if hostIsX64 1739 then raise InternalError "ArithMemLongConst in 64-bit mode" 1740 else 1741 let 1742 val opb = opIndexedPlus2 (Group1_32_A32, LargeInt.fromInt offset, base, index, arithOpToWord opc) 1743 in 1744 opb @ int32Signed(tag 0) 1745 end 1746 1747 | cgOp(ShiftConstant { shiftType, output, shift, opSize }) = 1748 if shift = 0w1 1749 then opRegPlus2(case opSize of OpSize64 => Group2_1_A64 | OpSize32 => Group2_1_A32, output, shiftTypeToWord shiftType) 1750 else opRegPlus2(case opSize of OpSize64 => Group2_8_A64 | OpSize32 => Group2_8_A32, output, shiftTypeToWord shiftType) @ [shift] 1751 1752 | cgOp(ShiftVariable { shiftType, output, opSize }) = 1753 opRegPlus2(case opSize of OpSize64 => Group2_CL_A64 | OpSize32 => Group2_CL_A32, output, shiftTypeToWord shiftType) 1754 1755 | cgOp(TestByteBits{arg=RegisterArg reg, bits}) = 1756 let 1757 (* Test the bottom bit and jump depending on its value. This is used 1758 for tag tests in arbitrary precision operations and also for testing 1759 for short/long values. *) 1760 val (regNum, rx) = getReg reg 1761 in 1762 if reg = eax 1763 then (* Special instruction for testing accumulator. Can use an 8-bit test. *) 1764 opCodeBytes(TEST_ACC8, NONE) @ [bits] 1765 else if hostIsX64 1766 then 1767 let 1768 (* We can use a REX code to force it to always use the low order byte. *) 1769 val opb = opCodeBytes(Group3_a, 1770 if rx orelse regNum >= 0w4 then SOME{w=false, r=false, b=rx, x=false} else NONE) 1771 val mdrm = modrm (Register, 0w0 (* test *), regNum) 1772 in 1773 opb @ [mdrm, bits] 1774 end 1775 else if reg = ebx orelse reg = ecx orelse reg = edx (* can we use an 8-bit test? *) 1776 then (* Yes. The register value refers to low-order byte. 
*) 1777 let 1778 val opb = opCodeBytes(Group3_a, NONE) 1779 val mdrm = modrm(Register, 0w0 (* test *), regNum) 1780 in 1781 opb @ [mdrm, bits] 1782 end 1783 else 1784 let 1785 val opb = opCodeBytes(Group3_A32, NONE) 1786 val mdrm = modrm (Register, 0w0 (* test *), regNum) 1787 in 1788 opb @ mdrm :: word32Unsigned(Word8.toLarge bits) 1789 end 1790 end 1791 1792 | cgOp(TestByteBits{arg=MemoryArg{base, offset, index}, bits}) = 1793 (* Test the tag bit and set the condition code. *) 1794 opIndexedPlus2(Group3_a, LargeInt.fromInt offset, base, index, 0w0 (* test *)) @ [ bits] 1795 1796 | cgOp(TestByteBits _) = raise InternalError "cgOp: TestByteBits" 1797 1798 | cgOp(ConditionalBranch{ test=opc, ... }) = opCodeBytes(CondJump32 opc, NONE) @ word32Unsigned 0w0 1799 1800 | cgOp(SetCondition{ output, test}) = 1801 let 1802 val (rrC, rx) = getReg output 1803 (* In 64-bit mode we can specify the low-order byte of RSI/RDI but we 1804 must use a REX prefix. This isn't possible in 32-bit mode. *) 1805 in 1806 if hostIsX64 orelse rrC < 0w4 1807 then 1808 let 1809 val opb = opCodeBytes(SetCC test, 1810 if rx orelse rrC >= 0w4 then SOME{w=false, r=false, b=rx, x=false} else NONE) 1811 val mdrm = modrm (Register, 0w0, rrC) 1812 in 1813 opb @ [mdrm] 1814 end 1815 else raise InternalError "High byte register" 1816 end 1817 1818 | cgOp(CallRTS{rtsEntry, saveRegs}) = rtsCall(rtsEntry, saveRegs) 1819 1820 | cgOp(RepeatOperation repOp) = 1821 let 1822 (* We don't explicitly clear the direction flag. Should that be done? *) 1823 val opb = opCodeBytes(REP, NONE) 1824 (* Put in a rex prefix to force 64-bit mode. 
*) 1825 val optRex = 1826 if case repOp of STOS64 => true | MOVS64 => true | _ => false 1827 then [rex{w=true, r=false, b=false, x=false}] 1828 else [] 1829 val repOp = repOpsToWord repOp 1830 in 1831 opb @ optRex @ [repOp] 1832 end 1833 1834 | cgOp(DivideAccR{arg, isSigned, opSize}) = 1835 opRegPlus2(case opSize of OpSize64 => Group3_A64 | OpSize32 => Group3_A32, arg, if isSigned then 0w7 else 0w6) 1836 1837 | cgOp(DivideAccM{base, offset, isSigned, opSize}) = 1838 opPlus2(case opSize of OpSize64 => Group3_A64 | OpSize32 => Group3_A32, LargeInt.fromInt offset, base, if isSigned then 0w7 else 0w6) 1839 1840 | cgOp(AtomicXAdd{address={offset, base, index}, output, opSize}) = 1841 (* Locked exchange-and-add. We need the lock prefix before the REX prefix. *) 1842 opAddress(case opSize of OpSize64 => LOCK_XADD64 | OpSize32 => LOCK_XADD32, LargeInt.fromInt offset, base, index, output) 1843 1844 | cgOp(PushToStack(RegisterArg reg)) = 1845 let 1846 val (rc, rx) = getReg reg 1847 in 1848 (* Always 64-bit but a REX prefix may be needed for the register. *) 1849 opCodeBytes(PUSH_R rc, if rx then SOME{w=false, b = true, x=false, r = false } else NONE) 1850 end 1851 1852 | cgOp(PushToStack(MemoryArg{base, offset, index})) = 1853 opAddressPlus2(Group5, LargeInt.fromInt offset, base, index, 0w6 (* push *)) 1854 1855 | cgOp(PushToStack(NonAddressConstArg constnt)) = 1856 if is8BitL constnt 1857 then opCodeBytes(PUSH_8, NONE) @ [Word8.fromLargeInt constnt] 1858 else if is32bit constnt 1859 then opCodeBytes(PUSH_32, NONE) @ int32Signed constnt 1860 else (* It won't fit in the immediate; put it in the non-address area. *) 1861 let 1862 val opb = opCodeBytes(Group5, NONE) 1863 val mdrm = modrm(Based0, 0w6 (* push *), 0w5 (* PC rel *)) 1864 in 1865 opb @ [mdrm] @ int32Signed(tag 0) 1866 end 1867 1868 | cgOp(PushToStack(AddressConstArg _)) = 1869 ( 1870 case targetArch of 1871 Native64Bit => (* Put it in the constant area. 
*) 1872 let 1873 val opb = opCodeBytes(Group5, NONE) 1874 val mdrm = modrm(Based0, 0w6 (* push *), 0w5 (* PC rel *)); 1875 in 1876 opb @ [mdrm] @ int32Signed(tag 0) 1877 end 1878 | Native32Bit => opCodeBytes(PUSH_32, NONE) @ int32Signed(tag 0) 1879 | ObjectId32Bit => 1880 (* We can't do this. The constant area contains 32-bit quantities 1881 and 32-bit literals are sign-extended rather than zero-extended. *) 1882 raise InternalError "PushToStack:AddressConstArg" 1883 ) 1884 1885 | cgOp(PopR reg ) = 1886 let 1887 val (rc, rx) = getReg reg 1888 in 1889 (* Always 64-bit but a REX prefix may be needed for the register. 1890 Because the register is encoded in the instruction the rex bit for 1891 the register is b not r. *) 1892 opCodeBytes(POP_R rc, if rx then SOME{w=false, b = true, x=false, r = false } else NONE) 1893 end 1894 1895 | cgOp(Move{source=RegisterArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move64}) = 1896 opAddress(MOVL_R_A64, LargeInt.fromInt offset, base, index, toStore) 1897 1898 | cgOp(Move{source=RegisterArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move32}) = 1899 opAddress(MOVL_R_A32, LargeInt.fromInt offset, base, index, toStore) 1900 1901 | cgOp(Move{source=NonAddressConstArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move64 }) = 1902 ( 1903 (* Short constant. In 32-bit mode this is scanned as a possible address. That means 1904 we can't have an untagged constant in it. That's not a problem in 64-bit mode. 1905 There's a special check for using this to set the length word on newly allocated 1906 memory. 
*) 1907 targetArch <> Native32Bit orelse toStore = 0 orelse toStore mod 2 = 1 orelse offset = ~ (Word.toInt wordSize) 1908 orelse raise InternalError "cgOp: StoreConstToMemory not tagged"; 1909 opAddressPlus2(MOVL_32_A64, LargeInt.fromInt offset, base, index, 0w0) @ int32Signed toStore 1910 ) 1911 1912 | cgOp(Move{source=NonAddressConstArg toStore, destination=MemoryArg{offset, base, index}, moveSize=Move32 }) = 1913 ( 1914 (* Short constant. In 32-bit mode this is scanned as a possible address. That means 1915 we can't have an untagged constant in it. That's not a problem in 64-bit mode. 1916 There's a special check for using this to set the length word on newly allocated 1917 memory. *) 1918 targetArch <> Native32Bit orelse toStore = 0 orelse toStore mod 2 = 1 orelse offset = ~ (Word.toInt wordSize) 1919 orelse raise InternalError "cgOp: StoreConstToMemory not tagged"; 1920 opAddressPlus2(MOVL_32_A32, LargeInt.fromInt offset, base, index, 0w0) @ int32Signed toStore 1921 ) 1922 1923 | cgOp(Move{source=AddressConstArg _, destination=MemoryArg{offset, base, index}, moveSize=Move32}) = 1924 (* This is not used for addresses even in 32-in-64. We don't scan for addresses after MOVL_32_A. *) 1925 if targetArch <> Native32Bit 1926 then raise InternalError "StoreLongConstToMemory in 64-bit mode" 1927 else opAddressPlus2(MOVL_32_A32, LargeInt.fromInt offset, base, index, 0w0) @ int32Signed (tag 0) 1928 1929 | cgOp(Move{source=AddressConstArg _, destination=MemoryArg _, ...}) = 1930 raise InternalError "cgOp: Move - AddressConstArg => MemoryArg" 1931 1932 | cgOp(Move{ moveSize = Move8, source=RegisterArg toStore, destination=MemoryArg{offset, base, index} }) = 1933 let 1934 val (rrC, _) = getReg toStore 1935 (* In 64-bit mode we can specify the low-order byte of RSI/RDI but we 1936 must use a REX prefix. This isn't possible in 32-bit mode. 
*) 1937 val opcode = 1938 if hostIsX64 then MOVB_R_A64{forceRex= rrC >= 0w4} 1939 else if rrC < 0w4 then MOVB_R_A32 1940 else raise InternalError "High byte register" 1941 in 1942 opAddress(opcode, LargeInt.fromInt offset, base, index, toStore) 1943 end 1944 1945 | cgOp(Move{ moveSize = Move16, source=RegisterArg toStore, destination=MemoryArg{offset, base, index}}) = 1946 opAddress(MOVL_R_A16, LargeInt.fromInt offset, base, index, toStore) 1947 1948 | cgOp(Move{ moveSize = Move8, source=NonAddressConstArg toStore, destination=MemoryArg{offset, base, index}}) = 1949 opAddressPlus2(MOVB_8_A, LargeInt.fromInt offset, base, index, 0w0) @ 1950 [Word8.fromLargeInt toStore] 1951 1952 | cgOp(Move _) = raise InternalError "Move: Unimplemented arguments" 1953 1954 (* Allocation is dealt with by expanding the code. *) 1955 | cgOp(AllocStore _) = raise InternalError "cgOp: AllocStore" 1956 1957 | cgOp(AllocStoreVariable _) = raise InternalError "cgOp: AllocStoreVariable" 1958 1959 | cgOp StoreInitialised = raise InternalError "cgOp: StoreInitialised" 1960 1961 | cgOp(CallAddress(NonAddressConstArg _)) = (* Call to the start of the code. Offset is patched in later. *) 1962 opCodeBytes (CALL_32, NONE) @ int32Signed 0 1963 1964 | cgOp(CallAddress(AddressConstArg _)) = 1965 if targetArch = Native64Bit 1966 then 1967 let 1968 val opc = opCodeBytes(Group5, NONE) 1969 val mdrm = modrm(Based0, 0w2 (* call *), 0w5 (* PC rel *)) 1970 in 1971 opc @ [mdrm] @ int32Signed(tag 0) 1972 end 1973 (* Because this is a relative branch we need to point this at itself. 1974 Until it is set to the relative offset of the destination it 1975 needs to contain an address within the code and this could 1976 be the last instruction. 
*) 1977 else opCodeBytes (CALL_32, NONE) @ int32Signed ~5 1978 1979 | cgOp(CallAddress(RegisterArg reg)) = opRegPlus2(Group5, reg, 0w2 (* call *)) 1980 1981 | cgOp(CallAddress(MemoryArg{base, offset, index})) = 1982 opAddressPlus2(Group5, LargeInt.fromInt offset, base, index, 0w2 (* call *)) 1983 1984 | cgOp(JumpAddress(NonAddressConstArg _)) = 1985 (* Jump to the start of the current function. Offset is patched in later. *) 1986 opCodeBytes (JMP_32, NONE) @ int32Signed 0 1987 1988 | cgOp(JumpAddress (AddressConstArg _)) = 1989 if targetArch = Native64Bit 1990 then 1991 let 1992 val opb = opCodeBytes (Group5, NONE) 1993 val mdrm = modrm(Based0, 0w4 (* jmp *), 0w5 (* PC rel *)) 1994 in 1995 opb @ [mdrm] @ int32Signed(tag 0) 1996 end 1997 else opCodeBytes (JMP_32, NONE) @ int32Signed ~5 (* As with Call. *) 1998 1999 | cgOp(JumpAddress (RegisterArg reg)) = 2000 (* Used as part of indexed case - not for entering a function. *) 2001 opRegPlus2(Group5, reg, 0w4 (* jmp *)) 2002 2003 | cgOp(JumpAddress(MemoryArg{base, offset, index})) = 2004 opAddressPlus2(Group5, LargeInt.fromInt offset, base, index, 0w4 (* jmp *)) 2005 2006 | cgOp(ReturnFromFunction args) = 2007 if args = 0 2008 then opCodeBytes(RET, NONE) 2009 else 2010 let 2011 val offset = Word.fromInt args * nativeWordSize 2012 in 2013 opCodeBytes(RET_16, NONE) @ [wordToWord8 offset, wordToWord8(offset >> 0w8)] 2014 end 2015 2016 | cgOp (RaiseException { workReg }) = 2017 opEA(if hostIsX64 then MOVL_A_R64 else MOVL_A_R32, LargeInt.fromInt memRegHandlerRegister, ebp, workReg) @ 2018 opAddressPlus2(Group5, 0, workReg, NoIndex, 0w4 (* jmp *)) 2019 2020 | cgOp(UncondBranch _) = opToInt JMP_32 :: word32Unsigned 0w0 2021 2022 | cgOp(ResetStack{numWords, preserveCC}) = 2023 let 2024 val bytes = Word.toLargeInt(Word.fromInt numWords * nativeWordSize) 2025 in 2026 (* If we don't need to preserve the CC across the reset we use ADD since 2027 it's shorter. 
*) 2028 if preserveCC 2029 then opEA(if hostIsX64 then LEAL64 else LEAL32, bytes, esp, esp) 2030 else immediateOperand(ADD, esp, bytes, if hostIsX64 then OpSize64 else OpSize32) 2031 end 2032 2033 | cgOp(JumpLabel _) = [] (* No code. *) 2034 2035 | cgOp(LoadLabelAddress{ output, ... }) = 2036 (* Load the address of a label. Used when setting up an exception handler or 2037 in indexed cases. *) 2038 (* On X86/64 we can use pc-relative addressing to set the start of the handler. 2039 On X86/32 we have to load the address of the start of the code and add an offset. *) 2040 if hostIsX64 2041 then opConstantOperand(LEAL64, output) 2042 else 2043 let 2044 val (rc, _) = getReg output 2045 in 2046 opCodeBytes(MOVL_32_R rc , NONE) @ int32Signed(tag 0) @ 2047 opRegPlus2(Group1_32_A32, output, arithOpToWord ADD) @ int32Signed 0 2048 end 2049 2050 | cgOp (FPLoadFromMemory {address={ base, offset, index }, precision}) = 2051 let 2052 val loadInstr = 2053 case precision of 2054 DoublePrecision => FPESC 0w5 2055 | SinglePrecision => FPESC 0w1 2056 in 2057 opAddressPlus2(loadInstr, LargeInt.fromInt offset, base, index, 0wx0) 2058 end 2059 2060 | cgOp (FPLoadFromFPReg{source=FloatingPtReg fp, ...}) = 2061 (* Assume there's nothing currently on the stack. *) 2062 floatingPtOp({escape=0w1, md=0w3, nnn=0w0, rm= fp + 0w0}) (* FLD ST(r1) *) 2063 2064 | cgOp (FPLoadFromConst {precision, ...} ) = 2065 (* The real constant here is actually the address of a memory 2066 object. FLD takes the address as the argument and in 32-bit mode 2067 we use an absolute address. In 64-bit mode we need to put the 2068 constant at the end of the code segment and use PC-relative 2069 addressing which happens to be encoded in the same way. 2070 There are special cases for zero and one but it's probably too 2071 much work to detect them. 
*) 2072 let 2073 val esc = case precision of SinglePrecision => 0w1 | DoublePrecision => 0w5 2074 val opb = opCodeBytes(FPESC esc, NONE) (* FLD [Constant] *) 2075 val mdrm = modrm (Based0, 0w0, 0w5 (* constant address/PC-relative *)) 2076 in 2077 opb @ [mdrm] @ int32Signed(tag 0) 2078 end 2079 2080 | cgOp (FPStoreToFPReg{ output=FloatingPtReg dest, andPop }) = 2081 (* Assume there's one item on the stack. *) 2082 floatingPtOp({escape=0w5, md=0w3, nnn=if andPop then 0wx3 else 0wx2, 2083 rm = dest+0w1(* One item *)}) (* FSTP ST(n+1) *) 2084 2085 | cgOp (FPStoreToMemory{address={ base, offset, index}, precision, andPop }) = 2086 let 2087 val storeInstr = 2088 case precision of 2089 DoublePrecision => FPESC 0w5 2090 | SinglePrecision => FPESC 0w1 2091 val subInstr = if andPop then 0wx3 else 0wx2 2092 in 2093 opAddressPlus2(storeInstr, LargeInt.fromInt offset, base, index, subInstr) 2094 end 2095 2096 | cgOp (FPArithR{ opc, source = FloatingPtReg src}) = 2097 floatingPtOp({escape=0w0, md=0w3, nnn=fpOpToWord opc, 2098 rm=src + 0w1 (* One item already there *)}) 2099 2100 | cgOp (FPArithConst{ opc, precision, ... 
}) = 2101 (* See comment on FPLoadFromConst *) 2102 let 2103 val fpesc = case precision of DoublePrecision => 0w4 | SinglePrecision => 0w0 2104 val opb = opCodeBytes(FPESC fpesc, NONE) (* FADD etc [constnt] *) 2105 val mdrm = modrm (Based0, fpOpToWord opc, 0w5 (* constant address *)) 2106 in 2107 opb @ [mdrm] @ int32Signed(tag 0) 2108 end 2109 2110 | cgOp (FPArithMemory{ opc, base, offset, precision }) = 2111 let 2112 val fpesc = case precision of DoublePrecision => 0w4 | SinglePrecision => 0w0 2113 in 2114 opPlus2(FPESC fpesc, LargeInt.fromInt offset, base, fpOpToWord opc) (* FADD/FMUL etc [r2] *) 2115 end 2116 2117 | cgOp (FPUnary opc ) = 2118 let 2119 val {rm, nnn} = fpUnaryToWords opc 2120 in 2121 floatingPtOp({escape=0w1, md=0w3, nnn=nnn, rm=rm}) (* FCHS etc *) 2122 end 2123 2124 | cgOp (FPStatusToEAX ) = 2125 opCodeBytes(FPESC 0w7, NONE) @ [0wxe0] (* FNSTSW AX *) 2126 2127 | cgOp (FPFree(FloatingPtReg reg)) = 2128 floatingPtOp({escape=0w5, md=0w3, nnn=0w0, rm=reg}) (* FFREE FP(n) *) 2129 2130 | cgOp (FPLoadInt{base, offset, opSize=OpSize64}) = 2131 (* fildl (esp) in 32-bit mode or fildq (esp) in 64-bit mode. *) 2132 opPlus2(FPESC 0w7, LargeInt.fromInt offset, base, 0w5) 2133 2134 | cgOp (FPLoadInt{base, offset, opSize=OpSize32}) = 2135 (* fildl (esp) in 32-bit mode or fildq (esp) in 64-bit mode. *) 2136 opPlus2(FPESC 0w3, LargeInt.fromInt offset, base, 0w0) 2137 2138 | cgOp (MultiplyR {source=RegisterArg srcReg, output, opSize}) = 2139 (* We use the 0F AF form of IMUL rather than the Group3 MUL or IMUL 2140 because the former allows us to specify the destination register. 2141 The Group3 forms produce double length results in RAX:RDX/EAX:EDX 2142 but we only ever want the low-order half. *) 2143 opReg(case opSize of OpSize64 => IMUL64 | OpSize32 => IMUL32 (* 2 byte opcode *), output, srcReg) 2144 2145 | cgOp (MultiplyR {source=MemoryArg{base, offset, index}, output, opSize}) = 2146 (* This may be used for large-word multiplication. 
*) 2147 opAddress(case opSize of OpSize64 => IMUL64 | OpSize32 => IMUL32 (* 2 byte opcode *), LargeInt.fromInt offset, base, index, output) 2148 2149 | cgOp(MultiplyR {source=NonAddressConstArg constnt, output, opSize}) = 2150 (* If the constant is an 8-bit or 32-bit value we are actually using a 2151 three-operand instruction where the argument can be a register or memory 2152 and the destination register does not need to be the same as the source. *) 2153 if is8BitL constnt 2154 then opReg(case opSize of OpSize64 => IMUL_C8_64 | OpSize32 => IMUL_C8_32, output, output) @ [Word8.fromLargeInt constnt] 2155 else if is32bit constnt 2156 then opReg(case opSize of OpSize64 => IMUL_C32_64 | OpSize32 => IMUL_C32_32, output, output) @ int32Signed constnt 2157 else opConstantOperand(case opSize of OpSize64 => IMUL64 | OpSize32 => IMUL32, output) 2158 2159 | cgOp(MultiplyR {source=AddressConstArg _, ...}) = 2160 raise InternalError "Multiply - address constant" 2161 2162 | cgOp (XMMArith { opc, source=MemoryArg{base, offset, index}, output }) = 2163 mMXAddress(SSE2Ops opc, LargeInt.fromInt offset, base, index, output) 2164 2165 | cgOp (XMMArith { opc, source=AddressConstArg _, output=SSE2Reg rrC }) = 2166 let 2167 (* The real constant here is actually the address of an 8-byte memory 2168 object. In 32-bit mode we put this address into the code and retain 2169 this memory object. In 64-bit mode we copy the real value out of the 2170 memory object into the non-address constant area and use 2171 PC-relative addressing. These happen to be encoded the same 2172 way. 
*) 2173 val opb = opCodeBytes(SSE2Ops opc, NONE) 2174 val mdrm = modrm (Based0, rrC, 0w5 (* constant address/PC-relative *)) 2175 in 2176 opb @ [mdrm] @ int32Signed(tag 0) 2177 end 2178 2179 | cgOp (XMMArith { opc, source=RegisterArg(SSE2Reg rrS), output=SSE2Reg rrC }) = 2180 let 2181 val oper = SSE2Ops opc 2182 val pref = opcodePrefix oper 2183 val esc = escapePrefix oper 2184 val opc = opToInt oper 2185 val mdrm = modrm(Register, rrC, rrS) 2186 in 2187 pref @ esc @ [opc, mdrm] 2188 end 2189 2190 | cgOp (XMMArith { opc, source=NonAddressConstArg _, output=SSE2Reg rrC }) = 2191 let 2192 val _ = hostIsX64 orelse raise InternalError "XMMArith-NonAddressConstArg in 32-bit mode" 2193 (* This is currently used for 32-bit float arguments but can equally be 2194 used for 64-bit values since the actual argument will always be put 2195 in the 64-bit constant area. *) 2196 val opb = opCodeBytes(SSE2Ops opc, NONE) 2197 val mdrm = modrm (Based0, rrC, 0w5 (* constant address/PC-relative *)) 2198 in 2199 opb @ [mdrm] @ int32Signed(tag 0) 2200 end 2201 2202 | cgOp (XMMStoreToMemory { toStore, address={base, offset, index}, precision }) = 2203 let 2204 val oper = 2205 case precision of 2206 DoublePrecision => SSE2StoreDouble 2207 | SinglePrecision => SSE2StoreSingle 2208 in 2209 mMXAddress(oper, LargeInt.fromInt offset, base, index, toStore) 2210 end 2211 2212 | cgOp (XMMConvertFromInt { source, output=SSE2Reg rrC, opSize }) = 2213 let 2214 (* The source is a general register and the output a XMM register. *) 2215 (* TODO: The source can be a memory location. *) 2216 val (rbC, rbX) = getReg source 2217 val oper = case opSize of OpSize64 => CVTSI2SD64 | OpSize32 => CVTSI2SD32 2218 in 2219 (* This is a special case with both an XMM and general register. 
*) 2220 opcodePrefix oper @ rexByte(oper, false, rbX, false) @ 2221 escapePrefix oper @ [opToInt oper, modrm(Register, rrC, rbC)] 2222 end 2223 2224 | cgOp (SignExtendForDivide OpSize64) = 2225 opCodeBytes(CQO_CDQ64, SOME {w=true, r=false, b=false, x=false}) 2226 2227 | cgOp (SignExtendForDivide OpSize32) = 2228 opCodeBytes(CQO_CDQ32, NONE) 2229 2230 | cgOp (XChng { reg, arg=RegisterArg regY, opSize }) = 2231 opReg(case opSize of OpSize64 => XCHNG64 | OpSize32 => XCHNG32, reg, regY) 2232 2233 | cgOp (XChng { reg, arg=MemoryArg{offset, base, index}, opSize }) = 2234 opAddress(case opSize of OpSize64 => XCHNG64 | OpSize32 => XCHNG32, LargeInt.fromInt offset, base, index, reg) 2235 2236 | cgOp (XChng _) = raise InternalError "cgOp: XChng" 2237 2238 | cgOp (Negative {output, opSize}) = 2239 opRegPlus2(case opSize of OpSize64 => Group3_A64 | OpSize32 => Group3_A32, output, 0w3 (* neg *)) 2240 2241 | cgOp (JumpTable{cases, jumpSize=ref jumpSize}) = 2242 let 2243 val _ = jumpSize = JumpSize8 orelse raise InternalError "cgOp: JumpTable" 2244 (* Make one jump for each case and pad it 8 bytes with Nops. *) 2245 fun makeJump (_, l) = opToInt JMP_32 :: word32Unsigned 0w0 @ [opToInt NOP, opToInt NOP, opToInt NOP] @ l 2246 in 2247 List.foldl makeJump [] cases 2248 end 2249 2250 | cgOp(IndexedJumpCalc{ addrReg, indexReg, jumpSize=ref jumpSize }) = 2251 ( 2252 jumpSize = JumpSize8 orelse raise InternalError "cgOp: IndexedJumpCalc"; 2253 (* Should currently be JumpSize8 which requires a multiplier of 4 and 2254 4 to be subtracted to remove the shifted tag. *) 2255 opAddress(if hostIsX64 then LEAL64 else LEAL32, ~4, addrReg, Index4 indexReg, addrReg) 2256 ) 2257 2258 | cgOp(MoveXMMRegToGenReg { source=SSE2Reg rrC, output }) = 2259 let 2260 (* The source is a XMM register and the output a general register. *) 2261 val (rbC, rbX) = getReg output 2262 val oper = MOVDFromXMM 2263 in 2264 (* This is a special case with both an XMM and general register. 
*) 2265 opcodePrefix oper @ rexByte(oper, false, rbX, false) @ 2266 escapePrefix oper @ [opToInt oper, modrm(Register, rrC, rbC)] 2267 end 2268 2269 | cgOp(MoveGenRegToXMMReg { source, output=SSE2Reg rrC }) = 2270 let 2271 (* The source is a general register and the output a XMM register. *) 2272 val (rbC, rbX) = getReg source 2273 val oper = MOVQToXMM 2274 in 2275 (* This is a special case with both an XMM and general register. *) 2276 (* This needs to move the whole 64-bit value. TODO: This is inconsistent 2277 with MoveXMMRegToGenReg *) 2278 opcodePrefix oper @ rexByte(oper, false, rbX, false) @ 2279 escapePrefix oper @ [opToInt oper, modrm(Register, rrC, rbC)] 2280 end 2281 2282 | cgOp(XMMShiftRight { output=SSE2Reg rrC, shift }) = 2283 let 2284 val oper = PSRLDQ 2285 in 2286 opcodePrefix oper @ escapePrefix oper @ [opToInt oper, modrm(Register, 0w3, rrC), shift] 2287 end 2288 2289 | cgOp(FPLoadCtrlWord {base, offset, index}) = 2290 opIndexedPlus2(FPESC 0w1, LargeInt.fromInt offset, base, index, 0w5) 2291 2292 | cgOp(FPStoreCtrlWord {base, offset, index}) = 2293 opIndexedPlus2(FPESC 0w1, LargeInt.fromInt offset, base, index, 0w7) 2294 2295 | cgOp(XMMLoadCSR {base, offset, index}) = 2296 opIndexedPlus2(LDSTMXCSR, LargeInt.fromInt offset, base, index, 0w2) 2297 2298 | cgOp(XMMStoreCSR {base, offset, index}) = 2299 opIndexedPlus2(LDSTMXCSR, LargeInt.fromInt offset, base, index, 0w3) 2300 2301 | cgOp(FPStoreInt {base, offset, index}) = 2302 (* fistp dword ptr [esp] in 32-bit mode or fistp qword ptr [rsp] in 64-bit mode. *) 2303 if hostIsX64 2304 then opIndexedPlus2(FPESC 0w7, LargeInt.fromInt offset, base, index, 0w7) 2305 else opIndexedPlus2(FPESC 0w3, LargeInt.fromInt offset, base, index, 0w3) 2306 2307 | cgOp(XMMStoreInt {source, output, precision, isTruncate}) = 2308 let 2309 (* The destination is a general register. The source is an XMM register or memory. 
*) 2310 val (rbC, rbX) = getReg output 2311 val oper = 2312 case (hostIsX64, precision, isTruncate) of 2313 (false, DoublePrecision, false) => CVTSD2SI32 2314 | (true, DoublePrecision, false) => CVTSD2SI64 2315 | (false, SinglePrecision, false) => CVTSS2SI32 2316 | (true, SinglePrecision, false) => CVTSS2SI64 2317 | (false, DoublePrecision, true) => CVTTSD2SI32 2318 | (true, DoublePrecision, true) => CVTTSD2SI64 2319 | (false, SinglePrecision, true) => CVTTSS2SI32 2320 | (true, SinglePrecision, true) => CVTTSS2SI64 2321 in 2322 case source of 2323 MemoryArg{base, offset, index} => 2324 opAddress(oper, LargeInt.fromInt offset, base, index, output) 2325 | RegisterArg(SSE2Reg rrS) => 2326 opcodePrefix oper @ rexByte(oper, rbX, false, false) @ 2327 escapePrefix oper @ [opToInt oper, modrm(Register, rbC, rrS)] 2328 | _ => raise InternalError "XMMStoreInt: Not register or memory" 2329 end 2330 2331 | cgOp(CondMove { test, output, source=RegisterArg source, opSize=OpSize32 }) = 2332 opReg(CMOV32 test, output, source) 2333 2334 | cgOp(CondMove { test, output, source=RegisterArg source, opSize=OpSize64 }) = 2335 opReg(CMOV64 test, output, source) 2336 2337 | cgOp(CondMove { test, output, source=NonAddressConstArg _, opSize }) = 2338 ( 2339 (* We currently support only native-64 bit and put the constant in the 2340 non-address constant area. These are 64-bit values both in native 2341 64-bit and in 32-in-64. To support it in 32-bit mode we'd have to 2342 put the constant in a single-word object and put its absolute 2343 address into the code. *) 2344 targetArch <> Native32Bit orelse 2345 raise InternalError "CondMove: constant in 32-bit mode"; 2346 opConstantOperand((case opSize of OpSize32 => CMOV32 | OpSize64 => CMOV64) test, output) 2347 ) 2348 2349 | cgOp(CondMove { test, output, source=AddressConstArg _, opSize=OpSize64 }) = 2350 (* An address constant. The opSize must match the size of a polyWord since 2351 the value it going into the constant area. 
*) 2352 ( 2353 targetArch = Native64Bit orelse raise InternalError "CondMove: AddressConstArg"; 2354 opConstantOperand(CMOV64 test, output) 2355 ) 2356 2357 | cgOp(CondMove { test, output, source=AddressConstArg _, opSize=OpSize32 }) = 2358 ( 2359 (* We only support address constants in 32-in-64. *) 2360 targetArch = ObjectId32Bit orelse raise InternalError "CondMove: AddressConstArg"; 2361 opConstantOperand(CMOV32 test, output) 2362 ) 2363 2364 | cgOp(CondMove { test, output, source=MemoryArg{base, offset, index}, opSize=OpSize32 }) = 2365 opAddress(CMOV32 test, LargeInt.fromInt offset, base, index, output) 2366 2367 | cgOp(CondMove { test, output, source=MemoryArg{base, offset, index}, opSize=OpSize64 }) = 2368 opAddress(CMOV64 test, LargeInt.fromInt offset, base, index, output) 2369 2370 in 2371 List.rev(List.foldl (fn (c, list) => Word8Vector.fromList(cgOp c) :: list) [] ops) 2372 end 2373 2374 (* General function to process the code. ic is the byte counter within the original code. *) 2375 fun foldCode foldFn n (ops, byteList) = 2376 let 2377 fun doFold(oper :: operList, bytes :: byteList, ic, acc) = 2378 doFold(operList, byteList, ic + Word.fromInt(Word8Vector.length bytes), 2379 foldFn(oper, bytes, ic, acc)) 2380 | doFold(_, _, _, n) = n 2381 in 2382 doFold(ops, byteList, 0w0, n) 2383 end 2384 2385 (* Go through the code and update branch and similar instructions with the destinations 2386 of the branches. Long branches are converted to short where possible and the code 2387 is reprocessed. That might repeat if the effect of shorting one branch allows 2388 another to be shortened. *) 2389 fun fixupLabels(ops, bytesList, labelCount) = 2390 let 2391 (* Label array - initialise to 0wxff... . Every label should be defined 2392 but just in case, this is more likely to be detected in int32Signed. *) 2393 val labelArray = Array.array(labelCount, ~ 0w1) 2394 2395 (* First pass - Set the addresses of labels. 
*) 2396 fun setLabelAddresses(oper :: operList, bytes :: byteList, ic) = 2397 ( 2398 case oper of 2399 JumpLabel(Label{labelNo, ...}) => Array.update(labelArray, labelNo, ic) 2400 | _ => (); 2401 setLabelAddresses(operList, byteList, ic + Word.fromInt(Word8Vector.length bytes)) 2402 ) 2403 | setLabelAddresses(_, _, ic) = ic (* Return the length of the code. *) 2404 2405 fun fixup32(destination, bytes, ic) = 2406 let 2407 val brLength = Word8Vector.length bytes 2408 (* The offset is relative to the end of the branch instruction. *) 2409 val diff = Word.toInt destination - Word.toInt ic - brLength 2410 in 2411 Word8VectorSlice.concat[ 2412 Word8VectorSlice.slice(bytes, 0, SOME(brLength-4)), (* The original opcode. *) 2413 Word8VectorSlice.full(Word8Vector.fromList(int32Signed(LargeInt.fromInt diff))) 2414 ] 2415 end 2416 2417 fun fixupAddress(UncondBranch(Label{labelNo, ...}), bytes, ic, list) = 2418 let 2419 val destination = Array.sub(labelArray, labelNo) 2420 val brLength = Word8Vector.length bytes 2421 (* The offset is relative to the end of the branch instruction. *) 2422 val diff = Word.toInt destination - Word.toInt ic - brLength 2423 in 2424 if brLength = 2 2425 then (* It's a short branch. Take the original operand and set the relative offset. *) 2426 Word8Vector.fromList [opToInt JMP_8, byteSigned diff] :: list 2427 else if brLength <> 5 2428 then raise InternalError "fixupAddress" 2429 else (* 32-bit offset. If it will fit in a byte we can use a short branch. 2430 If this is a reverse branch we can actually use values up to -131 2431 here because we've calculated using the end of the long branch. 
*) 2432 if diff <= 127 andalso diff >= ~(128 + 3) 2433 then Word8Vector.fromList [opToInt JMP_8, 0w0 (* Fixed on next pass *)] :: list 2434 else Word8Vector.fromList(opToInt JMP_32 :: int32Signed(LargeInt.fromInt diff)) :: list 2435 end 2436 2437 | fixupAddress(ConditionalBranch{label=Label{labelNo, ...}, test, ...}, bytes, ic, list) = 2438 let 2439 val destination = Array.sub(labelArray, labelNo) 2440 val brLength = Word8Vector.length bytes 2441 (* The offset is relative to the end of the branch instruction. *) 2442 val diff = Word.toInt destination - Word.toInt ic - brLength 2443 in 2444 if brLength = 2 2445 then (* It's a short branch. Take the original operand and set the relative offset. *) 2446 Word8Vector.fromList [opToInt(CondJump test), byteSigned diff] :: list 2447 else if brLength <> 6 2448 then raise InternalError "fixupAddress" 2449 else if diff <= 127 andalso diff >= ~(128+4) 2450 then Word8Vector.fromList[opToInt(CondJump test), 0w0 (* Fixed on next pass *)] :: list 2451 else Word8Vector.fromList(opCodeBytes(CondJump32 test, NONE) @ int32Signed(LargeInt.fromInt diff)) :: list 2452 end 2453 2454 | fixupAddress(LoadLabelAddress{ label=Label{labelNo, ...}, ... }, brCode, ic, list) = 2455 let 2456 val destination = Array.sub(labelArray, labelNo) 2457 in 2458 if hostIsX64 2459 then (* This is a relative offset on the X86/64. *) 2460 fixup32(destination, brCode, ic) :: list 2461 else (* On X86/32 the address is relative to the start of the code so we simply put in 2462 the destination address. *) 2463 Word8VectorSlice.concat[ 2464 Word8VectorSlice.slice(brCode, 0, SOME(Word8Vector.length brCode-4)), 2465 Word8VectorSlice.full(Word8Vector.fromList(int32Signed(Word.toLargeInt destination)))] :: list 2466 end 2467 2468 | fixupAddress(JumpTable{cases, jumpSize as ref JumpSize8}, brCode: Word8Vector.vector, ic, list) = 2469 let 2470 (* Each branch is a 32-bit jump padded up to 8 bytes. 
*) 2471 fun processCase(Label{labelNo, ...} :: cases, offset, ic) = 2472 fixup32(Array.sub(labelArray, labelNo), 2473 Word8VectorSlice.vector(Word8VectorSlice.slice(brCode, offset, SOME 5)), ic) :: 2474 Word8VectorSlice.vector(Word8VectorSlice.slice(brCode, offset+5, SOME 3)) :: 2475 processCase(cases, offset+8, ic+0w8) 2476 | processCase _ = [] 2477 (* Could we use short branches? If all of the branches were short the 2478 table would be smaller so the offsets we use would be less. 2479 Ignore backwards branches - could only occur if we have linked labels 2480 in a loop. *) 2481 val newStartOfCode = ic + Word.fromInt(List.length cases * 6) 2482 fun tryShort(Label{labelNo, ...} :: cases, ic) = 2483 let 2484 val destination = Array.sub(labelArray, labelNo) 2485 in 2486 if destination > ic + 0w2 andalso destination - ic - 0w2 < 0w127 2487 then tryShort(cases, ic+0w2) 2488 else false 2489 end 2490 | tryShort _ = true 2491 2492 val newCases = 2493 if tryShort(cases, newStartOfCode) 2494 then 2495 ( 2496 jumpSize := JumpSize2; 2497 (* Generate a short branch table. *) 2498 List.map(fn _ => Word8Vector.fromList [opToInt JMP_8, 0w0 (* Fixed on next pass *)]) cases 2499 ) 2500 else processCase(cases, 0, ic) 2501 in 2502 Word8Vector.concat newCases :: list 2503 end 2504 2505 | fixupAddress(JumpTable{cases, jumpSize=ref JumpSize2}, _, ic, list) = 2506 let 2507 (* Each branch is a short jump. *) 2508 fun processCase(Label{labelNo, ...} :: cases, offset, ic) = 2509 let 2510 val destination = Array.sub(labelArray, labelNo) 2511 val brLength = 2 2512 val diff = Word.toInt destination - Word.toInt ic - brLength 2513 in 2514 Word8Vector.fromList[opToInt JMP_8, byteSigned diff] :: processCase(cases, offset+2, ic+0w2) 2515 end 2516 | processCase _ = [] 2517 in 2518 Word8Vector.concat(processCase(cases, 0, ic)) :: list 2519 end 2520 2521 (* If we've shortened a jump table we have to change the indexing. 
*) 2522 | fixupAddress(IndexedJumpCalc{ addrReg, indexReg, jumpSize=ref JumpSize2 }, _, _, list) = 2523 (* On x86/32 it might be shorter to use DEC addrReg; ADD addrReg, indexReg. *) 2524 Word8Vector.fromList(opAddress(if hostIsX64 then LEAL64 else LEAL32, ~1, addrReg, Index1 indexReg, addrReg)) :: list 2525 2526 | fixupAddress(CallAddress(NonAddressConstArg _), brCode, ic, list) = 2527 let 2528 val brLen = Word8Vector.length brCode 2529 in 2530 (* Call to the start of the code. Offset is -(bytes to start). *) 2531 Word8VectorSlice.concat[ 2532 Word8VectorSlice.slice(brCode, 0, SOME(brLen-4)), (* The original opcode. *) 2533 Word8VectorSlice.full(Word8Vector.fromList(int32Signed(LargeInt.fromInt(~(Word.toInt ic+brLen))))) 2534 ] :: list 2535 end 2536 2537 | fixupAddress(JumpAddress(NonAddressConstArg _), brCode, ic, list) = 2538 let 2539 val brLen = Word8Vector.length brCode 2540 in 2541 (* Call to the start of the code. Offset is -(bytes to start). *) 2542 Word8VectorSlice.concat[ 2543 Word8VectorSlice.slice(brCode, 0, SOME(brLen-4)), (* The original opcode. *) 2544 Word8VectorSlice.full(Word8Vector.fromList(int32Signed(LargeInt.fromInt(~(Word.toInt ic+brLen))))) 2545 ] :: list 2546 end 2547 2548 | fixupAddress(_, bytes, _, list) = bytes :: list 2549 2550 fun reprocess(bytesList, lastCodeSize) = 2551 let 2552 val fixedList = List.rev(foldCode fixupAddress [] (ops, bytesList)) 2553 val newCodeSize = setLabelAddresses(ops, fixedList, 0w0) 2554 in 2555 if newCodeSize = lastCodeSize 2556 then (fixedList, lastCodeSize) 2557 else if newCodeSize > lastCodeSize 2558 then raise InternalError "reprocess - size increased" 2559 else reprocess(fixedList, newCodeSize) 2560 end 2561 in 2562 reprocess(bytesList, setLabelAddresses(ops, bytesList, 0w0)) 2563 end 2564 2565 (* The handling of constants generally differs between 32- and 64-bits. In 32-bits we put all constants 2566 inline and the GC processes the code to find the addresss. 
For real values the "constant" is actually 2567 the address of the boxed real value. 2568 In 64-bit mode inline constants were used with the MOV instruction but this has now been removed. 2569 All constants are stored in one of two areas at the end of the 2570 code segment. Non-addresses, including the actual values of reals, are stored in the non-address area 2571 and addresses go in the address area. Only the latter is scanned by the GC. 2572 The address area is also used in 32-bit mode but only has the address of the function name and the 2573 address of the profile ref in it. *) 2574 datatype inline32constants = 2575 SelfAddress (* The address of the start of the code - inline absolute address 32-bit only *) 2576 | InlineAbsoluteAddress of machineWord (* An address in the code: 32-bit only *) 2577 | InlineRelativeAddress of machineWord (* A relative address: 32-bit only. *) 2578 2579 local 2580 (* Turn an integer constant into an 8-byte vector. *) 2581 fun intConst ival = LargeWord.fromLargeInt ival 2582 2583 (* Copy a real constant from memory into an 8-byte vector. *) 2584 fun realConst c = 2585 let 2586 val cAsAddr = toAddress c 2587 (* This may be a boxed real or, in 32-in-64 mode, a boxed float. *) 2588 val cLength = length cAsAddr * wordSize 2589 val _ = ((cLength = 0w8 orelse cLength = 0w4) andalso flags cAsAddr = F_bytes) orelse 2590 raise InternalError "realConst: Not a real number" 2591 fun getBytes(i, a) = 2592 if i = 0w0 then a 2593 else getBytes(i-0w1, a*0w256 + Word8.toLargeWord(loadByte(cAsAddr, i-0w1))) 2594 in 2595 getBytes(cLength, 0w0) 2596 end 2597 2598 fun getConstant(Move{ source=NonAddressConstArg source, moveSize=Move32, ...}, bytes, ic, (inl, addr, na)) = 2599 if targetArch <> Native32Bit 2600 then 2601 ( 2602 if source >= ~0x80000000 andalso source < 0x100000000 2603 then (* Signed or unsigned 32-bits. *) (inl, addr, na) 2604 else (* Too big for 32-bits. 
*) 2605 (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na) 2606 ) 2607 else (inl, addr, na) (* 32-bit mode. The constant will always be inline even if we've had to use LEA r,c *) 2608 2609 | getConstant(Move{ source=NonAddressConstArg source, moveSize=Move64, ...}, bytes, ic, (inl, addr, na)) = 2610 if targetArch <> Native32Bit 2611 then 2612 ( 2613 if source >= ~0x80000000 andalso source < 0x100000000 2614 then (* Signed or unsigned 32-bits. *) (inl, addr, na) 2615 else (* Too big for 32-bits. *) 2616 (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na) 2617 ) 2618 else (inl, addr, na) (* 32-bit mode. The constant will always be inline even if we've had to use XOR r,r; ADD r,c *) 2619 2620 | getConstant(Move{ source=AddressConstArg source, ... }, bytes, ic, (inl, addr, na)) = 2621 if targetArch <> Native32Bit 2622 then (* Address constants go in the constant area. *) 2623 (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, source) :: addr, na) 2624 else ((ic + Word.fromInt(Word8Vector.length bytes) - wordSize, InlineAbsoluteAddress source) :: inl, addr, na) 2625 2626 | getConstant(LoadAbsolute{value, ...}, bytes, ic, (inl, addr, na)) = 2627 if targetArch = Native64Bit 2628 then (* Address constants go in the constant area. *) 2629 (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, value) :: addr, na) 2630 (* This is the only case of an inline constant in 32-in-64 *) 2631 else ((ic + Word.fromInt(Word8Vector.length bytes) - nativeWordSize, InlineAbsoluteAddress value) :: inl, addr, na) 2632 2633 | getConstant(ArithToGenReg{ source=NonAddressConstArg source, ... }, bytes, ic, (inl, addr, na)) = 2634 if is32bit source 2635 then (inl, addr, na) 2636 else (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na) 2637 2638 | getConstant(ArithToGenReg{ source=AddressConstArg source, ... 
}, bytes, ic, (inl, addr, na)) = 2639 if hostIsX64 2640 then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, source) :: addr, na) 2641 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress source) :: inl, addr, na) 2642 2643 | getConstant(ArithMemLongConst{ source, ... }, bytes, ic, (inl, addr, na)) = (* 32-bit only. *) 2644 ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress source) :: inl, addr, na) 2645 2646 | getConstant(PushToStack(NonAddressConstArg constnt), bytes, ic, (inl, addr, na)) = 2647 if is32bit constnt then (inl, addr, na) 2648 else (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst constnt) :: na) 2649 2650 | getConstant(PushToStack(AddressConstArg constnt), bytes, ic, (inl, addr, na)) = 2651 if targetArch = Native64Bit 2652 then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, constnt) :: addr, na) 2653 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress constnt) :: inl, addr, na) 2654 2655 | getConstant(CallAddress(AddressConstArg w), bytes, ic, (inl, addr, na)) = 2656 if targetArch = Native64Bit 2657 then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, w) :: addr, na) 2658 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineRelativeAddress w) :: inl, addr, na) 2659 2660 | getConstant(JumpAddress(AddressConstArg w), bytes, ic, (inl, addr, na)) = 2661 if targetArch = Native64Bit 2662 then (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, w) :: addr, na) 2663 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineRelativeAddress w) :: inl, addr, na) 2664 2665 | getConstant(LoadLabelAddress _, _, ic, (inl, addr, na)) = 2666 (* We need the address of the code itself but it's in the first of a pair of instructions. 
*) 2667 if hostIsX64 then (inl, addr, na) else ((ic + 0w1, SelfAddress) :: inl, addr, na) 2668 2669 | getConstant(FPLoadFromConst{constant, ...}, bytes, ic, (inl, addr, na)) = 2670 if hostIsX64 2671 then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, realConst constant) :: na) 2672 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress constant) :: inl, addr, na) 2673 2674 | getConstant(FPArithConst{ source, ... }, bytes, ic, (inl, addr, na)) = 2675 if hostIsX64 2676 then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, realConst source) :: na) 2677 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress source) :: inl, addr, na) 2678 2679 | getConstant(XMMArith { source=AddressConstArg constVal, ... }, bytes, ic, (inl, addr, na)) = 2680 (* Real.real constant or, with 32-bit words, a Real32.real constant. *) 2681 if hostIsX64 2682 then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, realConst constVal) :: na) 2683 else ((ic + Word.fromInt(Word8Vector.length bytes) - 0w4, InlineAbsoluteAddress constVal) :: inl, addr, na) 2684 2685 | getConstant(XMMArith { source=NonAddressConstArg constVal, ... }, bytes, ic, (inl, addr, na)) = 2686 (* Real32.real constant in native 64-bit. *) 2687 (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst constVal) :: na) 2688 2689 | getConstant(MultiplyR{ source=NonAddressConstArg source, ... }, bytes, ic, (inl, addr, na)) = 2690 if is32bit source 2691 then (inl, addr, na) 2692 else (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na) 2693 2694 | getConstant(CondMove{ source=NonAddressConstArg source, ... }, bytes, ic, (inl, addr, na)) = 2695 if targetArch <> Native32Bit 2696 then (inl, addr, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, intConst source) :: na) 2697 else (inl, addr, na) (* 32-bit mode. The constant will always be inline. 
*) 2698 2699 | getConstant(CondMove{ source=AddressConstArg source, ... }, bytes, ic, (inl, addr, na)) = 2700 if targetArch <> Native32Bit 2701 then (* Address constants go in the constant area. *) 2702 (inl, (ic + Word.fromInt(Word8Vector.length bytes) - 0w4, source) :: addr, na) 2703 else ((ic + Word.fromInt(Word8Vector.length bytes) - wordSize, InlineAbsoluteAddress source) :: inl, addr, na) 2704 2705 | getConstant(_, _, _, l) = l 2706 in 2707 val getConstants = foldCode getConstant ([], [], []) 2708 end 2709 2710 (* It is convenient to have AllocStore and AllocStoreVariable as primitives at the higher 2711 level but at this point it's better to expand them into their basic instructions. *) 2712 fun expandComplexOperations(instrs, oldLabelCount) = 2713 let 2714 val labelCount = ref oldLabelCount 2715 fun mkLabel() = Label{labelNo= !labelCount} before labelCount := !labelCount + 1 2716 2717 (* On X86/64 the local pointer is in r15. On X86/32 it's in memRegs. *) 2718 val localPointer = 2719 if hostIsX64 then RegisterArg r15 else MemoryArg{base=ebp, offset=memRegLocalMPointer, index=NoIndex} 2720 2721 val nativeWordOpSize = if hostIsX64 then OpSize64 else OpSize32 2722 2723 fun allocStoreCommonCode (resultReg, isVarAlloc, regSaveSet: genReg list) = 2724 let 2725 val compare = 2726 ArithToGenReg{opc=CMP, output=resultReg, 2727 source=MemoryArg{base=ebp, offset=memRegLocalMbottom, index=NoIndex}, opSize=nativeWordOpSize} 2728 (* Normally we won't have run out of store so we want the default 2729 branch prediction to skip the test here. However doing that 2730 involves adding an extra branch which lengthens the code so 2731 it's probably not worth while. *) 2732 (* Just checking against the lower limit can fail 2733 in the situation where the heap pointer is at the low end of 2734 the address range and the store required is so large that the 2735 subtraction results in a negative number. 
In that case it 2736 will be > (unsigned) lower_limit so in addition we have 2737 to check that the result is < (unsigned) heap_pointer. 2738 This actually happened on Windows with X86-64. 2739 In theory this can happen with fixed-size allocations as 2740 well as variable allocations but in practice fixed-size 2741 allocations are going to be small enough that it's not a 2742 problem. *) 2743 val destLabel = mkLabel() 2744 val branches = 2745 if isVarAlloc 2746 then 2747 let 2748 val extraLabel = mkLabel() 2749 in 2750 [ConditionalBranch{test=JB, label=extraLabel}, 2751 ArithToGenReg{opc=CMP, output=resultReg, source=localPointer, opSize=nativeWordOpSize}, 2752 ConditionalBranch{test=JB, label=destLabel}, 2753 JumpLabel extraLabel] 2754 end 2755 else [ConditionalBranch{test=JNB, label=destLabel}] 2756 val callRts = CallRTS{rtsEntry=HeapOverflowCall, saveRegs=regSaveSet} 2757 val fixup = JumpLabel destLabel 2758 (* Update the heap pointer now we have the store. This is also 2759 used by the RTS in the event of a trap to work out how much 2760 store was being allocated. 
*) 2761 val update = 2762 if hostIsX64 then Move{source=RegisterArg resultReg, destination=RegisterArg r15, moveSize=Move64} 2763 else Move{source=RegisterArg resultReg, 2764 destination=MemoryArg{base=ebp, offset=memRegLocalMPointer, index=NoIndex}, moveSize=Move32} 2765 in 2766 compare :: branches @ [callRts, fixup, update] 2767 end 2768 2769 fun doExpansion([], code, _) = code 2770 2771 | doExpansion(AllocStore {size, output, saveRegs} :: instrs, code, inAllocation) = 2772 let 2773 val _ = inAllocation andalso raise InternalError "doExpansion: Allocation started but not complete" 2774 val () = if List.exists (fn r => r = output) saveRegs then raise InternalError "AllocStore: in set" else () 2775 2776 val startCode = 2777 case targetArch of 2778 Native64Bit => 2779 let 2780 val bytes = (size + 1) * Word.toInt wordSize 2781 in 2782 [LoadAddress{output=output, offset = ~ bytes, base=SOME r15, index=NoIndex, opSize=OpSize64}] 2783 (* TODO: What if it's too big to fit? *) 2784 end 2785 | Native32Bit => 2786 let 2787 val bytes = (size + 1) * Word.toInt wordSize 2788 in 2789 [Move{source=MemoryArg{base=ebp, offset=memRegLocalMPointer, index=NoIndex}, 2790 destination=RegisterArg output, moveSize=Move32}, 2791 LoadAddress{output=output, offset = ~ bytes, base=SOME output, index=NoIndex, opSize=OpSize32}] 2792 end 2793 | ObjectId32Bit => 2794 let 2795 (* We must allocate an even number of words. *) 2796 val heapWords = if Int.rem(size, 2) = 1 then size+1 else size+2 2797 val bytes = heapWords * Word.toInt wordSize 2798 in 2799 [LoadAddress{output=output, offset = ~ bytes, base=SOME r15, index=NoIndex, opSize=OpSize64}] 2800 end 2801 2802 val resultCode = startCode @ allocStoreCommonCode(output, false, saveRegs) 2803 in 2804 doExpansion(instrs, (List.rev resultCode) @ code, true) 2805 end 2806 2807 | doExpansion(AllocStoreVariable {size, output, saveRegs} :: instrs, code, inAllocation) = 2808 let 2809 (* Allocates memory. 
The "size" register contains the number of words as a tagged int. *) 2810 val _ = inAllocation andalso raise InternalError "doExpansion: Allocation started but not complete" 2811 val () = if List.exists (fn r => r = output) saveRegs then raise InternalError "AllocStore: in set" else () 2812 (* Negate the length and add it to the current heap pointer. *) 2813 (* Compute the number of bytes into dReg. The length in sReg is the number 2814 of words as a tagged value so we need to multiply it, add wordSize to 2815 include one word for the header then subtract the, multiplied, tag. 2816 We use LEA here but want to avoid having an empty base register. *) 2817 val _ = size = output andalso raise InternalError "AllocStoreVariable : same register for size and output" 2818 val startCode = 2819 if wordSize = 0w8 (* 8-byte words *) 2820 then 2821 [ 2822 ArithToGenReg{opc=XOR, output=output, source=RegisterArg output, opSize=OpSize32 (* Rest is zeroed *)}, 2823 ArithToGenReg{opc=SUB, output=output, source=RegisterArg size, opSize=OpSize64}, 2824 LoadAddress{output=output, base=SOME r15, offset= ~(Word.toInt wordSize-4), index=Index4 output, opSize=OpSize64 } 2825 ] 2826 else (* 4 byte words *) 2827 [ 2828 LoadAddress{output=output, base=SOME size, offset=Word.toInt wordSize-2, 2829 index=Index1 size, opSize=nativeWordOpSize }, 2830 Negative{output=output, opSize=nativeWordOpSize}, 2831 ArithToGenReg{opc=ADD, output=output, source=localPointer, opSize=nativeWordOpSize} 2832 ] 2833 (* If this is 32-in-64 we need to round down to the next 8-byte boundary. 
*) 2834 val roundCode = 2835 if targetArch = ObjectId32Bit 2836 then [ArithToGenReg{opc=AND, output=output, source=NonAddressConstArg ~8, opSize=OpSize64 }] 2837 else [] 2838 val resultCode = startCode @ roundCode @ allocStoreCommonCode(output, true, saveRegs) 2839 in 2840 doExpansion(instrs, (List.rev resultCode) @ code, true) 2841 end 2842 2843 | doExpansion(StoreInitialised :: instrs, code, _) = doExpansion(instrs, code, false) 2844 2845 | doExpansion(instr :: instrs, code, inAlloc) = doExpansion(instrs, instr::code, inAlloc) 2846 2847 val expanded = List.rev(doExpansion(instrs, [], false)) 2848 in 2849 (expanded, !labelCount) 2850 end 2851 2852 2853 fun printCode (Code{procName, printStream, ...}, seg) = 2854 let 2855 val print = printStream 2856 val ptr = ref 0w0; 2857 (* prints a string representation of a number *) 2858 fun printValue v = 2859 if v < 0 then (print "-"; print(LargeInt.toString (~ v))) else print(LargeInt.toString v) 2860 2861 infix 3 +:= ; 2862 fun (x +:= y) = (x := !x + (y:word)); 2863 2864 fun get16s (a, seg) : int = 2865 let 2866 val b0 = Word8.toInt (codeVecGet (seg, a)); 2867 val b1 = Word8.toInt (codeVecGet (seg, a + 0w1)); 2868 val b1' = if b1 >= 0x80 then b1 - 0x100 else b1; 2869 in 2870 (b1' * 0x100) + b0 2871 end 2872 2873 fun get16u(a, seg) : int = 2874 Word8.toInt (codeVecGet (seg, a + 0w1)) * 0x100 + Word8.toInt (codeVecGet (seg, a)) 2875 2876 (* Get 1 unsigned byte from the given offset in the segment. *) 2877 fun get8u (a, seg) : Word8.word = codeVecGet (seg, a); 2878 2879 (* Get 1 signed byte from the given offset in the segment. *) 2880 fun get8s (a, seg) : int = Word8.toIntX (codeVecGet (seg, a)); 2881 2882 (* Get 1 signed 32 bit word from the given offset in the segment. 
*) 2883 fun get32s (a, seg) : LargeInt.int = 2884 let 2885 val b0 = Word8.toLargeInt (codeVecGet (seg, a)); 2886 val b1 = Word8.toLargeInt (codeVecGet (seg, a + 0w1)); 2887 val b2 = Word8.toLargeInt (codeVecGet (seg, a + 0w2)); 2888 val b3 = Word8.toLargeInt (codeVecGet (seg, a + 0w3)); 2889 val b3' = if b3 >= 0x80 then b3 - 0x100 else b3; 2890 val topHw = (b3' * 0x100) + b2; 2891 val bottomHw = (b1 * 0x100) + b0; 2892 in 2893 (topHw * exp2_16) + bottomHw 2894 end 2895 2896 fun get64s (a, seg) : LargeInt.int = 2897 let 2898 val b0 = Word8.toLargeInt (codeVecGet (seg, a)); 2899 val b1 = Word8.toLargeInt (codeVecGet (seg, a + 0w1)); 2900 val b2 = Word8.toLargeInt (codeVecGet (seg, a + 0w2)); 2901 val b3 = Word8.toLargeInt (codeVecGet (seg, a + 0w3)); 2902 val b4 = Word8.toLargeInt (codeVecGet (seg, a + 0w4)); 2903 val b5 = Word8.toLargeInt (codeVecGet (seg, a + 0w5)); 2904 val b6 = Word8.toLargeInt (codeVecGet (seg, a + 0w6)); 2905 val b7 = Word8.toLargeInt (codeVecGet (seg, a + 0w7)); 2906 val b7' = if b7 >= 0x80 then b7 - 0x100 else b7; 2907 in 2908 ((((((((b7' * 0x100 + b6) * 0x100 + b5) * 0x100 + b4) * 0x100 + b3) 2909 * 0x100 + b2) * 0x100) + b1) * 0x100) + b0 2910 end 2911 2912 fun print32 () = printValue (get32s (!ptr, seg)) before (ptr +:= 0w4) 2913 and print64 () = printValue (get64s (!ptr, seg)) before (ptr +:= 0w8) 2914 and print16 () = printValue (LargeInt.fromInt(get16s (!ptr, seg)) before (ptr +:= 0w2)) 2915 and print8 () = printValue (LargeInt.fromInt(get8s (!ptr, seg)) before (ptr +:= 0w1)) 2916 2917 fun printJmp () = 2918 let 2919 val valu = get8s (!ptr, seg) before ptr +:= 0w1 2920 in 2921 print (Word.fmt StringCvt.HEX (Word.fromInt valu + !ptr)) 2922 end 2923 2924 (* Print an effective address. The register field may designate a general register 2925 or an xmm register depending on the instruction. 
*) 2926 fun printEAGeneral printRegister (rex, sz) = 2927 let 2928 val modrm = codeVecGet (seg, !ptr) 2929 val () = ptr +:= 0w1 2930 (* Decode the Rex prefix if present. *) 2931 val rexX = (rex andb8 0wx2) <> 0w0 2932 val rexB = (rex andb8 0wx1) <> 0w0 2933 val prefix = 2934 case sz of 2935 SZByte => "byte ptr " 2936 | SZWord => "word ptr " 2937 | SZDWord => "dword ptr " 2938 | SZQWord => "qword ptr " 2939 in 2940 case (modrm >>- 0w6, modrm andb8 0w7, hostIsX64) of 2941 (0w3, rm, _) => printRegister(rm, rexB, sz) 2942 2943 | (md, 0w4, _) => 2944 let (* s-i-b present. *) 2945 val sib = codeVecGet (seg, !ptr) 2946 val () = ptr +:= 0w1 2947 val ss = sib >>- 0w6 2948 val index = (sib >>- 0w3) andb8 0w7 2949 val base = sib andb8 0w7 2950 in 2951 print prefix; 2952 2953 case (md, base, hostIsX64) of 2954 (0w1, _, _) => print8 () 2955 | (0w2, _, _) => print32 () 2956 | (0w0, 0w5, _) => print32 () (* Absolute in 32-bit mode. PC-relative in 64-bit ?? *) 2957 | _ => (); 2958 2959 print "["; 2960 2961 if md <> 0w0 orelse base <> 0w5 2962 then 2963 ( 2964 print (genRegRepr (mkReg (base, rexB), sz32_64)); 2965 if index = 0w4 then () else print "," 2966 ) 2967 else (); 2968 2969 if index = 0w4 andalso not rexX (* No index. *) 2970 then () 2971 else print (genRegRepr (mkReg(index, rexX), sz32_64) ^ 2972 (if ss = 0w0 then "*1" 2973 else if ss = 0w1 then "*2" 2974 else if ss = 0w2 then "*4" 2975 else "*8")); 2976 2977 print "]" 2978 end 2979 2980 | (0w0, 0w5, false) => (* Absolute address.*) (print prefix; print32 ()) 2981 2982 | (0w0, 0w5, _) => (* PC-relative in 64-bit *) 2983 (print prefix; print ".+"; print32 ()) 2984 2985 | (md, rm, _) => (* register plus offset. *) 2986 ( 2987 print prefix; 2988 if md = 0w1 then print8 () 2989 else if md = 0w2 then print32 () 2990 else (); 2991 2992 print ("[" ^ genRegRepr (mkReg(rm, rexB), sz32_64) ^ "]") 2993 ) 2994 end 2995 2996 (* For most instructions we want to print a general register. 
*) 2997 val printEA = 2998 printEAGeneral (fn (rm, rexB, sz) => print (genRegRepr (mkReg(rm, rexB), sz))) 2999 and printEAxmm = 3000 printEAGeneral (fn (rm, _, _) => print (xmmRegRepr(SSE2Reg rm))) 3001 3002 fun printArith opc = 3003 print 3004 (case opc of 3005 0 => "add " 3006 | 1 => "or " 3007 | 2 => "adc " 3008 | 3 => "sbb " 3009 | 4 => "and " 3010 | 5 => "sub " 3011 | 6 => "xor " 3012 | _ => "cmp " 3013 ) 3014 3015 fun printGvEv (opByte, rex, rexR, sz) = 3016 let 3017 (* Register is in next byte. *) 3018 val nb = codeVecGet (seg, !ptr) 3019 val reg = (nb >>- 0w3) andb8 0w7 3020 in 3021 printArith(Word8.toInt((opByte div 0w8) mod 0w8)); 3022 print "\t"; 3023 print (genRegRepr (mkReg(reg, rexR), sz)); 3024 print ","; 3025 printEA(rex, sz) 3026 end 3027 3028 fun printMovCToR (opByte, sz, rexB) = 3029 ( 3030 print "mov \t"; 3031 print(genRegRepr (mkReg (opByte mod 0w8, rexB), sz)); 3032 print ","; 3033 case sz of SZDWord => print32 () | SZQWord => print64 () | _ => print "???" 3034 ) 3035 3036 fun printShift (opByte, rex, sz) = 3037 let 3038 (* Opcode is determined by next byte. *) 3039 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3040 val opc = (nb div 8) mod 8 3041 in 3042 print 3043 (case opc of 3044 4 => "shl " 3045 | 5 => "shr " 3046 | 7 => "sar " 3047 | _ => "???" 3048 ); 3049 print "\t"; 3050 printEA(rex, sz); 3051 print ","; 3052 if opByte = opToInt Group2_1_A32 then print "1" 3053 else if opByte = opToInt Group2_CL_A32 then print "cl" 3054 else print8 () 3055 end 3056 3057 fun printFloat (opByte, rex) = 3058 let 3059 (* Opcode is in next byte. 
*) 3060 val opByte2 = codeVecGet (seg, !ptr) 3061 val nnn = (opByte2 >>- 0w3) andb8 0w7 3062 val escNo = opByte andb8 0wx7 3063 in 3064 if (opByte2 andb8 0wxC0) = 0wxC0 3065 then (* mod = 11 *) 3066 ( 3067 case (escNo, nnn, opByte2 andb8 0wx7 (* modrm *)) of 3068 (0w1, 0w4, 0w0) => print "fchs" 3069 | (0w1, 0w4, 0w1) => print "fabs" 3070 | (0w1, 0w5, 0w6) => print "fldz" 3071 | (0w1, 0w5, 0w1) => print "flf1" 3072 | (0w7, 0w4, 0w0) => print "fnstsw\tax" 3073 | (0w1, 0w5, 0w0) => print "fld1" 3074 | (0w1, 0w6, 0w3) => print "fpatan" 3075 | (0w1, 0w7, 0w2) => print "fsqrt" 3076 | (0w1, 0w7, 0w6) => print "fsin" 3077 | (0w1, 0w7, 0w7) => print "fcos" 3078 | (0w1, 0w6, 0w7) => print "fincstp" 3079 | (0w1, 0w6, 0w6) => print "fdecstp" 3080 | (0w3, 0w4, 0w2) => print "fnclex" 3081 | (0w5, 0w2, rno) => print ("fst \tst(" ^ Word8.toString rno ^ ")") 3082 | (0w5, 0w3, rno) => print ("fstp\tst(" ^ Word8.toString rno ^ ")") 3083 | (0w1, 0w0, rno) => print ("fld \tst(" ^ Word8.toString rno ^ ")") 3084 | (0w1, 0w1, rno) => print ("fxch\tst(" ^ Word8.toString rno ^ ")") 3085 | (0w0, 0w3, rno) => print ("fcomp\tst(" ^ Word8.toString rno ^ ")") 3086 | (0w0, 0w0, rno) => print ("fadd\tst,st(" ^ Word8.toString rno ^ ")") 3087 | (0w0, 0w1, rno) => print ("fmul\tst,st(" ^ Word8.toString rno ^ ")") 3088 | (0w0, 0w4, rno) => print ("fsub\tst,st(" ^ Word8.toString rno ^ ")") 3089 | (0w0, 0w5, rno) => print ("fsubr\tst,st(" ^ Word8.toString rno ^ ")") 3090 | (0w0, 0w6, rno) => print ("fdiv\tst,st(" ^ Word8.toString rno ^ ")") 3091 | (0w0, 0w7, rno) => print ("fdivr\tst,st(" ^ Word8.toString rno ^ ")") 3092 | (0w5, 0w0, rno) => print ("ffree\tst(" ^ Word8.toString rno ^ ")") 3093 | _ => (printValue(Word8.toLargeInt opByte); printValue(Word8.toLargeInt opByte2)); 3094 ptr +:= 0w1 3095 ) 3096 else (* mod = 00, 01, 10 *) 3097 ( 3098 case (escNo, nnn) of 3099 (0w0, 0w0) => (print "fadd\t"; printEA(rex, SZDWord)) (* Single precision. 
*) 3100 | (0w0, 0w1) => (print "fmul\t"; printEA(rex, SZDWord)) 3101 | (0w0, 0w3) => (print "fcomp\t"; printEA(rex, SZDWord)) 3102 | (0w0, 0w4) => (print "fsub\t"; printEA(rex, SZDWord)) 3103 | (0w0, 0w5) => (print "fsubr\t"; printEA(rex, SZDWord)) 3104 | (0w0, 0w6) => (print "fdiv\t"; printEA(rex, SZDWord)) 3105 | (0w0, 0w7) => (print "fdivr\t"; printEA(rex, SZDWord)) 3106 | (0w1, 0w0) => (print "fld \t"; printEA(rex, SZDWord)) 3107 | (0w1, 0w2) => (print "fst\t"; printEA(rex, SZDWord)) 3108 | (0w1, 0w3) => (print "fstp\t"; printEA(rex, SZDWord)) 3109 | (0w1, 0w5) => (print "fldcw\t"; printEA(rex, SZWord)) (* Control word is 16 bits *) 3110 | (0w1, 0w7) => (print "fstcw\t"; printEA(rex, SZWord)) (* Control word is 16 bits *) 3111 | (0w3, 0w0) => (print "fild\t"; printEA(rex, SZDWord)) (* 32-bit int. *) 3112 | (0w7, 0w5) => (print "fild\t"; printEA(rex, SZQWord)) (* 64-bit int. *) 3113 | (0w3, 0w3) => (print "fistp\t"; printEA(rex, SZDWord)) (* 32-bit int. *) 3114 | (0w7, 0w7) => (print "fistp\t"; printEA(rex, SZQWord)) (* 64-bit int. *) 3115 | (0w4, 0w0) => (print "fadd\t"; printEA(rex, SZQWord)) (* Double precision. 
*) 3116 | (0w4, 0w1) => (print "fmul\t"; printEA(rex, SZQWord)) 3117 | (0w4, 0w3) => (print "fcomp\t"; printEA(rex, SZQWord)) 3118 | (0w4, 0w4) => (print "fsub\t"; printEA(rex, SZQWord)) 3119 | (0w4, 0w5) => (print "fsubr\t"; printEA(rex, SZQWord)) 3120 | (0w4, 0w6) => (print "fdiv\t"; printEA(rex, SZQWord)) 3121 | (0w4, 0w7) => (print "fdivr\t"; printEA(rex, SZQWord)) 3122 | (0w5, 0w0) => (print "fld \t"; printEA(rex, SZQWord)) 3123 | (0w5, 0w2) => (print "fst\t"; printEA(rex, SZQWord)) 3124 | (0w5, 0w3) => (print "fstp\t"; printEA(rex, SZQWord)) 3125 | _ => (printValue(Word8.toLargeInt opByte); printValue(Word8.toLargeInt opByte2)) 3126 ) 3127 end 3128 3129 fun printJmp32 oper = 3130 let 3131 val valu = get32s (!ptr, seg) before (ptr +:= 0w4) 3132 in 3133 print oper; print "\t"; 3134 print (Word.fmt StringCvt.HEX (!ptr + Word.fromLargeInt valu)) 3135 end 3136 3137 fun printMask mask = 3138 let 3139 val wordMask = Word.fromInt mask 3140 fun printAReg n = 3141 if n = regs then () 3142 else 3143 ( 3144 if (wordMask andb (0w1 << Word.fromInt n)) <> 0w0 3145 then (print(regRepr(regN n)); print " ") 3146 else (); 3147 printAReg(n+1) 3148 ) 3149 in 3150 printAReg 0 3151 end 3152 3153 in 3154 3155 if procName = "" (* No name *) then print "?" else print procName; 3156 print ":\n"; 3157 3158 while get8u (!ptr, seg) <> 0wxf4 (* HLT. *) do 3159 let 3160 val () = print (Word.fmt StringCvt.HEX (!ptr)) (* The address in hex. *) 3161 val () = print "\t" 3162 3163 (* See if we have a lock prefix. *) 3164 val () = 3165 if get8u (!ptr, seg) = 0wxF0 3166 then (print "lock "; ptr := !ptr + 0w1) 3167 else () 3168 3169 val legacyPrefix = 3170 let 3171 val p = get8u (!ptr, seg) 3172 in 3173 if p = 0wxF2 orelse p = 0wxF3 orelse p = 0wx66 3174 then (ptr := !ptr + 0w1; p) 3175 else 0wx0 3176 end 3177 3178 (* See if we have a REX byte. 
*) 3179 val rex = 3180 let 3181 val b = get8u (!ptr, seg); 3182 in 3183 if b >= 0wx40 andalso b <= 0wx4f 3184 then (ptr := !ptr + 0w1; b) 3185 else 0w0 3186 end 3187 3188 val rexW = (rex andb8 0wx8) <> 0w0 3189 val rexR = (rex andb8 0wx4) <> 0w0 3190 val rexB = (rex andb8 0wx1) <> 0w0 3191 3192 val opByte = get8u (!ptr, seg) before ptr +:= 0w1 3193 3194 val sizeFromRexW = if rexW then SZQWord else SZDWord 3195 in 3196 case opByte of 3197 0wx03 => printGvEv (opByte, rex, rexR, sizeFromRexW) 3198 3199 | 0wx0b => printGvEv (opByte, rex, rexR, sizeFromRexW) 3200 3201 | 0wx0f => (* ESCAPE *) 3202 let 3203 (* Opcode is in next byte. *) 3204 val opByte2 = codeVecGet (seg, !ptr) 3205 val () = (ptr +:= 0w1) 3206 3207 fun printcmov movop = 3208 let 3209 val nb = codeVecGet (seg, !ptr) 3210 val reg = (nb >>- 0w3) andb8 0w7 3211 in 3212 print movop; 3213 print "\t"; 3214 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3215 print ","; 3216 printEA(rex, sizeFromRexW) 3217 end 3218 in 3219 case legacyPrefix of 3220 0w0 => 3221 ( 3222 case opByte2 of 3223 0wx2e => 3224 let (* ucomiss doesn't have a prefix. 
*) 3225 val nb = codeVecGet (seg, !ptr) 3226 val reg = SSE2Reg((nb >>- 0w3) andb8 0w7) 3227 in 3228 print "ucomiss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) 3229 end 3230 3231 | 0wx40 => printcmov "cmovo" 3232 | 0wx41 => printcmov "cmovno" 3233 | 0wx42 => printcmov "cmovb" 3234 | 0wx43 => printcmov "cmovnb" 3235 | 0wx44 => printcmov "cmove" 3236 | 0wx45 => printcmov "cmovne" 3237 | 0wx46 => printcmov "cmovna" 3238 | 0wx47 => printcmov "cmova" 3239 | 0wx48 => printcmov "cmovs" 3240 | 0wx49 => printcmov "cmovns" 3241 | 0wx4a => printcmov "cmovp" 3242 | 0wx4b => printcmov "cmovnp" 3243 | 0wx4c => printcmov "cmovl" 3244 | 0wx4d => printcmov "cmovge" 3245 | 0wx4e => printcmov "cmovle" 3246 | 0wx4f => printcmov "cmovg" 3247 3248 | 0wxC1 => 3249 let 3250 val nb = codeVecGet (seg, !ptr); 3251 val reg = (nb >>- 0w3) andb8 0w7 3252 in 3253 (* The address argument comes first in the assembly code. *) 3254 print "xadd\t"; 3255 printEA (rex, sizeFromRexW); 3256 print ","; 3257 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)) 3258 end 3259 3260 | 0wxB6 => 3261 let 3262 val nb = codeVecGet (seg, !ptr); 3263 val reg = (nb >>- 0w3) andb8 0w7 3264 in 3265 print "movzx\t"; 3266 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3267 print ","; 3268 printEA (rex, SZByte) 3269 end 3270 3271 | 0wxB7 => 3272 let 3273 val nb = codeVecGet (seg, !ptr); 3274 val reg = (nb >>- 0w3) andb8 0w7 3275 in 3276 print "movzx\t"; 3277 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3278 print ","; 3279 printEA (rex, SZWord) 3280 end 3281 3282 | 0wxBE => 3283 let 3284 val nb = codeVecGet (seg, !ptr); 3285 val reg = (nb >>- 0w3) andb8 0w7 3286 in 3287 print "movsx\t"; 3288 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3289 print ","; 3290 printEA (rex, SZByte) 3291 end 3292 3293 | 0wxBF => 3294 let 3295 val nb = codeVecGet (seg, !ptr); 3296 val reg = (nb >>- 0w3) andb8 0w7 3297 in 3298 print "movsx\t"; 3299 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3300 
print ","; 3301 printEA (rex, SZWord) 3302 end 3303 3304 | 0wxAE => 3305 let 3306 (* Opcode is determined by the next byte. *) 3307 val opByte2 = codeVecGet (seg, !ptr); 3308 val nnn = (opByte2 >>- 0w3) andb8 0w7 3309 in 3310 case nnn of 3311 0wx2 => (print "ldmxcsr\t"; printEA(rex, SZDWord)) 3312 | 0wx3 => (print "stmxcsr\t"; printEA(rex, SZDWord)) 3313 | _ => (printValue(Word8.toLargeInt opByte); printValue(Word8.toLargeInt opByte2)) 3314 end 3315 3316 | 0wxAF => 3317 let 3318 val nb = codeVecGet (seg, !ptr); 3319 val reg = (nb >>- 0w3) andb8 0w7 3320 in 3321 print "imul\t"; 3322 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3323 print ","; 3324 printEA (rex, sizeFromRexW) 3325 end 3326 3327 | 0wx80 => printJmp32 "jo " 3328 | 0wx81 => printJmp32 "jno " 3329 | 0wx82 => printJmp32 "jb " 3330 | 0wx83 => printJmp32 "jnb " 3331 | 0wx84 => printJmp32 "je " 3332 | 0wx85 => printJmp32 "jne " 3333 | 0wx86 => printJmp32 "jna " 3334 | 0wx87 => printJmp32 "ja " 3335 | 0wx88 => printJmp32 "js " 3336 | 0wx89 => printJmp32 "jns " 3337 | 0wx8a => printJmp32 "jp " 3338 | 0wx8b => printJmp32 "jnp " 3339 | 0wx8c => printJmp32 "jl " 3340 | 0wx8d => printJmp32 "jge " 3341 | 0wx8e => printJmp32 "jle " 3342 | 0wx8f => printJmp32 "jg " 3343 3344 | 0wx90 => (print "seto\t"; printEA (rex, SZByte)) 3345 | 0wx91 => (print "setno\t"; printEA (rex, SZByte)) 3346 | 0wx92 => (print "setb\t"; printEA (rex, SZByte)) 3347 | 0wx93 => (print "setnb\t"; printEA (rex, SZByte)) 3348 | 0wx94 => (print "sete\t"; printEA (rex, SZByte)) 3349 | 0wx95 => (print "setne\t"; printEA (rex, SZByte)) 3350 | 0wx96 => (print "setna\t"; printEA (rex, SZByte)) 3351 | 0wx97 => (print "seta\t"; printEA (rex, SZByte)) 3352 | 0wx98 => (print "sets\t"; printEA (rex, SZByte)) 3353 | 0wx99 => (print "setns\t"; printEA (rex, SZByte)) 3354 | 0wx9a => (print "setp\t"; printEA (rex, SZByte)) 3355 | 0wx9b => (print "setnp\t"; printEA (rex, SZByte)) 3356 | 0wx9c => (print "setl\t"; printEA (rex, SZByte)) 3357 | 0wx9d => 
(print "setge\t"; printEA (rex, SZByte)) 3358 | 0wx9e => (print "setle\t"; printEA (rex, SZByte)) 3359 | 0wx9f => (print "setg\t"; printEA (rex, SZByte)) 3360 3361 | _ => (print "esc\t"; printValue(Word8.toLargeInt opByte2)) 3362 ) 3363 3364 | 0wxf2 => (* SSE2 instruction *) 3365 let 3366 val nb = codeVecGet (seg, !ptr) 3367 val rr = (nb >>- 0w3) andb8 0w7 3368 val reg = SSE2Reg rr 3369 in 3370 case opByte2 of 3371 0wx10 => ( print "movsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3372 | 0wx11 => ( print "movsd\t"; printEAxmm(rex, SZQWord); print ","; print(xmmRegRepr reg) ) 3373 | 0wx2a => ( print "cvtsi2sd\t"; print(xmmRegRepr reg); print ","; printEA(rex, sizeFromRexW) ) 3374 | 0wx2c => 3375 ( print "cvttsd2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW) ) 3376 | 0wx2d => 3377 ( print "cvtsd2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW) ) 3378 | 0wx58 => ( print "addsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3379 | 0wx59 => ( print "mulsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3380 | 0wx5a => ( print "cvtsd2ss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3381 | 0wx5c => ( print "subsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3382 | 0wx5e => ( print "divsd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3383 | b => (print "F2\n"; print "0F\n"; print(Word8.fmt StringCvt.HEX b)) 3384 end 3385 3386 | 0wxf3 => (* SSE2 instruction. 
*) 3387 let 3388 val nb = codeVecGet (seg, !ptr) 3389 val rr = (nb >>- 0w3) andb8 0w7 3390 val reg = SSE2Reg rr 3391 in 3392 case opByte2 of 3393 0wx10 => ( print "movss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) ) 3394 | 0wx11 => ( print "movss\t"; printEAxmm(rex, SZDWord); print ","; print(xmmRegRepr reg) ) 3395 | 0wx2c => 3396 ( print "cvttss2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW) ) 3397 | 0wx2d => 3398 ( print "cvtss2si\t"; print (genRegRepr (mkReg(rr, rexR), sizeFromRexW)); print ","; printEAxmm(rex, sizeFromRexW) ) 3399 | 0wx5a => ( print "cvtss2sd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) ) 3400 | 0wx58 => ( print "addss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) ) 3401 | 0wx59 => ( print "mulss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) ) 3402 | 0wx5c => ( print "subss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) ) 3403 | 0wx5e => ( print "divss\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZDWord) ) 3404 | b => (print "F3\n"; print "0F\n"; print(Word8.fmt StringCvt.HEX b)) 3405 end 3406 3407 | 0wx66 => (* SSE2 instruction *) 3408 let 3409 val nb = codeVecGet (seg, !ptr) 3410 val reg = SSE2Reg((nb >>- 0w3) andb8 0w7) 3411 in 3412 case opByte2 of 3413 0wx2e => ( print "ucomisd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3414 | 0wx54 => ( print "andpd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3415 | 0wx57 => ( print "xorpd\t"; print(xmmRegRepr reg); print ","; printEAxmm(rex, SZQWord) ) 3416 | 0wx6e => ( print (if rexW then "movq\t" else "movd\t"); print(xmmRegRepr reg); print ","; printEA(rex, sizeFromRexW) ) 3417 | 0wx7e => ( print (if rexW then "movq\t" else "movd\t"); printEA(rex, sizeFromRexW); print ","; print(xmmRegRepr reg) ) 3418 | 0wx73 => ( print "psrldq\t"; printEAxmm(rex, SZQWord); print ","; print8 ()) 3419 | b => (print "66\n"; print 
"0F\n"; print(Word8.fmt StringCvt.HEX b)) 3420 end 3421 3422 | _ => (print "esc\t"; printValue(Word8.toLargeInt opByte2)) 3423 end (* ESCAPE *) 3424 3425 | 0wx13 => printGvEv (opByte, rex, rexR, sizeFromRexW) 3426 3427 | 0wx1b => printGvEv (opByte, rex, rexR, sizeFromRexW) 3428 3429 | 0wx23 => printGvEv (opByte, rex, rexR, sizeFromRexW) 3430 3431 | 0wx2b => printGvEv (opByte, rex, rexR, sizeFromRexW) 3432 3433 | 0wx33 => printGvEv (opByte, rex, rexR, sizeFromRexW) 3434 3435 | 0wx3b => printGvEv (opByte, rex, rexR, sizeFromRexW) 3436 3437 (* Push and Pop. These are 64-bit on X86/64 whether there is REX prefix or not. *) 3438 | 0wx50 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3439 | 0wx51 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3440 | 0wx52 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3441 | 0wx53 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3442 | 0wx54 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3443 | 0wx55 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3444 | 0wx56 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3445 | 0wx57 => print ("push\t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3446 3447 | 0wx58 => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3448 | 0wx59 => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3449 | 0wx5a => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3450 | 0wx5b => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3451 | 0wx5c => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3452 | 0wx5d => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3453 | 0wx5e => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), sz32_64)) 3454 | 0wx5f => print ("pop \t" ^ genRegRepr (mkReg (opByte mod 0w8, rexB), 
sz32_64)) 3455 3456 | 0wx63 => (* MOVSXD. This is ARPL in 32-bit mode but that's never used here. *) 3457 let 3458 val nb = codeVecGet (seg, !ptr) 3459 val reg = (nb >>- 0w3) andb8 0w7 3460 in 3461 print "movsxd\t"; 3462 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3463 print ","; 3464 printEA(rex, SZDWord) 3465 end 3466 3467 | 0wx68 => (print "push\t"; print32 ()) 3468 | 0wx69 => 3469 let 3470 (* Register is in next byte. *) 3471 val nb = codeVecGet (seg, !ptr) 3472 val reg = (nb >>- 0w3) andb8 0w7 3473 in 3474 print "imul\t"; print(genRegRepr (mkReg(reg, rexR), sizeFromRexW)); print ","; 3475 printEA(rex, sizeFromRexW); print ","; print32 () 3476 end 3477 | 0wx6a => (print "push\t"; print8 ()) 3478 | 0wx6b => 3479 let 3480 (* Register is in next byte. *) 3481 val nb = codeVecGet (seg, !ptr) 3482 val reg = (nb >>- 0w3) andb8 0w7 3483 in 3484 print "imul\t"; print(genRegRepr (mkReg(reg, rexR), sizeFromRexW)); print ","; 3485 printEA(rex, sizeFromRexW); print ","; print8 () 3486 end 3487 3488 | 0wx70 => (print "jo \t"; printJmp()) 3489 | 0wx71 => (print "jno \t"; printJmp()) 3490 | 0wx72 => (print "jb \t"; printJmp()) 3491 | 0wx73 => (print "jnb \t"; printJmp()) 3492 | 0wx74 => (print "je \t"; printJmp()) 3493 | 0wx75 => (print "jne \t"; printJmp()) 3494 | 0wx76 => (print "jna \t"; printJmp()) 3495 | 0wx77 => (print "ja \t"; printJmp()) 3496 | 0wx78 => (print "js \t"; printJmp()) 3497 | 0wx79 => (print "jns \t"; printJmp()) 3498 | 0wx7a => (print "jp \t"; printJmp()) 3499 | 0wx7b => (print "jnp \t"; printJmp()) 3500 | 0wx7c => (print "jl \t"; printJmp()) 3501 | 0wx7d => (print "jge \t"; printJmp()) 3502 | 0wx7e => (print "jle \t"; printJmp()) 3503 | 0wx7f => (print "jg \t"; printJmp()) 3504 3505 | 0wx80 => (* Group1_8_a *) 3506 let (* Memory, byte constant *) 3507 (* Opcode is determined by next byte. 
*) 3508 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3509 in 3510 printArith ((nb div 8) mod 8); 3511 print "\t"; 3512 printEA(rex, SZByte); 3513 print ","; 3514 print8 () 3515 end 3516 3517 | 0wx81 => 3518 let (* Memory, 32-bit constant *) 3519 (* Opcode is determined by next byte. *) 3520 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3521 in 3522 printArith ((nb div 8) mod 8); 3523 print "\t"; 3524 printEA(rex, sizeFromRexW); 3525 print ","; 3526 print32 () 3527 end 3528 3529 | 0wx83 => 3530 let (* Word memory, 8-bit constant *) 3531 (* Opcode is determined by next byte. *) 3532 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3533 in 3534 printArith ((nb div 8) mod 8); 3535 print "\t"; 3536 printEA(rex, sizeFromRexW); 3537 print ","; 3538 print8 () 3539 end 3540 3541 | 0wx87 => 3542 let (* xchng *) 3543 (* Register is in next byte. *) 3544 val nb = codeVecGet (seg, !ptr) 3545 val reg = (nb >>- 0w3) andb8 0w7 3546 in 3547 print "xchng \t"; 3548 printEA(rex, sizeFromRexW); 3549 print ","; 3550 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)) 3551 end 3552 3553 | 0wx88 => 3554 let (* mov eb,gb i.e a store *) 3555 (* Register is in next byte. *) 3556 val nb = Word8.toInt (codeVecGet (seg, !ptr)); 3557 val reg = (nb div 8) mod 8; 3558 in 3559 print "mov \t"; 3560 printEA(rex, SZByte); 3561 print ","; 3562 if rexR 3563 then print ("r" ^ Int.toString(reg+8) ^ "B") 3564 else case reg of 3565 0 => print "al" 3566 | 1 => print "cl" 3567 | 2 => print "dl" 3568 | 3 => print "bl" 3569 (* If there is a REX byte these select the low byte of the registers. *) 3570 | 4 => print (if rex = 0w0 then "ah" else "sil") 3571 | 5 => print (if rex = 0w0 then "ch" else "dil") 3572 | 6 => print (if rex = 0w0 then "dh" else "bpl") 3573 | 7 => print (if rex = 0w0 then "bh" else "spl") 3574 | _ => print ("r" ^ Int.toString reg) 3575 end 3576 3577 | 0wx89 => 3578 let (* mov ev,gv i.e. a store *) 3579 (* Register is in next byte. 
*) 3580 val nb = codeVecGet (seg, !ptr) 3581 val reg = (nb >>- 0w3) andb8 0w7 3582 in 3583 print "mov \t"; 3584 (* This may have an opcode prefix. *) 3585 printEA(rex, if legacyPrefix = 0wx66 then SZWord else sizeFromRexW); 3586 print ","; 3587 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)) 3588 end 3589 3590 | 0wx8b => 3591 let (* mov gv,ev i.e. a load *) 3592 (* Register is in next byte. *) 3593 val nb = codeVecGet (seg, !ptr) 3594 val reg = (nb >>- 0w3) andb8 0w7 3595 in 3596 print "mov \t"; 3597 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3598 print ","; 3599 printEA(rex, sizeFromRexW) 3600 end 3601 3602 | 0wx8d => 3603 let (* lea gv.M *) 3604 (* Register is in next byte. *) 3605 val nb = codeVecGet (seg, !ptr) 3606 val reg = (nb >>- 0w3) andb8 0w7 3607 in 3608 print "lea \t"; 3609 print (genRegRepr (mkReg(reg, rexR), sizeFromRexW)); 3610 print ","; 3611 printEA(rex, sizeFromRexW) 3612 end 3613 3614 | 0wx8f => (print "pop \t"; printEA(rex, sz32_64)) 3615 | 0wx90 => print "nop" 3616 3617 | 0wx99 => if rexW then print "cqo" else print "cdq" 3618 3619 | 0wx9e => print "sahf\n" 3620 3621 | 0wxa4 => (if legacyPrefix = 0wxf3 then print "rep " else (); print "movsb") 3622 | 0wxa5 => (if legacyPrefix = 0wxf3 then print "rep " else (); print "movsl") 3623 | 0wxa6 => (if legacyPrefix = 0wxf3 then print "repe " else (); print "cmpsb") 3624 3625 | 0wxa8 => (print "test\tal,"; print8 ()) 3626 3627 | 0wxaa => (if legacyPrefix = 0wxf3 then print "rep " else (); print "stosb") 3628 | 0wxab => 3629 ( 3630 if legacyPrefix = 0wxf3 then print "rep " else (); 3631 if rexW then print "stosq" else print "stosl" 3632 ) 3633 3634 | 0wxb8 => printMovCToR (opByte, sizeFromRexW, rexB) 3635 | 0wxb9 => printMovCToR (opByte, sizeFromRexW, rexB) 3636 | 0wxba => printMovCToR (opByte, sizeFromRexW, rexB) 3637 | 0wxbb => printMovCToR (opByte, sizeFromRexW, rexB) 3638 | 0wxbc => printMovCToR (opByte, sizeFromRexW, rexB) 3639 | 0wxbd => printMovCToR (opByte, sizeFromRexW, rexB) 3640 
| 0wxbe => printMovCToR (opByte, sizeFromRexW, rexB) 3641 | 0wxbf => printMovCToR (opByte, sizeFromRexW, rexB) 3642 3643 | 0wxc1 => (* Group2_8_A *) printShift (opByte, rex, sizeFromRexW) 3644 3645 | 0wxc2 => (print "ret \t"; print16 ()) 3646 | 0wxc3 => print "ret" 3647 3648 | 0wxc6 => (* move 8-bit constant to memory *) 3649 ( 3650 print "mov \t"; 3651 printEA(rex, SZByte); 3652 print ","; 3653 print8 () 3654 ) 3655 3656 | 0wxc7 => (* move 32/64-bit constant to memory *) 3657 ( 3658 print "mov \t"; 3659 printEA(rex, sizeFromRexW); 3660 print ","; 3661 print32 () 3662 ) 3663 3664 | 0wxca => (* Register mask *) 3665 let 3666 val mask = get16u (!ptr, seg) before (ptr +:= 0w2) 3667 in 3668 print "SAVE\t"; 3669 printMask mask 3670 end 3671 3672 | 0wxcd => (* Register mask *) 3673 let 3674 val mask = get8u (!ptr, seg) before (ptr +:= 0w1) 3675 in 3676 print "SAVE\t"; 3677 printMask(Word8.toInt mask) 3678 end 3679 3680 | 0wxd1 => (* Group2_1_A *) printShift (opByte, rex, sizeFromRexW) 3681 3682 | 0wxd3 => (* Group2_CL_A *) printShift (opByte, rex, sizeFromRexW) 3683 3684 | 0wxd8 => printFloat (opByte, rex) (* Floating point escapes *) 3685 | 0wxd9 => printFloat (opByte, rex) 3686 | 0wxda => printFloat (opByte, rex) 3687 | 0wxdb => printFloat (opByte, rex) 3688 | 0wxdc => printFloat (opByte, rex) 3689 | 0wxdd => printFloat (opByte, rex) 3690 | 0wxde => printFloat (opByte, rex) 3691 | 0wxdf => printFloat (opByte, rex) 3692 3693 | 0wxe8 => 3694 let (* 32-bit relative call. *) 3695 val valu = get32s (!ptr, seg) before (ptr +:= 0w4) 3696 in 3697 print "call\t"; 3698 print (Word.fmt StringCvt.HEX (!ptr + Word.fromLargeInt valu)) 3699 end 3700 3701 | 0wxe9 => 3702 let (* 32-bit relative jump. *) 3703 val valu = get32s (!ptr, seg) before (ptr +:= 0w4) 3704 in 3705 print "jmp \t"; 3706 print (Word.fmt StringCvt.HEX (!ptr + Word.fromLargeInt valu)) 3707 end 3708 3709 | 0wxeb => (print "jmp \t"; printJmp()) 3710 3711 | 0wxf4 => print "hlt" (* Marker to indicate end-of-code. 
*) 3712 3713 | 0wxf6 => (* Group3_a *) 3714 let 3715 (* Opcode is determined by next byte. *) 3716 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3717 val opc = (nb div 8) mod 8 3718 in 3719 print 3720 (case opc of 3721 0 => "test" 3722 | 3 => "neg" 3723 | _ => "???" 3724 ); 3725 print "\t"; 3726 printEA(rex, SZByte); 3727 if opc = 0 then (print ","; print8 ()) else () 3728 end 3729 3730 | 0wxf7 => (* Group3_A *) 3731 let 3732 (* Opcode is determined by next byte. *) 3733 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3734 val opc = (nb div 8) mod 8 3735 in 3736 print 3737 (case opc of 3738 0 => "test" 3739 | 3 => "neg " 3740 | 4 => "mul " 3741 | 5 => "imul" 3742 | 6 => "div " 3743 | 7 => "idiv" 3744 | _ => "???" 3745 ); 3746 print "\t"; 3747 printEA(rex, sizeFromRexW); 3748 (* Test has an immediate operand. It's 32-bits even in 64-bit mode. *) 3749 if opc = 0 then (print ","; print32 ()) else () 3750 end 3751 3752 | 0wxff => (* Group5 *) 3753 let 3754 (* Opcode is determined by next byte. *) 3755 val nb = Word8.toInt (codeVecGet (seg, !ptr)) 3756 val opc = (nb div 8) mod 8 3757 in 3758 print 3759 (case opc of 3760 2 => "call" 3761 | 4 => "jmp " 3762 | 6 => "push" 3763 | _ => "???" 3764 ); 3765 print "\t"; 3766 printEA(rex, sz32_64) (* None of the cases we use need a prefix. *) 3767 end 3768 3769 | _ => print(Word8.fmt StringCvt.HEX opByte); 3770 3771 print "\n" 3772 end; (* end of while loop *) 3773 3774 print "\n" 3775 3776 end (* printCode *); 3777 3778 (* Although this is used locally it must be defined at the top level 3779 otherwise a new RTS function will be compiler every time the 3780 containing function is called *) 3781 val sortFunction: (machineWord * word) array -> bool = 3782 RunCall.rtsCallFast1 "PolySortArrayOfAddresses" 3783 3784 (* This actually does the final code-generation. 
*)
    (* Generate the final machine code for one function.
       Expands the abstract operations into X86 byte sequences, fixes up the
       labels, lays out the constants, copies everything into a code cell and
       finally locks it, filling in [resultClosure] with the completed code. *)
    fun generateCode
        {ops=operations,
         code=cvec as Code{procName, printAssemblyCode, printStream, profileObject, ...},
         labelCount, resultClosure} : unit =
    let
        (* Expand complex pseudo-operations first; this may introduce new labels. *)
        val (expanded, newLabelCount) = expandComplexOperations (operations, labelCount)

        val () = printLowLevelCode(expanded, cvec)

        local
            val initialBytesList = codeGenerate expanded
        in
            (* Fixup labels and shrink long branches to short. *)
            val (bytesList, codeSize) = fixupLabels(expanded, initialBytesList, newLabelCount)
        end

        local
            (* Extract the constants and the location of the references from the code. *)
            val (inlineConstants, addressConstants, nonAddressConstants) = getConstants(expanded, bytesList)

            (* Sort the non-address constants to remove duplicates.  There don't seem to be
               many in practice.
               Since we're not actually interested in the order but only
               sorting to remove duplicates we can use a stripped-down Quicksort. *)
            fun sort([], out) = out
            |   sort((addr, median) :: tl, out) = partition(median, tl, [addr], [], [], out)

            and partition(median, [], addrs, less, greater, out) =
                    sort(less, sort(greater, (addrs, median) :: out))
            |   partition(median, (entry as (addr, value)) :: tl, addrs, less, greater, out) =
                    if value = median
                    then partition(median, tl, addr::addrs, less, greater, out)
                    else if value < median
                    then partition(median, tl, addrs, entry :: less, greater, out)
                    else partition(median, tl, addrs, less, entry :: greater, out)

            (* Address constants.  We can't use any ordering on them because a GC could
               change the values half way through the sort.  Instead we use a simple search
               for a small number of constants and use an RTS call for larger numbers.  We
               want to avoid quadratic cost when there are large numbers. *)
            val sortedConstants =
                if List.length addressConstants < 10
                then
                let
                    (* Few constants: quadratic duplicate-removal using pointer
                       equality, which remains valid even if a GC moves the values. *)
                    fun findDups([], out) = out
                    |   findDups((addr, value) :: tl, out) =
                        let
                            fun partition(e as (a, v), (eq, neq)) =
                                if PolyML.pointerEq(value, v)
                                then (a :: eq, neq)
                                else (eq, e :: neq)
                            val (eqAddr, neq) = List.foldl partition ([addr], []) tl
                        in
                            findDups(neq, (eqAddr, value) :: out)
                        end
                in
                    findDups(addressConstants, [])
                end
                else
                let
                    (* Many constants: sort the array of addresses in the RTS,
                       which can do this safely, then merge adjacent duplicates. *)
                    fun swap (a, b) = (b, a)
                    val arrayToSort: (machineWord * word) array =
                        Array.fromList (List.map swap addressConstants)
                    val _ = sortFunction arrayToSort

                    fun makeList((v, a), []) = [([a], v)]
                    |   makeList((v, a), l as (aa, vv) :: tl) =
                        if PolyML.pointerEq(v, vv)
                        then (a :: aa, vv) :: tl
                        else ([a], v) :: l
                in
                    Array.foldl makeList [] arrayToSort
                end
        in
            val inlineConstants = inlineConstants
            and addressConstants = sortedConstants
            and nonAddressConstants = sort(nonAddressConstants, [])
        end

        (* Get the number of constants that need to be added to the address area. *)
        val constsInConstArea = List.length addressConstants

        local
            (* Add one byte for the HLT and round up to a number of words. *)
            val endOfCode = (codeSize+nativeWordSize) div nativeWordSize * (nativeWordSize div wordSize)
            val numOfNonAddrWords = Word.fromInt(List.length nonAddressConstants)
            (* Each entry in the non-address constant area is 8 bytes. *)
            val intSize = 0w8 div wordSize
        in
            (* Word offset of the end of the code plus the non-address constants. *)
            val endOfByteArea = endOfCode + numOfNonAddrWords * intSize
            (* +4 for no of consts. function name, profile object and offset to start of consts. *)
            val segSize = endOfByteArea + Word.fromInt constsInConstArea + 0w4
        end

        (* Create a byte vector and copy the data in.  This is a byte area and not scanned
           by the GC so cannot contain any addresses. *)
        val byteVec = byteVecMake segSize
        val ic = ref 0w0

        local
            fun genByte (ival: Word8.word) = set8u (ival, !ic, byteVec) before ic := !ic + 0w1
        in
            fun genBytes l = Word8Vector.app (fn i => genByte i) l
            val () = List.app (fn b => genBytes b) bytesList
            val () = genBytes(Word8Vector.fromList(opCodeBytes(HLT, NONE))) (* Marker - this is used by ScanConstants in the RTS. *)
        end

        (* Align ic onto a fullword boundary. *)
        val () = ic := ((!ic + nativeWordSize - 0w1) andb ~nativeWordSize)

        (* Copy the non-address constants.  These are only used in 64-bit mode and are
           either real constants or integers that are too large to fit in a 32-bit
           inline constants.  We don't use this for real constants in 32-bit mode because
           we don't have relative addressing.  Instead a real constant is the inline
           address of a boxed real number. *)
        local
            fun putNonAddrConst(addrs, constant) =
            let
                val addrOfConst = ! ic
                val () = genBytes(Word8Vector.fromList(largeWordToBytes(constant, 8)))
                (* Patch each referring instruction with the offset of the constant
                   relative to the end of the 32-bit displacement field. *)
                fun setAddr addr = set32s(Word.toLargeInt(addrOfConst - addr - 0w4), addr, byteVec)
            in
                List.app setAddr addrs
            end
        in
            val () = List.app putNonAddrConst nonAddressConstants
        end

        (* Consistency check: the byte area must have been filled exactly. *)
        val _ = bytesToWords(! ic) = endOfByteArea orelse raise InternalError "mismatch"

        (* Put in the number of constants.  This must go in before we actually put
           in any constants.  In 32-bit mode there are only two constants: the
           function name and the profile object.
           All other constants are in the code. *)
        local
            val lastWord = wordsToBytes(endOfByteArea + 0w3 + Word.fromInt constsInConstArea)

            (* Write [count] bytes of [ival] into the byte vector, little-endian. *)
            fun setBytes(_, _, 0) = ()
            |   setBytes(ival, offset, count) =
                (
                    byteVecSet(byteVec, offset, Word8.fromLargeInt(ival mod 256));
                    setBytes(ival div 256, offset+0w1, count-1)
                )
        in
            val () = setBytes(LargeInt.fromInt(2 + constsInConstArea), wordsToBytes endOfByteArea, Word.toInt wordSize)
            (* Set the last word of the code to the (negative) byte offset of the start of the code area
               from the end of this word. *)
            val () = setBytes(Word.toLargeIntX(wordsToBytes endOfByteArea - lastWord), lastWord, Word.toInt wordSize)
        end;

        (* We've put in all the byte data so it is safe to convert this to a mutable code
           cell that can contain addresses and will be scanned by the GC. *)
        val codeSeg = byteVecToCodeVec(byteVec, resultClosure)

        (* Various RTS functions assume that the first constant is the function name.
           The profiler assumes that the second word is the address of the mutable that
           contains the profile count. *)
        val () = codeVecPutWord (codeSeg, endOfByteArea + 0w1, toMachineWord procName)
        (* Next the profile object. *)
        val () = codeVecPutWord (codeSeg, endOfByteArea + 0w2, profileObject)
    in
        let
            (* Write [count] bytes of [b] into the code cell, little-endian. *)
            fun setBytes(_, _, 0w0) = ()
            |   setBytes(b, addr, count) =
                (
                    codeVecSet (codeSeg, addr, wordToWord8 b);
                    setBytes(b >> 0w8, addr+0w1, count-0w1)
                )

            (* Inline constants - native 32-bit only plus one special case in 32-in-64 *)
            fun putInlConst (addrs, SelfAddress) =
                    (* Self address goes inline. *)
                    codeVecPutConstant (codeSeg, addrs, toMachineWord(codeVecAddr codeSeg), ConstAbsolute)
            |   putInlConst (addrs, InlineAbsoluteAddress m) =
                    codeVecPutConstant (codeSeg, addrs, m, ConstAbsolute)
            |   putInlConst (addrs, InlineRelativeAddress m) =
                    codeVecPutConstant (codeSeg, addrs, m, ConstX86Relative)

            val _ = List.app putInlConst inlineConstants

            (* Address constants - native 64-bit and 32-in-64. *)
            fun putAddrConst ((addrs, m), constAddr) =
                (* Put the constant in the constant area and set the original address
                   to be the relative offset to the constant itself. *)
                (
                    codeVecPutWord (codeSeg, constAddr, m);
                    (* Put in the 32-bit offset - always unsigned since the destination
                       is after the reference. *)
                    List.app(fn addr => setBytes(constAddr * wordSize - addr - 0w4, addr, 0w4)) addrs;
                    constAddr+0w1
                )

            (* Put the constants.  Any values in the constant area start at +3 i.e. after the profile. *)
            val _ = List.foldl putAddrConst (endOfByteArea+0w3) addressConstants

            val () =
                if printAssemblyCode
                then (* print out the code *)
                (
                    printCode(cvec, codeSeg);
                    printStream "\n\n"
                )
                else ()
        in
            (* Finally lock the code. *)
            codeVecLock(codeSeg, resultClosure)
        end (* the result *)
    end (* generateCode *)

    (* Re-export the types so that clients can establish sharing constraints. *)
    structure Sharing =
    struct
        type code = code
        and reg = reg
        and genReg = genReg
        and fpReg = fpReg
        and addrs = addrs
        and operation = operation
        and regSet = RegSet.regSet
        and label = label
        and branchOps = branchOps
        and arithOp = arithOp
        and shiftType = shiftType
        and repOps = repOps
        and fpOps = fpOps
        and fpUnaryOps = fpUnaryOps
        and sse2Operations = sse2Operations
        and opSize = opSize
        and closureRef = closureRef
    end

end (* struct *) (* CODECONS *);