/*
 * libbinrec: a recompiling translator for machine code
 * Copyright (c) 2016 Andrew Church
 *
 * This software may be copied and redistributed under certain conditions;
 * see the file "COPYING" in the source code distribution for details.
 * NO WARRANTY is provided with this software.
 */

#include "src/common.h"
#include "src/bitutils.h"
#include "src/endian.h"
#include "src/host-x86.h"
#include "src/host-x86/host-x86-internal.h"
#include "src/host-x86/host-x86-opcodes.h"
#include "src/rtl-internal.h"

/*************************************************************************/
/************* Local constant and data structure definitions *************/
/*************************************************************************/

/*
 * Table of local constant data to insert in the prologue.
 *
 * Each entry is a 16-byte image indexed by an LC_* identifier.  Bytes are
 * listed in memory order; entries with fewer than 16 initializers are
 * zero-filled by the usual C aggregate initialization rules.  The
 * *_SIGNBIT entries have only the top bit of each element set (0x80 in
 * the last byte of each 4- or 8-byte element), and the *_INV_SIGNBIT
 * entries are their bitwise complements within each element.
 */
static const struct {
    uint8_t data[16];
} local_constants[NUM_LOCAL_CONSTANTS] = {
    /* One 32-bit element: sign bit only / everything but the sign bit. */
    [LC_FLOAT32_SIGNBIT       ] = {{0x00,0x00,0x00,0x80}},
    [LC_FLOAT32_INV_SIGNBIT   ] = {{0xFF,0xFF,0xFF,0x7F}},
    /* One 64-bit element. */
    [LC_FLOAT64_SIGNBIT       ] = {{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80}},
    [LC_FLOAT64_INV_SIGNBIT   ] = {{0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x7F}},
    /* Two 32-bit elements. */
    [LC_V2_FLOAT32_SIGNBIT    ] = {{0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x80}},
    [LC_V2_FLOAT32_INV_SIGNBIT] = {{0xFF,0xFF,0xFF,0x7F,0xFF,0xFF,0xFF,0x7F}},
    /* Two 64-bit elements. */
    [LC_V2_FLOAT64_SIGNBIT    ] = {{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
                                    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80}},
    [LC_V2_FLOAT64_INV_SIGNBIT] = {{0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x7F,
                                    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x7F}},
    /* Low 8 bytes zero; high 8 bytes hold the byte pattern 00 00 80 3F
     * twice (0x3F800000 when read as a little-endian 32-bit word, which
     * is 1.0 as an IEEE 754 single-precision value). */
    [LC_V2_FLOAT32_HIGH_ONES  ] = {{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
                                    0x00,0x00,0x80,0x3F,0x00,0x00,0x80,0x3F}},
};

/*-----------------------------------------------------------------------*/

/**
 * CodeBuffer:  Structure encapsulating an output code buffer and its
 * allocated size and current length.  Used to help optimization by
 * letting the compiler know it doesn't have to write size data back to
 * the handle every few bytes.
*/ typedef struct CodeBuffer { uint8_t * restrict buffer; long buffer_size; long len; } CodeBuffer; /*************************************************************************/ /************************ Basic utility routines *************************/ /*************************************************************************/ /** * current_reg: Helper function to return the RTL register currently * occupying the given host register. Returns zero if ctx->reg_map[] has * a nonzero entry but that register is already dead, or if host_reg is * occupied by the destination of the current instruction and that * register dies immediately (a dead store). * * [Parameters] * ctx: Translation context. * insn_index: Index of current instruction in ctx->unit->insns[]. * host_reg: Host register to look up (X86Register). * [Return value] * Index of the RTL register using the given host register, or 0 if * the host register is not in use. */ static inline PURE_FUNCTION int current_reg( const HostX86Context *ctx, int insn_index, X86Register host_reg) { const int rtl_reg = ctx->reg_map[host_reg]; if (rtl_reg && ctx->unit->regs[rtl_reg].death > insn_index) { return rtl_reg; } else { return 0; } } /*-----------------------------------------------------------------------*/ /** * is_spilled: Helper function to return whether a register is currently * spilled. * * [Parameters] * ctx: Translation context. * insn_index: Index of current instruction in ctx->unit->insns[]. * reg: RTL register number. * [Return value] * True if the register is spilled at insn_index, false if not. 
*/ static inline PURE_FUNCTION bool is_spilled( const HostX86Context *ctx, int insn_index, int reg) { const HostX86RegInfo *reg_info = &ctx->regs[reg]; return reg_info->spilled && reg_info->spill_insn <= insn_index; } /*-----------------------------------------------------------------------*/ /** * sse_opcode_prefix_for_type: Return the opcode prefix byte (0x66, 0xF3, * 0xF2, or 0 for no prefix) to use with SSE opcodes for the given data type. * * [Parameters] * type: RTL data type. * [Return value] * Prefix byte, or 0 for no prefix. */ static inline CONST_FUNCTION uint8_t sse_opcode_prefix_for_type( RTLDataType type) { switch (type) { case RTLTYPE_V2_FLOAT64: return 0x66; case RTLTYPE_FLOAT32: return 0xF3; case RTLTYPE_FLOAT64: return 0xF2; default: return 0; } } /*-----------------------------------------------------------------------*/ /** * vex_pp_for_opcode_prefix: Return the 2-bit VEX "pp" field value * corresponding to the given opcode prefix byte (0x66, 0xF3, 0xF2, or * 0 for no prefix). * * [Parameters] * prefix: Prefix byte, or 0 for no prefix. * [Return value] * VEX.pp value corresponding to the prefix. */ static inline CONST_FUNCTION uint8_t vex_pp_for_opcode_prefix(uint8_t prefix) { switch (prefix) { case 0x66: return 1; case 0xF3: return 2; case 0xF2: return 3; default: ASSERT(prefix == 0); return 0; } } /*-----------------------------------------------------------------------*/ /** * rtlfexc_to_bits: Return an MXCSR bitmask corresponding to the * exception(s) specified by exc, or 0 if exc is invalid. 
*/
static inline CONST_FUNCTION uint8_t rtlfexc_to_bits(RTLFloatException exc)
{
    uint8_t bits = 0;  /* Result for any value not listed below. */

    switch (exc) {
        case RTLFEXC_ANY:         bits = 0x3D; break;
        case RTLFEXC_INEXACT:     bits = 0x20; break;
        case RTLFEXC_INVALID:     bits = 0x01; break;
        case RTLFEXC_OVERFLOW:    bits = 0x08; break;
        case RTLFEXC_UNDERFLOW:   bits = 0x10; break;
        case RTLFEXC_ZERO_DIVIDE: bits = 0x04; break;
    }

    return bits;
}

/*************************************************************************/
/*************** Utility routines for adding instructions ****************/
/*************************************************************************/

/**
 * append_opcode:  Append an x86 opcode to the current code stream.  The
 * opcode value occupies one to four bytes depending on its magnitude and
 * is written most-significant byte first.  The code buffer is assumed to
 * have enough space for the instruction.
 */
static inline void append_opcode(CodeBuffer *code, X86Opcode opcode)
{
    const uint32_t value = (uint32_t)opcode;
    uint8_t *out = code->buffer + code->len;

    int num_bytes;
    if (value <= 0xFF) {
        num_bytes = 1;
    } else if (value <= 0xFFFF) {
        num_bytes = 2;
    } else if (value <= 0xFFFFFF) {
        num_bytes = 3;
    } else {
        num_bytes = 4;
    }

    ASSERT(code->len + num_bytes <= code->buffer_size);
    code->len += num_bytes;
    for (int shift = (num_bytes - 1) * 8; shift >= 0; shift -= 8) {
        *out++ = (uint8_t)(value >> shift);
    }
}

/*-----------------------------------------------------------------------*/

/**
 * append_rex_opcode:  Append an x86 opcode with a REX prefix to the
 * current code stream.  The code buffer is assumed to have enough space
 * for the instruction.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     rex: REX flags (bitwise OR of X86_REX_* or X86OP_REX_*).
 *     opcode: Opcode to append.
*/
static inline void append_rex_opcode(CodeBuffer *code, uint8_t rex,
                                     X86Opcode opcode)
{
    const uint32_t value = (uint32_t)opcode;
    const uint8_t rex_byte = (uint8_t)(rex | X86OP_REX);
    uint8_t *out = code->buffer + code->len;

    /* One- and two-byte opcodes: the REX byte simply goes first. */
    if (value <= 0xFFFF) {
        const int total = (value <= 0xFF) ? 2 : 3;
        ASSERT(code->len + total <= code->buffer_size);
        code->len += total;
        *out++ = rex_byte;
        if (value > 0xFF) {
            *out++ = (uint8_t)(value >> 8);
        }
        *out++ = (uint8_t)value;
        return;
    }

    /* Three-byte opcodes: if the leading byte is a 66/F2/F3 prefix, the
     * REX byte has to come after it; otherwise REX still goes first. */
    if (value <= 0xFFFFFF) {
        const uint8_t lead = (uint8_t)(value >> 16);
        const bool lead_is_prefix =
            (lead == 0x66 || lead == 0xF2 || lead == 0xF3);
        ASSERT(code->len + 4 <= code->buffer_size);
        code->len += 4;
        if (lead_is_prefix) {
            *out++ = lead;
            *out++ = rex_byte;
        } else {
            *out++ = rex_byte;
            *out++ = lead;
        }
        *out++ = (uint8_t)(value >> 8);
        *out++ = (uint8_t)value;
        return;
    }

    /* Four-byte opcodes: the leading byte is required to be a prefix,
     * and the REX byte is inserted immediately after it. */
    ASSERT(code->len + 5 <= code->buffer_size);
    code->len += 5;
    const uint8_t prefix = (uint8_t)(value >> 24);
    ASSERT(prefix == 0x66 || prefix == 0xF2 || prefix == 0xF3);
    *out++ = prefix;
    *out++ = rex_byte;
    *out++ = (uint8_t)(value >> 16);
    *out++ = (uint8_t)(value >> 8);
    *out++ = (uint8_t)value;
}

/*-----------------------------------------------------------------------*/

/**
 * maybe_append_empty_rex:  Append an empty REX prefix (0x40) if
 * host_bytereg is one of X86_SP, X86_BP, X86_SI, or X86_DI and both
 * host_other1 and host_other2 are -1 (indicating no register) or one of
 * X86_AX through X86_DI.  These are the conditions under which a REX
 * prefix is required (even if empty) to access host_bytereg as a byte
 * register; without REX, the corresponding values for the register field
 * in the opcode map to AH through DH instead.
 */
static inline void maybe_append_empty_rex(
    CodeBuffer *code, int host_bytereg, int host_other1, int host_other2)
{
    const bool bytereg_needs_rex =
        (host_bytereg >= X86_SP && host_bytereg <= X86_DI);
    /* If either other register is above X86_DI, this function adds
     * nothing. */
    const bool others_are_low =
        (host_other1 <= X86_DI && host_other2 <= X86_DI);

    if (bytereg_needs_rex && others_are_low) {
        append_opcode(code, X86OP_REX);
    }
}

/*-----------------------------------------------------------------------*/

/**
 * append_vex_opcode:  Append an x86 opcode with a VEX prefix to the
 * current code stream.
The code buffer is assumed to have enough space * for the instruction. * * [Parameters] * handle: Translation handle. * opcode: Opcode to append. * vex_W: True to set the W field of the VEX prefix. * vex_L: True to set the L field of the VEX prefix. * vex_R: True to set the R field of the VEX prefix (to 0). * vex_X: True to set the X field of the VEX prefix (to 0). * vex_B: True to set the B field of the VEX prefix (to 0). * vex_vvvv: Register number for the vvvv field of the VEX prefix, * _not_ complemented. Pass 0 (not 15) if the field is not used. */ static inline void append_vex_opcode( CodeBuffer *code, X86Opcode opcode, bool vex_W, bool vex_L, bool vex_R, bool vex_X, bool vex_B, int vex_vvvv) { ASSERT((uint32_t)opcode > 0xFF); uint8_t *ptr = code->buffer + code->len; /* Currently we only use this function with instructions using the * 0F 38 escape bytes, so we always use the 3-byte VEX prefix. For * plain 0F opcodes, we could potentially use the 2-byte VEX format * instead. */ ASSERT((opcode & 0xFFFFFF) >= 0x0F3800 && (opcode & 0xFFFFFF) <= 0x0F38FF); const uint8_t prefix_byte = opcode >> 24; const uint8_t vex_pp = vex_pp_for_opcode_prefix(prefix_byte); ASSERT(code->len + 4 <= code->buffer_size); code->len += 4; *ptr++ = X86OP_VEX3; *ptr++ = ((vex_R<<7 | vex_X<<6 | vex_B<<5) ^ 0xE0) | 0x02; *ptr++ = vex_W<<7 | (~vex_vvvv & 15) << 3 | vex_L<<2 | vex_pp; *ptr++ = opcode & 0xFF; } /*-----------------------------------------------------------------------*/ /** * append_imm8: Append an 8-bit immediate value to the current code stream. * The code buffer is assumed to have enough space. */ static inline void append_imm8(CodeBuffer *code, uint8_t value) { uint8_t *ptr = code->buffer + code->len; ASSERT(code->len + 1 <= code->buffer_size); code->len += 1; *ptr++ = value; } /*-----------------------------------------------------------------------*/ /** * append_imm16: Append a 16-bit immediate value to the current code stream. 
* The code buffer is assumed to have enough space.
 */
static inline void append_imm16(CodeBuffer *code, uint16_t value)
{
    uint8_t *out = code->buffer + code->len;
    ASSERT(code->len + 2 <= code->buffer_size);
    /* Low byte first. */
    out[0] = (uint8_t)(value >> 0);
    out[1] = (uint8_t)(value >> 8);
    code->len += 2;
}

/*-----------------------------------------------------------------------*/

/**
 * append_imm32:  Append a 32-bit immediate value to the current code
 * stream, least-significant byte first.  The code buffer is assumed to
 * have enough space.
 */
static inline void append_imm32(CodeBuffer *code, uint32_t value)
{
    uint8_t *out = code->buffer + code->len;
    ASSERT(code->len + 4 <= code->buffer_size);
    for (int i = 0; i < 4; i++) {
        out[i] = (uint8_t)(value >> (8 * i));
    }
    code->len += 4;
}

/*-----------------------------------------------------------------------*/

/**
 * append_imm64:  Append a 64-bit immediate value to the current code
 * stream, least-significant byte first.  The code buffer is assumed to
 * have enough space.
 */
static inline void append_imm64(CodeBuffer *code, uint64_t value)
{
    uint8_t *out = code->buffer + code->len;
    ASSERT(code->len + 8 <= code->buffer_size);
    for (int i = 0; i < 8; i++) {
        out[i] = (uint8_t)(value >> (8 * i));
    }
    code->len += 8;
}

/*-----------------------------------------------------------------------*/

/**
 * append_ModRM:  Append a ModR/M byte to the current code stream.
 * The code buffer is assumed to have enough space.
*/ static inline void append_ModRM(CodeBuffer *code, X86Mod mod, int reg_opcode, int r_m) { uint8_t *ptr = code->buffer + code->len; ASSERT(code->len + 1 <= code->buffer_size); code->len += 1; *ptr++ = x86_ModRM(mod, reg_opcode, r_m); } /*-----------------------------------------------------------------------*/ /** * append_ModRM: Append a ModR/M and SIB byte pair to the current code * stream. The code buffer is assumed to have enough space. */ static inline void append_ModRM_SIB( CodeBuffer *code, X86Mod mod, int reg_opcode, int scale, int index, int base) { uint8_t *ptr = code->buffer + code->len; ASSERT(code->len + 2 <= code->buffer_size); code->len += 2; *ptr++ = x86_ModRM(mod, reg_opcode, X86MODRM_SIB); *ptr++ = x86_SIB(scale, index, base); } /*-----------------------------------------------------------------------*/ /** * append_insn: Append an instruction which takes no operands. * * [Parameters] * code: Output code buffer. * is64: True to prepend REX.W to an integer instruction, false otherwise. * opcode: Instruction opcode. */ static inline void append_insn(CodeBuffer *code, bool is64, X86Opcode opcode) { if (is64) { append_rex_opcode(code, X86_REX_W, opcode); } else { append_opcode(code, opcode); } } /*-----------------------------------------------------------------------*/ /** * append_insn_R: Append an instruction which takes a single register * operand encoded in the opcode itself (such as PUSH). * * [Parameters] * code: Output code buffer. * is64: True to prepend REX.W to an integer instruction, false otherwise. * opcode: Instruction opcode. * reg: Register. */ static inline void append_insn_R( CodeBuffer *code, bool is64, X86Opcode opcode, X86Register reg) { uint8_t rex = is64 ? 
X86_REX_W : 0; if (reg & 8) { rex |= X86_REX_B; } if (rex) { append_rex_opcode(code, rex, opcode | (reg & 7)); } else { append_opcode(code, opcode | (reg & 7)); } } /*-----------------------------------------------------------------------*/ /** * append_insn_ModRM_reg: Append an instruction which takes a ModR/M byte, * encoding a register EA in the instruction. * * [Parameters] * code: Output code buffer. * is64: True to prepend REX.W to an integer instruction, false otherwise. * opcode: Instruction opcode. * reg1: Register or sub-opcode for ModR/M reg field. * reg2: Register for ModR/M r/m field. */ static inline void append_insn_ModRM_reg( CodeBuffer *code, bool is64, X86Opcode opcode, int reg1, X86Register reg2) { uint8_t rex = is64 ? X86_REX_W : 0; if (reg1 & 8) { rex |= X86_REX_R; } if (reg2 & 8) { rex |= X86_REX_B; } if (rex) { append_rex_opcode(code, rex, opcode); } else { append_opcode(code, opcode); } append_ModRM(code, X86MOD_REG, reg1 & 7, reg2 & 7); } /*-----------------------------------------------------------------------*/ /** * append_insn_ModRM_mem: Append an instruction which takes a ModR/M byte, * encoding a memory EA in the instruction. * * [Parameters] * code: Output code buffer. * is64: True to prepend REX.W to an integer instruction, false otherwise. * opcode: Instruction opcode. * reg: Register or sub-opcode for ModR/M reg field. * base: Base register for memory address. * index: Index register for memory address, or -1 to omit the index. * Must not be X86_SP. * offset: Constant offset for memory address. */ static inline void append_insn_ModRM_mem( CodeBuffer *code, bool is64, X86Opcode opcode, int reg, X86Register base, int index, int32_t offset) { uint8_t rex = is64 ? 
X86_REX_W : 0; if (reg & 8) { rex |= X86_REX_R; } if (index >= 0 && (index & 8)) { rex |= X86_REX_X; } if (base & 8) { rex |= X86_REX_B; } if (rex) { append_rex_opcode(code, rex, opcode); } else { append_opcode(code, opcode); } X86Mod mod; if (offset == 0) { /* The x86 ISA doesn't allow dereferencing rBP with no offset; * a ModR/M byte with mod=0, r/m=5 (BP) is interpreted as an * absolute address (RIP-relative in x86-64 mode). Instead, * we have to encode it as an 8-bit displacement of zero. * This also applies to R13 in x86-64 mode, since the hardware * does not check REX.B before invoking special handling for * that ModR/M combination. */ if (base == X86_BP || base == X86_R13) { mod = X86MOD_DISP8; } else { mod = X86MOD_DISP0; } } else if ((uint32_t)offset + 128 < 256) { // [-128,+127] mod = X86MOD_DISP8; } else { mod = X86MOD_DISP32; } if (index >= 0) { append_ModRM_SIB(code, mod, reg & 7, 0, index & 7, base & 7); } else if (base == X86_SP || base == X86_R12) { /* SP (4) in the r/m field is used to indicate the presence of a * SIB byte, so we have to encode SP references using SIB. This * also applies to R12, for the same reason as R13 with respect * to BP (see above). */ append_ModRM_SIB(code, mod, reg & 7, 0, X86SIB_NOINDEX, X86_SP); } else { append_ModRM(code, mod, reg & 7, base & 7); } if (mod == X86MOD_DISP8) { append_imm8(code, (uint8_t)offset); } else if (mod == X86MOD_DISP32) { append_imm32(code, (uint32_t)offset); } } /*-----------------------------------------------------------------------*/ /** * append_insn_ModRM_ctx: Append an instruction which takes a ModR/M byte, * encoding an EA appropriate to the given source RTL register. * * [Parameters] * code: Output code buffer. * is64: True to prepend REX.W to an integer instruction, false otherwise. * opcode: Instruction opcode. * reg1: Register index or sub-opcode for ModR/M reg field. * ctx: Translation context. * insn_index: Index of current instruction in ctx->unit->insns[]. 
* rtl_reg2: RTL register index from which to set ModR/M r/m field. */ static inline void append_insn_ModRM_ctx( CodeBuffer *code, bool is64, X86Opcode opcode, int reg1, HostX86Context *ctx, int insn_index, int rtl_reg2) { if (is_spilled(ctx, insn_index, rtl_reg2)) { append_insn_ModRM_mem(code, is64, opcode, reg1, X86_SP, -1, ctx->regs[rtl_reg2].spill_offset); } else { append_insn_ModRM_reg(code, is64, opcode, reg1, ctx->regs[rtl_reg2].host_reg); } } /*-----------------------------------------------------------------------*/ /** * append_insn_ModRM_riprel: Append an instruction which takes a ModR/M * byte, encoding an EA using RIP-relative addressing. The instruction is * assumed not to have any additional bytes (such as immediate data) after * the EA displacement. * * [Parameters] * code: Output code buffer. * is64: True to prepend REX.W to an integer instruction, false otherwise. * opcode: Instruction opcode. * reg: Register for ModR/M reg field. * offset: Offset of the address to encode, counting from the base of * the code buffer. */ static inline void append_insn_ModRM_riprel( CodeBuffer *code, bool is64, X86Opcode opcode, X86Register reg, long offset) { uint8_t rex = is64 ? X86_REX_W : 0; if (reg & 8) { rex |= X86_REX_R; } if (rex) { append_rex_opcode(code, rex, opcode); } else { append_opcode(code, opcode); } append_ModRM(code, X86MOD_DISP0, reg & 7, X86MODRM_RIP_REL); /* Displacement is measured from the end of this instruction. */ const long disp = offset - (code->len + 4); ASSERT((uint64_t)disp + 0x80000000 < UINT64_C(0x100000000)); append_imm32(code, disp); } /*-----------------------------------------------------------------------*/ /** * append_vex_insn_ModRM_reg: Append a VEX-format instruction, encoding a * register EA. * * [Parameters] * code: Output code buffer. * vex_W: True to set VEX.W on the instruction, false otherwise. * vex_L: True to set VEX.L on the instruction, false otherwise. * opcode: Instruction opcode. 
* reg1: Register or sub-opcode for ModR/M reg field. * reg2: Register for ModR/M r/m field. * reg3: Register for VEX vvvv field, or 0 if no third register. */ static inline void append_vex_insn_ModRM_reg( CodeBuffer *code, bool vex_W, bool vex_L, X86Opcode opcode, int reg1, X86Register reg2, X86Register reg3) { append_vex_opcode(code, opcode, vex_W, vex_L, (reg1 & 8) != 0, false, (reg2 & 8) != 0, reg3 & 15); append_ModRM(code, X86MOD_REG, reg1 & 7, reg2 & 7); } /*-----------------------------------------------------------------------*/ /** * append_vex_insn_ModRM_mem: Append a VEX-format instruction, encoding a * memory EA. * * [Parameters] * code: Output code buffer. * vex_W: True to set VEX.W on the instruction, false otherwise. * vex_L: True to set VEX.L on the instruction, false otherwise. * opcode: Instruction opcode. * reg1: Register or sub-opcode for ModR/M reg field. * base: Base register for memory address. * index: Index register for memory address, or -1 to omit the index. * Must not be X86_SP. * offset: Constant offset for memory address. * reg3: Register for VEX vvvv field, or 0 if no third register. */ static inline void append_vex_insn_ModRM_mem( CodeBuffer *code, bool vex_W, bool vex_L, X86Opcode opcode, int reg1, X86Register base, int index, int32_t offset, X86Register reg3) { /* Currently, we only call this with stack references, so many of * these checks are meaningless. 
*/ const bool vex_R = (reg1 & 8) != 0; ASSERT(index < 0); const bool vex_X = false; // index >= 0 && (index & 8) != 0 ASSERT(base == X86_SP); const bool vex_B = false; // (base & 8) != 0 append_vex_opcode(code, opcode, vex_W, vex_L, vex_R, vex_X, vex_B, reg3 & 15); X86Mod mod; if (offset == 0) { mod = X86MOD_DISP0; } else if ((uint32_t)offset + 128 < 256) { // [-128,+127] mod = X86MOD_DISP8; } else { mod = X86MOD_DISP32; } append_ModRM_SIB(code, mod, reg1 & 7, 0, X86SIB_NOINDEX, X86_SP); if (mod == X86MOD_DISP8) { append_imm8(code, (uint8_t)offset); } else if (mod == X86MOD_DISP32) { append_imm32(code, (uint32_t)offset); } } /*-----------------------------------------------------------------------*/ /** * append_vex_insn_ModRM_ctx: Append a VEX-format instruction, encoding * an EA appropriate to the given source RTL register. * * [Parameters] * code: Output code buffer. * vex_W: True to set VEX.W on the instruction, false otherwise. * vex_L: True to set VEX.L on the instruction, false otherwise. * opcode: Instruction opcode. * reg1: Register or sub-opcode for ModR/M reg field. * ctx: Translation context. * insn_index: Index of current instruction in ctx->unit->insns[]. * rtl_reg2: RTL register index from which to set ModR/M r/m field. * reg3: Register for VEX vvvv field, or 0 if no third register. */ static inline void append_vex_insn_ModRM_ctx( CodeBuffer *code, bool vex_W, bool vex_L, X86Opcode opcode, int reg1, HostX86Context *ctx, int insn_index, int rtl_reg2, X86Register reg3) { if (is_spilled(ctx, insn_index, rtl_reg2)) { append_vex_insn_ModRM_mem(code, vex_W, vex_L, opcode, reg1, X86_SP, -1, ctx->regs[rtl_reg2].spill_offset, reg3); } else { append_vex_insn_ModRM_reg(code, vex_W, vex_L, opcode, reg1, ctx->regs[rtl_reg2].host_reg, reg3); } } /*-----------------------------------------------------------------------*/ /** * append_nops: Append no-op instructions totaling the given number of * bytes. * * [Parameters] * code: Output code buffer. 
*     len: Number of bytes to append.
 */
static void append_nops(CodeBuffer *code, int len)
{
    ASSERT(len >= 0);
    ASSERT(len < 16);

    /* Lengths 9 through 15 are built from a leading NOP of (len - 9)
     * bytes followed by the 9-byte NOP; lengths 0 through 8 consist of
     * the leading NOP alone. */
    const bool add_nop9 = (len >= 9 && len <= 15);
    const int head_len = add_nop9 ? len - 9 : len;

    switch (head_len) {
      case 8:
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM_SIB(code, X86MOD_DISP32, 0, 0, 0, 0);
        append_imm32(code, 0);
        break;
      case 7:
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM(code, X86MOD_DISP32, 0, 0);
        append_imm32(code, 0);
        break;
      case 6:
        append_opcode(code, X86OP_OPERAND_SIZE);
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM_SIB(code, X86MOD_DISP8, 0, 0, 0, 0);
        append_imm8(code, 0);
        break;
      case 5:
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM_SIB(code, X86MOD_DISP8, 0, 0, 0, 0);
        append_imm8(code, 0);
        break;
      case 4:
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM(code, X86MOD_DISP8, 0, 0);
        append_imm8(code, 0);
        break;
      case 3:
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM(code, X86MOD_DISP0, 0, 0);
        break;
      case 2:
        append_opcode(code, X86OP_OPERAND_SIZE);
        append_opcode(code, X86OP_NOP);
        break;
      case 1:
        append_opcode(code, X86OP_NOP);
        break;
      default:
        /* Nothing to emit ahead of the 9-byte NOP (or at all). */
        break;
    }

    if (add_nop9) {
        append_opcode(code, X86OP_OPERAND_SIZE);
        append_opcode(code, X86OP_NOP_Ev);
        append_ModRM_SIB(code, X86MOD_DISP32, 0, 0, 0, 0);
        append_imm32(code, 0);
    }
}

/*-----------------------------------------------------------------------*/

/**
 * append_move:  Append an instruction to copy (MOV) one register to another.
* * [Parameters] * code: Output code buffer. * type: Register type (RTLTYPE_*). * host_dest: Destination host register. * host_src: Source host register. */ static inline void append_move(CodeBuffer *code, RTLDataType type, X86Register host_dest, X86Register host_src) { switch (type) { case RTLTYPE_INT32: case RTLTYPE_FPSTATE: ASSERT(host_dest <= X86_R15); ASSERT(host_src <= X86_R15); append_insn_ModRM_reg(code, false, X86OP_MOV_Gv_Ev, host_dest, host_src); return; case RTLTYPE_INT64: case RTLTYPE_ADDRESS: ASSERT(host_dest <= X86_R15); ASSERT(host_src <= X86_R15); append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, host_dest, host_src); return; default: ASSERT(!rtl_type_is_int(type)); ASSERT(host_dest >= X86_XMM0); ASSERT(host_src >= X86_XMM0); /* The Intel optimization guidelines state: (1) avoid mixed use of * integer/FP operations on the same register (thus MOVAPS instead * of MOVDQA); (2) use PS instead of PD if both operations are * bitwise-equivalent (thus MOVAPS even for double-precision types). */ append_insn_ModRM_reg(code, false, X86OP_MOVAPS_V_W, host_dest, host_src); return; } } /*-----------------------------------------------------------------------*/ /** * append_move_gpr: Append an instruction to copy (MOV) one integer * register to another. * * Specialization of append_move() for GPRs. */ static inline void append_move_gpr(CodeBuffer *code, RTLDataType type, X86Register host_dest, X86Register host_src) { ASSERT(rtl_type_is_int(type) || type == RTLTYPE_FPSTATE); append_insn_ModRM_reg(code, int_type_is_64(type), X86OP_MOV_Gv_Ev, host_dest, host_src); } /*-----------------------------------------------------------------------*/ /** * append_load_imm_gpr: Append an instruction to load an immediate value * into an integer register. * * [Parameters] * code: Output code buffer. * host_dest: Destination register. * imm: Immediate value to load. 
*/
static inline void append_load_imm_gpr(CodeBuffer *code,
                                       X86Register host_dest, uint64_t imm)
{
    const bool upper_half_zero = (imm <= UINT64_C(0xFFFFFFFF));
    const bool upper_half_sign = (imm >= UINT64_C(0xFFFFFFFF80000000));

    if (upper_half_zero) {
        /* Value fits in 32 bits unsigned: non-REX.W form with a 32-bit
         * immediate. */
        append_insn_R(code, false, X86OP_MOV_rAX_Iv, host_dest);
        append_imm32(code, (uint32_t)imm);
        return;
    }

    if (upper_half_sign) {
        /* Value is a negative number representable in 32 bits signed:
         * REX.W form with a 32-bit immediate. */
        append_insn_ModRM_reg(code, true, X86OP_MOV_Ev_Iz, 0, host_dest);
        append_imm32(code, (uint32_t)imm);
        return;
    }

    /* Anything else needs the full 64-bit immediate form. */
    append_insn_R(code, true, X86OP_MOV_rAX_Iv, host_dest);
    append_imm64(code, imm);
}

/*-----------------------------------------------------------------------*/

/**
 * append_load:  Append an instruction to load a register from a memory
 * location.  The memory address is assumed to be properly aligned.
 *
 * V2_FLOAT32 operands load a full XMM register (16 bytes).  To load only
 * the 8 bytes of actual data, use the FLOAT64 type.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     type: Register type (RTLTYPE_*).
 *     host_dest: Destination host register.
 *     host_base: Host register for memory address base.
 *     host_index: Host register for memory address index, or -1 if no index.
 *     offset: Access offset from base register.
*/ static inline void append_load( CodeBuffer *code, RTLDataType type, X86Register host_dest, X86Register host_base, int host_index, int32_t offset) { switch (type) { case RTLTYPE_INT32: case RTLTYPE_FPSTATE: append_insn_ModRM_mem(code, false, X86OP_MOV_Gv_Ev, host_dest, host_base, host_index, offset); return; case RTLTYPE_INT64: case RTLTYPE_ADDRESS: append_insn_ModRM_mem(code, true, X86OP_MOV_Gv_Ev, host_dest, host_base, host_index, offset); return; case RTLTYPE_FLOAT32: append_insn_ModRM_mem(code, false, X86OP_MOVSS_V_W, host_dest, host_base, host_index, offset); return; case RTLTYPE_FLOAT64: append_insn_ModRM_mem(code, false, X86OP_MOVSD_V_W, host_dest, host_base, host_index, offset); return; default: ASSERT(rtl_type_is_vector(type)); append_insn_ModRM_mem(code, false, X86OP_MOVAPS_V_W, host_dest, host_base, host_index, offset); return; } } /*-----------------------------------------------------------------------*/ /** * append_load_gpr: Append an instruction to load an integer register * from a memory location. The memory address is assumed to be properly * aligned. * * Specialization of append_load() for GPRs and host_index == -1. */ static inline void append_load_gpr( CodeBuffer *code, RTLDataType type, X86Register host_dest, X86Register host_base, int32_t offset) { ASSERT(rtl_type_is_int(type) || type == RTLTYPE_FPSTATE); append_insn_ModRM_mem(code, int_type_is_64(type), X86OP_MOV_Gv_Ev, host_dest, host_base, -1, offset); } /*-----------------------------------------------------------------------*/ /** * append_store: Append an instruction to store a register to a memory * location. The memory address is assumed to be properly aligned. * * V2_FLOAT32 operands store a full XMM register (16 bytes). To store * only the 8 bytes of actual data, use the FLOAT64 type. * * [Parameters] * code: Output code buffer. * type: Register type (RTLTYPE_*). * host_src: Destination host register. * host_base: Host register for memory address base. 
* host_index: Host register for memory address index, or -1 if no index. * offset: Access offset from base register. */ static inline void append_store( CodeBuffer *code, RTLDataType type, X86Register host_src, X86Register host_base, int host_index, int32_t offset) { switch (type) { case RTLTYPE_INT32: case RTLTYPE_FPSTATE: append_insn_ModRM_mem(code, false, X86OP_MOV_Ev_Gv, host_src, host_base, host_index, offset); return; case RTLTYPE_INT64: case RTLTYPE_ADDRESS: append_insn_ModRM_mem(code, true, X86OP_MOV_Ev_Gv, host_src, host_base, host_index, offset); return; case RTLTYPE_FLOAT32: append_insn_ModRM_mem(code, false, X86OP_MOVSS_W_V, host_src, host_base, host_index, offset); return; case RTLTYPE_FLOAT64: append_insn_ModRM_mem(code, false, X86OP_MOVSD_W_V, host_src, host_base, host_index, offset); return; default: ASSERT(rtl_type_is_vector(type)); append_insn_ModRM_mem(code, false, X86OP_MOVAPS_W_V, host_src, host_base, host_index, offset); return; } } /*-----------------------------------------------------------------------*/ /** * append_load_alias: Append an instruction to load the given alias from * its storage location. * * [Parameters] * code: Output code buffer. * ctx: Translation context. * alias: Alias register to load. * host_dest: Host register into which to load alias register value. */ static inline void append_load_alias( CodeBuffer *code, const HostX86Context *ctx, const RTLAlias *alias, X86Register host_dest) { const X86Register host_base = alias->base ? ctx->regs[alias->base].host_reg : X86_SP; append_load(code, alias->type, host_dest, host_base, -1, alias->offset); } /*-----------------------------------------------------------------------*/ /** * append_store_alias: Append an instruction to store the given alias to * its storage location. * * [Parameters] * code: Output code buffer. * ctx: Translation context. * alias: Alias to store. * host_src: Host register containing data to store. 
*/ static inline void append_store_alias( CodeBuffer *code, const HostX86Context *ctx, const RTLAlias *alias, X86Register host_src) { if (alias->base) { /* We store V2_FLOAT32 as a full XMM register on the stack, but we * need to store only the 8 bytes with actual data if writing to * bound storage. */ const RTLDataType store_type = ((alias->type == RTLTYPE_V2_FLOAT32) ? RTLTYPE_FLOAT64 : alias->type); append_store(code, store_type, host_src, ctx->regs[alias->base].host_reg, -1, alias->offset); } else { append_store(code, alias->type, host_src, X86_SP, -1, alias->offset); } } /*-----------------------------------------------------------------------*/ /** * append_move_or_load: If the given source register has been spilled, * load it into the given destination register from its spill location; * otherwise, move it from its current register if it is not already in * the destination register. * * [Parameters] * code: Output code buffer. * ctx: Translation context. * unit: RTLUnit being translated. * insn_index: Index of current instruction in ctx->unit->insns[]. * host_dest: Destination host register. * src: Source RTL register. */ static inline void append_move_or_load( CodeBuffer *code, const HostX86Context *ctx, const RTLUnit *unit, int insn_index, int host_dest, int src) { if (is_spilled(ctx, insn_index, src)) { append_load(code, unit->regs[src].type, host_dest, X86_SP, -1, ctx->regs[src].spill_offset); } else if (ctx->regs[src].host_reg != host_dest) { append_move(code, unit->regs[src].type, host_dest, ctx->regs[src].host_reg); } } /*-----------------------------------------------------------------------*/ /** * append_move_or_load_gpr: If the given source integer register has been * spilled, load it into the given destination register from its spill * location; otherwise, move it from its current register if it is not * already in the destination register. * * Specialization of append_move_or_load() for GPRs. 
As a special case
 * (used by translate_call()), if the register is not live at all, it is
 * assumed to be a constant which can be loaded directly into the target
 * register.
 */
static inline void append_move_or_load_gpr(
    CodeBuffer *code, const HostX86Context *ctx, const RTLUnit *unit,
    int insn_index, int host_dest, int src)
{
    const RTLRegister * const src_reg = &unit->regs[src];

    /* A register which is not live must be an integer constant; load its
     * value as an immediate. */
    if (!src_reg->live) {
        ASSERT(unit->regs[src].source == RTLREG_CONSTANT);
        ASSERT(rtl_register_is_int(&unit->regs[src]));
        append_load_imm_gpr(code, host_dest, src_reg->value.i64);
        return;
    }

    const HostX86RegInfo * const src_info = &ctx->regs[src];

    if (is_spilled(ctx, insn_index, src)) {
        append_load_gpr(code, src_reg->type, host_dest, X86_SP,
                        src_info->spill_offset);
        return;
    }

    if (src_info->host_reg != host_dest) {
        append_move_gpr(code, src_reg->type, host_dest, src_info->host_reg);
    }
}

/*-----------------------------------------------------------------------*/

/**
 * append_compare: Append an appropriate comparison instruction for the
 * given parameters. src2==0 implies a register-immediate compare.
 *
 * If the operation is a floating-point compare, it must be a comparison
 * that only requires one test (GT or GE).
 *
 * [Parameters]
 *     ctx: Translation context.
 *     insn_index: Index of current instruction in ctx->unit->insns[].
 *     code: Output code buffer.
 *     src1: RTL register containing first comparand.
 *     src2: RTL register containing second comparand, or 0 for a
 *         register-immediate compare.
 *     src_imm: Immediate comparand if src2 == 0.
 *     src1_temp: Temporary host register for reloading src1. Ignored if
 *         src1 does not need to be reloaded.
 *     icmp_eq: True if an integer comparison is an equality comparison.
 *         Used to eliminate comparisons against immediate 0 after an
 *         instruction which does not set the full set of flags.
 *     fcmp_ordered: True if a floating-point comparison should use the
 *         ordered comparison instruction (COMIS[SD]), false otherwise.
 *         Ignored for integer compares.
*     clear_reg: X86Register to clear before performing the comparison,
 *         or -1 to not clear any register.
 * [Return value]
 *     True if a comparison instruction was added; false if the comparison
 *     was optimized out.
 */
static bool append_compare(
    HostX86Context *ctx, int insn_index, CodeBuffer *code, int src1, int src2,
    int32_t src_imm, X86Register src1_temp, bool icmp_eq, bool fcmp_ordered,
    int clear_reg)
{
    const RTLUnit * const unit = ctx->unit;
    const RTLRegister * const src1_reg = &unit->regs[src1];
    const HostX86RegInfo * const src1_info = &ctx->regs[src1];
    /* Whether to remember this comparison so that a later identical one
     * can be skipped. */
    const bool track_flags =
        (ctx->handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) != 0;

    /*
     * Case 1: register-register compare.
     */
    if (src2) {
        if (ctx->last_cmp_reg == src1 && ctx->last_cmp_target == src2) {
            return false;  /* Flags from the same compare are still set. */
        }

        X86Register host_src1 = src1_info->host_reg;
        if (is_spilled(ctx, insn_index, src1)) {
            host_src1 = src1_temp;
            append_load(code, src1_reg->type, host_src1, X86_SP, -1,
                        src1_info->spill_offset);
        }

        /* The XOR must precede the compare since it modifies flags. */
        if (clear_reg >= 0) {
            append_insn_ModRM_reg(code, false, X86OP_XOR_Gv_Ev,
                                  clear_reg, clear_reg);
        }

        if (rtl_register_is_int(src1_reg)) {
            append_insn_ModRM_ctx(code, int_type_is_64(src1_reg->type),
                                  X86OP_CMP_Gv_Ev, host_src1,
                                  ctx, insn_index, src2);
        } else {
            X86Opcode opcode;
            if (src1_reg->type == RTLTYPE_FLOAT64) {
                opcode = fcmp_ordered ? X86OP_COMISD : X86OP_UCOMISD;
            } else {
                opcode = fcmp_ordered ? X86OP_COMISS : X86OP_UCOMISS;
            }
            append_insn_ModRM_ctx(code, false, opcode, host_src1,
                                  ctx, insn_index, src2);
        }

        if (track_flags) {
            ctx->last_test_reg = 0;
            ctx->last_cmp_reg = src1;
            ctx->last_cmp_target = src2;
        }
        return true;
    }

    /*
     * Case 2: compare against a nonzero immediate.
     */
    if (src_imm) {
        if (ctx->last_cmp_reg == src1
         && ctx->last_cmp_target == 0
         && ctx->last_cmp_imm == src_imm) {
            return false;
        }

        if (clear_reg >= 0) {
            append_insn_ModRM_reg(code, false, X86OP_XOR_Gv_Ev,
                                  clear_reg, clear_reg);
        }

        ASSERT(rtl_register_is_int(src1_reg));
        const bool is64 = int_type_is_64(src1_reg->type);
        /* Use the short form when the immediate fits in a signed byte. */
        const bool fits_imm8 = ((uint32_t)src_imm + 128 < 256);
        append_insn_ModRM_ctx(
            code, is64, fits_imm8 ? X86OP_IMM_Ev_Ib : X86OP_IMM_Ev_Iz,
            X86OP_IMM_CMP, ctx, insn_index, src1);
        if (fits_imm8) {
            append_imm8(code, (uint8_t)src_imm);
        } else {
            append_imm32(code, src_imm);
        }

        if (track_flags) {
            ctx->last_test_reg = 0;
            ctx->last_cmp_reg = src1;
            ctx->last_cmp_target = 0;
            ctx->last_cmp_imm = src_imm;
        }
        return true;
    }

    /*
     * Case 3: compare against zero.
     */
    if (icmp_eq && ctx->last_test_reg == src1) {
        return false;
    }
    if (ctx->last_cmp_reg == src1
     && ctx->last_cmp_target == 0
     && ctx->last_cmp_imm == 0) {
        return false;
    }

    if (clear_reg >= 0) {
        append_insn_ModRM_reg(code, false, X86OP_XOR_Gv_Ev,
                              clear_reg, clear_reg);
    }

    ASSERT(rtl_register_is_int(src1_reg));
    const bool is64 = int_type_is_64(src1_reg->type);
    if (is_spilled(ctx, insn_index, src1)) {
        /* Compare the spill slot directly rather than reloading. */
        append_insn_ModRM_mem(code, is64, X86OP_IMM_Ev_Ib, X86OP_IMM_CMP,
                              X86_SP, -1, src1_info->spill_offset);
        append_imm8(code, 0);
    } else {
        const X86Register host_src1 = src1_info->host_reg;
        append_insn_ModRM_reg(code, is64, X86OP_TEST_Ev_Gv,
                              host_src1, host_src1);
    }

    if (track_flags) {
        ctx->last_test_reg = src1;
        ctx->last_cmp_reg = src1;
        ctx->last_cmp_target = 0;
        ctx->last_cmp_imm = 0;
    }
    return true;
}

/*-----------------------------------------------------------------------*/

/**
 * append_jump_raw: Append a JMP or Jcc instruction with the given
displacement. The displacement is encoded in 8 bits if in the range
 * [-128,+127] and in 32 bits otherwise; the opcode is assumed to be
 * correct for the displacement size.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     opcode: Jump opcode.
 *     disp: Displacement to encode.
 */
static inline void append_jump_raw(CodeBuffer *code, X86Opcode opcode,
                                   int32_t disp)
{
    append_opcode(code, opcode);
    if (((uint32_t)disp + 128) < 256) {  // i.e., disp is in [-128,+127]
        append_imm8(code, (uint8_t)disp);
    } else {
        append_imm32(code, disp);
    }
}

/*-----------------------------------------------------------------------*/

/**
 * append_jump: Append a JMP or Jcc instruction targeting the given code
 * location.
 *
 * If target is negative, a long jump with displacement 0 is appended and
 * the buffer offset of the displacement, along with the target label, is
 * saved in block_info as the current block's unresolved branch.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     block_info: HostX86BlockInfo for the current basic block.
 *     short_opcode: Opcode for a short (8-bit) displacement.
 *     long_opcode: Opcode for a long (32-bit) displacement.
 *     label: Target label.
 *     target: Byte position of the target instruction, or -1 if the target
 *         is unknown.
 */
static void append_jump(
    CodeBuffer *code, HostX86BlockInfo *block_info, X86Opcode short_opcode,
    X86Opcode long_opcode, int label, long target)
{
    if (target >= 0) {
        /* Distance from the start of the jump instruction to the target. */
        const int64_t offset = target - code->len;
        ASSERT(offset >= INT64_C(-0x80000000) && offset <= INT64_C(0x7FFFFFFF));
        /* Jump displacements count from the end of the instruction, so we
         * have to take that into account here -- a 1-byte displacement
         * will be a 2-byte instruction, for example. */
        ASSERT((uint32_t)short_opcode <= 0xFF);
        if (((unsigned long)(offset - 2) + 128) < 256) {
            append_opcode(code, short_opcode);
            append_imm8(code, (uint8_t)(offset - 2));
        } else {
            append_opcode(code, long_opcode);
            /* The long form is the opcode (1 or 2 bytes) followed by a
             * 4-byte displacement, hence 5 or 6 bytes in total. */
            if ((uint32_t)long_opcode <= 0xFF) {
                append_imm32(code, (uint32_t)(offset - 5));
            } else {
                ASSERT((uint32_t)long_opcode <= 0xFFFF);
                append_imm32(code, (uint32_t)(offset - 6));
            }
        }
    } else {
        /* Target not yet known: emit a placeholder displacement and record
         * where it is so it can be filled in later. */
        append_opcode(code, long_opcode);
        block_info->unresolved_branch_offset = code->len;
        append_imm32(code, 0);
        block_info->unresolved_branch_target = label;
    }
}

/*-----------------------------------------------------------------------*/

/**
 * reload_base_and_index: Return the host registers containing the values
 * of the base and (if present) index registers for the given memory access
 * instruction, reloading spilled registers if necessary.
 *
 * If both the base and the index are spilled, their sum is computed into
 * the fallback register, which is returned as the base with no index.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     ctx: Translation context.
 *     insn_index: Index of current instruction in ctx->unit->insns[].
 *     fallback: Register to use as a fallback for reloads.
 *     base_ret: Pointer to variable to receive the base register
 *         (X86Register).
 *     index_ret: Pointer to variable to receive the index register
 *         (X86Register), or -1 if there is no index register for the access.
 */
static void reload_base_and_index(
    CodeBuffer *code, const HostX86Context *ctx, int insn_index,
    X86Register fallback, X86Register *base_ret, int *index_ret)
{
    const RTLUnit * const unit = ctx->unit;
    const RTLInsn * const insn = &unit->insns[insn_index];

    int base = insn->src1;
    int index = insn->host_data_16;  // RTL index register, or 0 if none.

    /* For indexed accesses, if base is spilled but index is not, swap the
     * two registers. This ensures that if the fallback register overlaps
     * the index register (such as in a load operation when the destination
     * is the same as the index register), we don't clobber the index when
     * we reload the base. */
    if (index && is_spilled(ctx, insn_index, base)
              && !is_spilled(ctx, insn_index, index)) {
        const int temp = base;
        base = index;
        index = temp;
    }

    /* Save the current output position in case we end up needing to load
     * the index first (for the 64+32 add case below). */
    const long base_reload_pos = code->len;

    if (!is_spilled(ctx, insn_index, base)) {
        *base_ret = ctx->regs[base].host_reg;
    } else {
        /* This could be INT32/INT64 if address operand optimization
         * eliminated a ZCAST. */
        ASSERT(rtl_register_is_int(&unit->regs[base]));
        append_load(code, unit->regs[base].type, fallback, X86_SP, -1,
                    ctx->regs[base].spill_offset);
        *base_ret = fallback;
    }

    if (index) {
        if (!is_spilled(ctx, insn_index, index)) {
            *index_ret = ctx->regs[index].host_reg;
        } else {
            ASSERT(rtl_register_is_int(&unit->regs[index]));
            if (*base_ret == fallback) {
                /* We should always have a separate temporary if we have to
                 * reload a spilled index. */
                ASSERT(is_spilled(ctx, insn_index, base));
                if (int_type_is_64(unit->regs[index].type)) {
                    /* 64-bit index: add it to the reloaded base directly
                     * from its spill slot. */
                    append_insn_ModRM_mem(
                        code, true, X86OP_ADD_Gv_Ev, fallback,
                        X86_SP, -1, ctx->regs[index].spill_offset);
                } else if (int_type_is_64(unit->regs[base].type)) {
                    /* The base register is 64 bits, so we can load the
                     * index first and add the base from its spill slot.
                     * Rewinding code->len discards the base reload that
                     * was emitted above. */
                    code->len = base_reload_pos;
                    append_load(code, RTLTYPE_INT32, fallback, X86_SP, -1,
                                ctx->regs[index].spill_offset);
                    append_insn_ModRM_mem(
                        code, true, X86OP_ADD_Gv_Ev, fallback,
                        X86_SP, -1, ctx->regs[base].spill_offset);
                } else {
                    /* This is tricky: we have to add two 32-bit values
                     * from memory and get a 64-bit sum without using any
                     * other registers or memory. Since both values are
                     * 32 bits wide, there'll be at most a carry of 1 into
                     * the high word, so we do the addition in 32 bits and
                     * handle the carry manually. Fortunately, this case
                     * should be extremely rare in practice. */
                    log_warning(ctx->handle, "Slow add of spilled 32-bit"
                                " base and index at %d", insn_index);
                    append_insn_ModRM_mem(
                        code, false, X86OP_ADD_Gv_Ev, fallback,  // 32-bit add!
                        X86_SP, -1, ctx->regs[index].spill_offset);
                    /* Emit JNC with a placeholder displacement; it is
                     * patched below to skip the carry fixup. */
                    const long jump_pos = code->len;
                    append_jump_raw(code, X86OP_JNC_Jb, 0);
                    const long jump_end = code->len;
                    ASSERT(jump_end == jump_pos + 2);
                    /* Carry fixup: rotate the halves of the register, add
                     * 1 to what was the high word, and rotate back. */
                    append_insn_ModRM_reg(code, true, X86OP_SHIFT_Ev_Ib,
                                          X86OP_SHIFT_ROR, fallback);
                    append_imm8(code, 32);
                    append_insn_ModRM_reg(code, true, X86OP_IMM_Ev_Ib,
                                          X86OP_IMM_ADD, fallback);
                    append_imm8(code, 1);
                    append_insn_ModRM_reg(code, true, X86OP_SHIFT_Ev_Ib,
                                          X86OP_SHIFT_ROR, fallback);
                    append_imm8(code, 32);
                    ASSERT(code->len - jump_end <= 127);
                    /* Patch the JNC displacement byte with the length of
                     * the fixup sequence. */
                    code->buffer[jump_end-1] = (uint8_t)(code->len - jump_end);
                }
                /* The fallback register now holds base+index. */
                *index_ret = -1;
            } else {
                append_load(code, unit->regs[index].type, fallback, X86_SP, -1,
                            ctx->regs[index].spill_offset);
                *index_ret = fallback;
            }
        }
    } else {
        *index_ret = -1;
    }
}

/*-----------------------------------------------------------------------*/

/**
 * reload_store_source_gpr: Return the GPR containing the data value for
 * an integer or byte-reversed float store instruction. If necessary,
 * save RAX in XMM15; the caller needs to restore it after the store in
 * this case.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     ctx: Translation context.
 *     insn_index: Index of current instruction in ctx->unit->insns[].
 *     host_base_ptr: Pointer to base register for address (X86Register).
 *         May be modified on return.
 *     host_index_ptr: Pointer to index register for address (X86Register,
 *         -1 if none). May be modified on return.
 *     host_value_ret: Pointer to variable to receive the value register
 *         (X86Register).
 * [Return value]
 *     True if RAX was saved to XMM15, false otherwise.
*/
static bool reload_store_source_gpr(
    CodeBuffer *code, const HostX86Context *ctx, int insn_index,
    X86Register *host_base_ptr, int *host_index_ptr,
    X86Register *host_value_ret)
{
    const RTLUnit * const unit = ctx->unit;
    const RTLInsn * const insn = &unit->insns[insn_index];
    const int src2 = insn->src2;
    const HostX86RegInfo * const src2_info = &ctx->regs[src2];

    /* Floating-point sources are treated as integers of the same width;
     * they are copied to a GPR with MOVD below. */
    RTLDataType type = unit->regs[src2].type;
    bool is_float = true;
    switch (type) {
      case RTLTYPE_FLOAT32:
        type = RTLTYPE_INT32;
        break;
      case RTLTYPE_FLOAT64:
        type = RTLTYPE_INT64;
        break;
      default:
        is_float = false;
        break;
    }
    const bool is64 = int_type_is_64(type);
    const bool spilled = is_spilled(ctx, insn_index, src2);

    if (!is_float && !spilled) {
        /* If the value to be stored is the same as the base or index
         * register and MOVBE is not in use, we have to use a temporary for
         * the byte-swapped value (see notes in allocate_regs_for_insn()). */
        const bool have_movbe =
            (ctx->handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) != 0;
        const bool is_bswap_store = (insn->opcode == RTLOP_STORE_BR
                                  || insn->opcode == RTLOP_STORE_I16_BR);
        const bool bswap_src2_collision =
            !have_movbe && is_bswap_store
            && (src2_info->host_reg == *host_base_ptr
             || src2_info->host_reg == *host_index_ptr);
        if (!bswap_src2_collision) {
            /* The value can be stored straight from its own register. */
            *host_value_ret = src2_info->host_reg;
            return false;
        }
    }

    /* insn->src3 is not an RTL register here! Instead it holds a
     * temporary X86Register for reloading a spilled store source value.
     * See the allocation logic at the top of allocate_regs_for_insn()
     * for details. */
    X86Register host_value = (X86Register)insn->src3;
    bool saved_rax = false;

    if (host_value == *host_base_ptr || (int)host_value == *host_index_ptr) {
        /* The temporary is already occupied by an address register, so
         * fall back to RAX, preserving its current value in XMM15. */
        ASSERT(host_value == X86_R15);
        /* If we get here, base or index is in R15 due to a spill reload.
         * For an indexed access, if one of the two registers is unspilled,
         * that register is always used as the base (see
         * reload_base_and_index()), so host_index will be R15. If both
         * registers are spilled or it's not an indexed access, host_index
         * will be -1. So only host_base needs to be checked for a
         * collision with RAX. */
        ASSERT(*host_index_ptr != X86_AX);
        if (*host_base_ptr == X86_AX) {
            /* Fold the base into R15 so that RAX is free for the value. */
            ASSERT(*host_index_ptr == X86_R15);
            append_insn_ModRM_reg(code, true, X86OP_ADD_Gv_Ev,
                                  X86_R15, X86_AX);
            *host_base_ptr = X86_R15;
            *host_index_ptr = -1;
        }
        append_insn_ModRM_reg(code, true, X86OP_MOVD_V_E, X86_XMM15, X86_AX);
        host_value = X86_AX;
        saved_rax = true;
    }

    /* Bring the value into the chosen GPR. */
    if (spilled) {
        append_load_gpr(code, type, host_value, X86_SP,
                        src2_info->spill_offset);
    } else if (is_float) {
        append_insn_ModRM_reg(code, is64, X86OP_MOVD_E_V,
                              src2_info->host_reg, host_value);
    } else {
        append_move_gpr(code, type, host_value, src2_info->host_reg);
    }

    *host_value_ret = host_value;
    return saved_rax;
}

/*************************************************************************/
/************************* Alias/spill handling **************************/
/*************************************************************************/

/* Maximum length of alias setup code generated by reload_regs_for_block().
 * If the input code buffer has at least this much space available, it will
 * not be expanded.
The worst case is:
 *    - 14 GPR spills with REX prefixes = 112 bytes
 *    - 14 GPR loads with REX prefixes = 112 bytes
 *    - 14 XMM exchanges with REX prefixes = 168 bytes
 *    - 1 XMM load with a REX prefix = 9 bytes
 * for a total of 401 bytes. We round up to 416 to potentially help the
 * compiler optimize when using a temporary buffer. */
#define RELOAD_REGS_SIZE 416

/**
 * reload_regs_for_block: Reload host registers with the values expected
 * on entry to the given block. Merged alias values are moved or reloaded
 * to their designated merge registers. If the control flow edge being
 * traversed is a forward branch, any registers whose spills are crossed
 * by the branch are first stored to their spill slots; if it is a backward
 * branch, any registers whose spills are crossed by the branch are
 * reloaded.
 *
 * The code buffer passed to this function may be a temporary buffer of
 * size RELOAD_REGS_SIZE or greater; in that case, the function will
 * always succeed.
 *
 * [Parameters]
 *     code: Output code buffer (may be a temporary buffer).
 *     ctx: Translation context.
 *     block_index: Index of current basic block in ctx->unit->blocks[].
 *     target_block: Index of target basic block in ctx->unit->blocks[].
 * [Return value]
 *     True on success, false if out of memory.
*/ static bool reload_regs_for_block( CodeBuffer *code, HostX86Context *ctx, int block_index, int target_block) { RTLUnit * const unit = ctx->unit; const int num_aliases = unit->next_alias; const int last_insn = unit->blocks[block_index].last_insn; const int target_insn = unit->blocks[target_block].first_insn; const uint16_t *current_store = ctx->blocks[block_index].alias_store; const uint16_t *next_load = ctx->blocks[target_block].alias_load; if (UNLIKELY(code->buffer_size - code->len < RELOAD_REGS_SIZE)) { ASSERT(code->buffer == ctx->handle->code_buffer); ctx->handle->code_len = code->len; if (UNLIKELY(!binrec_ensure_code_space(ctx->handle, RELOAD_REGS_SIZE))) { log_error(ctx->handle, "No memory for register reload for block" " %d->%d", block_index, target_block); return false; } code->buffer = ctx->handle->code_buffer; code->buffer_size = ctx->handle->code_buffer_size; } /* If this is a forward branch, first spill any registers whose spills * are crossed by the branch. */ if (unit->blocks[block_index].next_block >= 0 && target_block > unit->blocks[block_index].next_block) { for (int i = 0; i < 32; i++) { /* We don't need to call current_reg() here since we check * reg->death separately below. */ const int reg_index = ctx->reg_map[i]; if (reg_index) { const RTLRegister *reg = &unit->regs[reg_index]; if (reg->death >= target_insn && is_spilled(ctx, target_insn, reg_index)) { /* The register can't be spilled if it's in the live * map (since it would have been overwritten by the * register that spilled it). */ ASSERT(!is_spilled(ctx, last_insn, reg_index)); const HostX86RegInfo *reg_info = &ctx->regs[reg_index]; append_store(code, reg->type, reg_info->host_reg, X86_SP, -1, reg_info->spill_offset); } } } } /* Construct a map of which registers get moved where and which need * to be reloaded from spill slots or loaded from alias storage. 
We * need a separate step for this in case the target of one move would * overwrite the source of another, in which case we have to use a swap * instead. This can get particularly tricky if several registers are * shifted in a loop (see tests/host-x86/general/alias-merge-swap-cycle-* * for some examples). */ uint32_t move_targets = 0; // Bit set = register is a move target uint32_t reload_targets = 0; // Bit set = register is a reload target uint32_t load_targets = 0; // Bit set = register is a load target uint8_t move_map[32]; // X86Register source for each move target uint8_t src_map[32]; // Map from original to current (swapped) registers // (i.e., "where is this register's value now?") uint8_t value_map[32]; // Map from original reg values to current locations // (i.e., "what does this register now hold?") uint8_t src_count[32]; // # of times each host register is used as a source // (indexed by original host register) uint8_t dest_type[32]; // RTLDataType of each alias, indexed by move target uint16_t reload_map[32]; // RTL register to load into each reload target uint16_t load_map[32]; // RTL alias to load into each load target memset(src_count, 0, sizeof(src_count)); for (int i = 1; i < num_aliases; i++) { const int merge_reg = next_load[i]; if (merge_reg && ctx->regs[merge_reg].merge_alias) { const X86Register host_dest = ctx->regs[merge_reg].host_merge; dest_type[host_dest] = unit->regs[merge_reg].type; value_map[host_dest] = host_dest; const int store_reg = current_store[i]; if (store_reg) { if (is_spilled(ctx, last_insn, store_reg)) { reload_targets |= 1 << host_dest; reload_map[host_dest] = (uint16_t)store_reg; } else { const X86Register host_src = ctx->regs[store_reg].host_reg; move_targets |= 1 << host_dest; move_map[host_dest] = host_src; src_map[host_src] = host_src; value_map[host_src] = host_src; src_count[host_src]++; } } else { load_targets |= 1 << host_dest; load_map[host_dest] = (uint16_t)i; } } } /* If this is a backward branch, also include 
reloads of registers * which are spilled now but were not spilled at the branch target. */ if (unit->blocks[block_index].next_block < 0 || target_block < unit->blocks[block_index].next_block) { const HostX86BlockInfo *target_info = &ctx->blocks[target_block]; for (int i = 0; i < 32; i++) { const int reg_index = target_info->initial_reg_map[i]; if (reg_index) { /* If the register is live on entry to the target block, * it can't have been chosen as a merge target, since merge * targets are always GET_ALIAS outputs (which by SSA are * not live before the GET_ALIAS instruction). */ ASSERT(!(move_targets & (1 << i))); ASSERT(!(reload_targets & (1 << i))); ASSERT(!(load_targets & (1 << i))); const RTLRegister *reg = &unit->regs[reg_index]; /* The register's live range should have been extended to * the last backward branch that targets a block where * it's live. */ ASSERT(reg->death >= last_insn); if (is_spilled(ctx, last_insn, reg_index)) { reload_targets |= 1 << i; reload_map[i] = reg_index; } } } } /* Add any registers which are live and unspilled both here and at the * beginning of the target block as moves to themselves, to ensure that * their values don't get lost during register shuffling. */ for (int i = 0; i < 32; i++) { const int reg = ctx->reg_map[i]; const int later_insn = max(last_insn, target_insn); if (reg && (target_insn > last_insn || unit->regs[reg].birth < target_insn) && unit->regs[reg].death >= later_insn && !is_spilled(ctx, later_insn, reg)) { ASSERT(!(move_targets & (1 << i))); move_targets |= 1 << i; move_map[i] = i; src_map[i] = i; value_map[i] = i; src_count[i]++; } } /* Now actually perform the moves, swapping registers as needed. In * order to avoid clobbering values in certain move patterns (see * tests/host-x86/general/alias-merge-source-live-no-swap.c for an * example), we initially only move registers whose targets are not * themselves used as sources. 
If no moves can be done during a pass * but there are still registers left to be moved, then the remaining * moves must form one or more cycles, so we swap one pair to try and * break a cycle, then start a new pass. */ while (move_targets) { uint32_t move_targets_pass = move_targets; bool resolved_any = false; while (move_targets_pass) { const X86Register host_dest = ctz32(move_targets_pass); move_targets_pass ^= 1 << host_dest; const X86Register move_src = move_map[host_dest]; const X86Register host_src = src_map[move_src]; if (host_src != host_dest) { if (src_count[value_map[host_dest]] > 0) { /* The value in the register we're about to write is * still needed, so skip it for now. */ continue; } append_move(code, dest_type[host_dest], host_dest, host_src); } move_targets ^= 1 << host_dest; // Register was resolved. resolved_any = true; src_count[move_src]--; } if (move_targets && !resolved_any) { /* There's a cycle in the move graph, so pick the first target * remaining and swap it with its source value. */ const X86Register host_dest = ctz32(move_targets); move_targets ^= 1 << host_dest; const X86Register move_src = move_map[host_dest]; const X86Register host_src = src_map[move_src]; ASSERT(host_src != host_dest); // Or it would already be resolved. /* Swap the registers, then update maps so we know where the * values have gone. There's no equivalent of the XCHG * instruction for XMM registers, so use the XOR method * (a^=b, b^=a, a^=b) in that case. */ if (host_dest >= X86_XMM0) { ASSERT(host_src >= X86_XMM0); append_insn_ModRM_reg(code, false, X86OP_XORPS, host_dest, host_src); append_insn_ModRM_reg(code, false, X86OP_XORPS, host_src, host_dest); append_insn_ModRM_reg(code, false, X86OP_XORPS, host_dest, host_src); } else { ASSERT(host_src < X86_XMM0); /* Slight laziness here: always exchange 64 bits even if * the values in both registers are only 32 bits wide. 
*/ append_insn_ModRM_reg(code, true, X86OP_XCHG_Ev_Gv, host_src, host_dest); } value_map[host_src] = value_map[host_dest]; src_map[value_map[host_dest]] = host_src; value_map[host_dest] = move_src; src_map[move_src] = host_dest; } } /* Finally, load values from storage which were spilled or not live. */ while (reload_targets) { const X86Register host_dest = ctz32(reload_targets); reload_targets ^= 1 << host_dest; const int src = reload_map[host_dest]; append_load(code, unit->regs[src].type, host_dest, X86_SP, -1, ctx->regs[src].spill_offset); } while (load_targets) { const X86Register host_dest = ctz32(load_targets); load_targets ^= 1 << host_dest; append_load_alias(code, ctx, &unit->aliases[load_map[host_dest]], host_dest); } return true; } /*-----------------------------------------------------------------------*/ /** * check_reload_conflicts: Return whether any alias or spill reloads * required for the branch at branch_insn collide with any live registers. * * [Parameters] * ctx: Translation context. * block_index: Index of current basic block in ctx->unit->blocks[]. * branch_insn: Index of branch instruction in ctx->unit->insns[]. * [Return value] * True if any spill reloads would collide with a live register, * false otherwise. */ static bool check_reload_conflicts(const HostX86Context *ctx, int block_index, int branch_insn) { ASSERT(branch_insn == ctx->unit->blocks[block_index].last_insn); RTLUnit * const unit = ctx->unit; const int target_label = unit->insns[branch_insn].label; const int target_block = unit->label_blockmap[target_label]; const int target_insn = unit->blocks[target_block].first_insn; /* Check for alias reload conflicts. 
A conflict occurs when the * destination of a register move or reload is either: * - live past the branch (including registers which are dead at * the branch but alias-merged into the next block) -- this set * of registers is recorded in the block's end_live field; or * - dead at the branch, but alias-merged into the target block -- * this covers cases where the reload would generate an XCHG * instruction (or an equivalent operation for XMM registers), * leaving the registers swapped if the branch was not taken. * Make sure to cover both cases here. */ const int num_aliases = unit->next_alias; const uint16_t *current_store = ctx->blocks[block_index].alias_store; const uint16_t *next_load = ctx->blocks[target_block].alias_load; uint32_t conflict_regs = ctx->blocks[block_index].end_live; for (int i = 1; i < num_aliases; i++) { const int merge_reg = next_load[i]; if (merge_reg && ctx->regs[merge_reg].merge_alias) { const int merge_src = current_store[i]; if (!is_spilled(ctx, branch_insn, merge_src)) { conflict_regs |= 1 << ctx->regs[merge_src].host_reg; } } } for (int i = 1; i < num_aliases; i++) { const int merge_reg = next_load[i]; if (merge_reg && ctx->regs[merge_reg].merge_alias) { const int merge_src = current_store[i]; const X86Register host_src = ctx->regs[merge_src].host_reg; const X86Register host_dest = ctx->regs[merge_reg].host_merge; const bool move_required = (!merge_src || is_spilled(ctx, branch_insn, merge_src) || host_src != host_dest); if (move_required && (conflict_regs & (1 << host_dest))) { return true; } } } /* If this is a backward branch, also check for spill reload conflicts. 
*/ if (target_insn < branch_insn) { const uint16_t *current_map = ctx->reg_map; const uint16_t *next_map = ctx->blocks[target_block].initial_reg_map; uint32_t live = ctx->blocks[block_index].end_live; while (live) { const int host_reg = ctz32(live); live ^= 1 << host_reg; if (next_map[host_reg] && current_map[host_reg] != next_map[host_reg]) { return true; } } } return false; } /*************************************************************************/ /*************************** Translation core ****************************/ /*************************************************************************/ /** * append_prologue: Append the function prologue to the output code buffer. * * [Parameters] * ctx: Translation context. * [Return value] * True on success, false if out of memory. */ static bool append_prologue(HostX86Context *ctx) { ASSERT(ctx); ASSERT(ctx->handle); binrec_t * const handle = ctx->handle; const uint32_t regs_to_save = ctx->regs_touched & ctx->callee_saved_regs; const bool is_windows_seh = (handle->setup.host == BINREC_ARCH_X86_64_WINDOWS_SEH); /* Figure out how much stack space we use in total. */ int total_stack_use; const int push_size = 8 * popcnt32(regs_to_save & 0xFFFF); total_stack_use = push_size; /* If we have any XMM registers to save, we have to align the stack at * this point so the saves and loads are properly aligned. This implies * that we also need to align the frame size here, since the final stack * pointer must remain 16-byte aligned. */ const uint32_t xmm_to_save = regs_to_save >> 16; if (xmm_to_save) { /* The stack pointer after pushes is either 0 or 8 bytes past a * multiple of 16. To align it, we subtract 8 if the number of * pushes is even. (That's not a typo -- the stack pointer comes * in unaligned due to the return address pushed by the CALL * instruction that jumped here.) 
*/ if (push_size % 16 == 0) { total_stack_use += 8; } total_stack_use += 16 * popcnt32(xmm_to_save); ctx->frame_size = align_up(ctx->frame_size, 16); } total_stack_use += ctx->frame_size + ctx->frame_callee_reserve; /* Final stack pointer alignment: the total stack usage should be a * multiple of 16 plus 8, again because of the return address. */ if (total_stack_use % 16 != 8) { total_stack_use += 16 - ((total_stack_use + 8) & 15); } /* Calculate the amount of stack space to reserve, excluding GPR pushes. */ const int stack_alloc = total_stack_use - push_size; ctx->stack_alloc = stack_alloc; if (is_windows_seh) { /* Create unwind data for the function, because Microsoft likes * finding ways to make everybody's lives harder... */ enum { UWOP_PUSH_NONVOL = 0, UWOP_ALLOC_LARGE = 1, UWOP_ALLOC_SMALL = 2, UWOP_SAVE_XMM128 = 8, }; /* We'll have at most: * 1 * 8 GPRs * 2 * stack adjustment * 2 * 10 XMM registers * for a total of 30 16-bit data words. */ uint8_t unwind_info[4 + 2*30]; int prologue_pos = 0; unwind_info[0] = 1; // version:3, flags:5 unwind_info[1] = 0; // Size of prologue (will be filled in later) unwind_info[2] = 0; // Code count (will be filled in later) unwind_info[3] = 0; // Frame register/offset (not used) /* The unwind information has to be given in reverse order (?!), * so generate from the end of the buffer and move it into place * when we're done. */ int unwind_pos = sizeof(unwind_info); for (int reg = 0; reg < 16; reg++) { if (regs_to_save & (1 << reg)) { unwind_pos -= 2; ASSERT(unwind_pos >= 4); unwind_info[unwind_pos+0] = prologue_pos; unwind_info[unwind_pos+1] = UWOP_PUSH_NONVOL | reg<<4; if (reg < 8) { prologue_pos += 1; // PUSH } else { prologue_pos += 2; // REX PUSH } } } if (stack_alloc > 0) { const int stack_alloc_info = (stack_alloc / 8) - 1; if (stack_alloc > 128) { ASSERT(stack_alloc < 524288); // Should never need this much. 
unwind_pos -= 4; ASSERT(unwind_pos >= 4); unwind_info[unwind_pos+0] = prologue_pos; unwind_info[unwind_pos+1] = UWOP_ALLOC_LARGE; unwind_info[unwind_pos+2] = (uint8_t)(stack_alloc_info >> 0); unwind_info[unwind_pos+3] = (uint8_t)(stack_alloc_info >> 8); prologue_pos += 7; } else { unwind_pos -= 2; ASSERT(unwind_pos >= 4); unwind_info[unwind_pos+0] = prologue_pos; unwind_info[unwind_pos+1] = UWOP_ALLOC_SMALL | stack_alloc_info << 4; if (stack_alloc == 128) { prologue_pos += 7; } else { prologue_pos += 4; } } } int sp_offset = ctx->frame_size; for (int reg = 16; reg < 32; reg++) { if (regs_to_save & (1 << reg)) { unwind_pos -= 4; ASSERT(unwind_pos >= 4); unwind_info[unwind_pos+0] = prologue_pos; unwind_info[unwind_pos+1] = UWOP_SAVE_XMM128 | reg<<4; unwind_info[unwind_pos+2] = (uint8_t)(sp_offset >> 4); unwind_info[unwind_pos+3] = (uint8_t)(sp_offset >> 12); if (reg & 8) { prologue_pos += 5; // REX + MOVAPS + ModR/M + SIB } else { prologue_pos += 4; // MOVAPS + ModR/M + SIB } if (sp_offset >= 128) { prologue_pos += 4; // disp32 } else if (sp_offset > 0) { prologue_pos += 1; // disp8 } sp_offset += 16; } } const int code_size = sizeof(unwind_info) - unwind_pos; const int size = 4 + code_size; ASSERT(size <= (int)sizeof(unwind_info)); memmove(unwind_info + 4, unwind_info + unwind_pos, code_size); unwind_info[1] = prologue_pos; unwind_info[2] = code_size / 2; const int alignment = handle->code_alignment; ASSERT(alignment >= 8); int code_offset = 8 + size; if (code_offset % alignment != 0) { code_offset += alignment - (code_offset % alignment); } if (UNLIKELY(!binrec_ensure_code_space(handle, code_offset))) { log_error(handle, "No memory for Windows SEH data"); return false; } uint8_t *buffer = handle->code_buffer; *ALIGNED_CAST(uint64_t *, buffer) = bswap_le64(code_offset); memcpy(buffer + 8, unwind_info, size); ASSERT(code_offset >= 8+size); memset(buffer + (8+size), 0, code_offset - (8+size)); handle->code_len += code_offset; } /* In the worst case (Windows ABI with 
all registers saved and a frame * size of >=128 bytes), the prologue will require: * 1 * 4 low GPR saves * 2 * 4 high GPR saves * 7 * 1 stack adjustment * 8 * 2 low XMM saves * 9 * 8 high XMM saves * for a total of 107 bytes. */ if (UNLIKELY(!binrec_ensure_code_space(handle, 107))) { log_error(handle, "No memory for unit prologue"); return false; } CodeBuffer code = {.buffer = handle->code_buffer, .buffer_size = handle->code_buffer_size, .len = handle->code_len}; for (int reg = 0; reg < 16; reg++) { if (regs_to_save & (1 << reg)) { append_insn_R(&code, false, X86OP_PUSH_rAX, reg); } } if (stack_alloc >= 128) { append_opcode(&code, X86OP_REX_W); append_opcode(&code, X86OP_IMM_Ev_Iz); append_ModRM(&code, X86MOD_REG, X86OP_IMM_SUB, X86_SP); append_imm32(&code, stack_alloc); } else if (stack_alloc > 0) { append_opcode(&code, X86OP_REX_W); append_opcode(&code, X86OP_IMM_Ev_Ib); append_ModRM(&code, X86MOD_REG, X86OP_IMM_SUB, X86_SP); append_imm8(&code, stack_alloc); } int sp_offset = ctx->frame_size; for (int reg = 16; reg < 32; reg++) { if (regs_to_save & (1 << reg)) { if (reg & 8) { append_rex_opcode(&code, X86OP_REX_R, X86OP_MOVAPS_W_V); } else { append_opcode(&code, X86OP_MOVAPS_W_V); } if (sp_offset >= 128) { append_ModRM_SIB(&code, X86MOD_DISP32, reg & 7, 0, X86SIB_NOINDEX, X86_SP); append_imm32(&code, sp_offset); } else if (sp_offset > 0) { append_ModRM_SIB(&code, X86MOD_DISP8, reg & 7, 0, X86SIB_NOINDEX, X86_SP); append_imm8(&code, sp_offset); } else { append_ModRM_SIB(&code, X86MOD_DISP0, reg & 7, 0, X86SIB_NOINDEX, X86_SP); } sp_offset += 16; } } if (is_windows_seh) { /* Make sure the prologue is the same length we said it would be. */ const int code_offset = (int) *ALIGNED_CAST(uint64_t *, handle->code_buffer); ASSERT(code.len == code_offset + handle->code_buffer[9]); } /* If we have any local constants, insert them here with a jump over * them to the first instruction. 
*/ int num_xmm_constants = 0; for (int i = 0; i < lenof(ctx->const_loc); i++) { if (ctx->const_loc[i]) { num_xmm_constants++; } } if (num_xmm_constants) { const int cst_size = 16 * num_xmm_constants; const int padding = (16 - (code.len + (cst_size>=128 ? 5 : 2))) & 15; const int disp = padding + cst_size; handle->code_len = code.len; if (UNLIKELY(!binrec_ensure_code_space(handle, 5+disp))) { log_error(handle, "No memory for local constants"); return false; } code.buffer = handle->code_buffer; code.buffer_size = handle->code_buffer_size; const X86Opcode jump_opcode = disp>=128 ? X86OP_JMP_Jz : X86OP_JMP_Jb; append_jump_raw(&code, jump_opcode, disp); ASSERT(code.len + padding <= code.buffer_size); memset(&code.buffer[code.len], 0, padding); code.len += padding; ASSERT(code.len % 16 == 0); for (int i = 0; i < lenof(ctx->const_loc); i++) { if (ctx->const_loc[i]) { ctx->const_loc[i] = code.len; ASSERT(code.len + 16 <= code.buffer_size); memcpy(&code.buffer[code.len], &local_constants[i], 16); code.len += 16; } } } handle->code_len = code.len; return true; } /*-----------------------------------------------------------------------*/ /** * append_epilogue: Append the function epilogue to the output code buffer. * * [Parameters] * ctx: Translation context. * append_ret: True to append a RET instruction after the epilogue. * [Return value] * True on success, false if out of memory. */ static bool append_epilogue(HostX86Context *ctx, bool append_ret) { ASSERT(ctx); ASSERT(ctx->handle); binrec_t * const handle = ctx->handle; const uint32_t regs_saved = ctx->regs_touched & ctx->callee_saved_regs; const int stack_alloc = ctx->stack_alloc; ctx->label_offsets[0] = handle->code_len; /* The maximum size of the epilogue is the same as the maximum size of * the prologue, plus 1 for the RET instruction. 
*/ if (UNLIKELY(!binrec_ensure_code_space(handle, 108))) { log_error(handle, "No memory for unit epilogue"); return false; } CodeBuffer code = {.buffer = handle->code_buffer, .buffer_size = handle->code_buffer_size, .len = handle->code_len}; int sp_offset = ctx->frame_size + 16 * popcnt32(regs_saved >> 16); for (int reg = 31; reg >= 16; reg--) { if (regs_saved & (1 << reg)) { sp_offset -= 16; if (reg & 8) { append_rex_opcode(&code, X86OP_REX_R, X86OP_MOVAPS_V_W); } else { append_opcode(&code, X86OP_MOVAPS_V_W); } if (sp_offset >= 128) { append_ModRM_SIB(&code, X86MOD_DISP32, reg & 7, 0, X86SIB_NOINDEX, X86_SP); append_imm32(&code, sp_offset); } else if (sp_offset > 0) { append_ModRM_SIB(&code, X86MOD_DISP8, reg & 7, 0, X86SIB_NOINDEX, X86_SP); append_imm8(&code, sp_offset); } else { append_ModRM_SIB(&code, X86MOD_DISP0, reg & 7, 0, X86SIB_NOINDEX, X86_SP); } } } if (stack_alloc >= 128) { append_opcode(&code, X86OP_REX_W); append_opcode(&code, X86OP_IMM_Ev_Iz); append_ModRM(&code, X86MOD_REG, X86OP_IMM_ADD, X86_SP); append_imm32(&code, stack_alloc); } else if (stack_alloc > 0) { append_opcode(&code, X86OP_REX_W); append_opcode(&code, X86OP_IMM_Ev_Ib); append_ModRM(&code, X86MOD_REG, X86OP_IMM_ADD, X86_SP); append_imm8(&code, stack_alloc); } for (int reg = 15; reg >= 0; reg--) { if (regs_saved & (1 << reg)) { append_insn_R(&code, false, X86OP_POP_rAX, reg); } } if (append_ret) { append_opcode(&code, X86OP_RET); } handle->code_len = code.len; return true; } /*-----------------------------------------------------------------------*/ /** * do_call_setup: Perform setup for a call-like instruction (CALL, * CALL_TRANSPARENT, or CHAIN). * * [Parameters] * ctx: Translation context. * code: Output code buffer. * insn_index: Index of instruction in ctx->unit->insns[]. * is_tail: True to set up for a tail call, false to set up for a * non-tail call. * src1_loc: X86Register holding the target address, or -1 if the * address is not in a host register. 
If not -1, may be modified * on return. * src2, src3: RTL registers holding the function arguments, or 0 if * there is no argument in the corresponding position. */ static void do_call_setup(HostX86Context *ctx, CodeBuffer *code, int insn_index, bool is_tail, int *src1_loc, int src2, int src3) { const RTLUnit * const unit = ctx->unit; int src2_loc = (!src2 || !unit->regs[src2].live || is_spilled(ctx, insn_index, src2) ? -1 : ctx->regs[src2].host_reg); int src3_loc = (!src3 || !unit->regs[src3].live || is_spilled(ctx, insn_index, src3) ? -1 : ctx->regs[src3].host_reg); /* * Put arguments (if any) in the right place. This is a bit ugly * due to all the different ways we might have to swap values around. * We take advantage of the fact that RAX and R11 are both callee-saved * (thus unused at this point) and not argument registers in either ABI * (System V or Windows), and use them as temporaries if needed. * * Several of these cases could be implemented in fewer operations * using swaps, but the XCHG instruction has the same latency as a * 3-move sequence (temp=a, a=b, b=temp) on current-generation CPUs, * so we prefer moves if we can get away with less than three per swap. 
* * Behavior table for 1 argument: * src2 | src1 | Actions * -------+-------+---------------------------------------------------- * arg0 | (any) | Reload src1 to RAX (if spilled; likewise below) * RAX | arg0 | Move arg0 to R11, move RAX to arg0 * (any) | arg0 | Move arg0 to RAX, move/reload src2 to arg0 * (any) | (any) | Move/reload src2 to arg0, reload src1 to RAX * * Behavior table for 2 arguments: * src2 | src3 | src1 | Actions * -------+-------+-------+-------------------------------------------- * arg0 | arg0 | arg1 | Copy arg1 to RAX, copy arg0 to arg1 * arg0 | arg0 | (any) | Copy arg0 to arg1, reload src1 to RAX * arg0 | arg1 | (any) | Reload src1 to RAX * arg0 | RAX | arg1 | Move arg1 to R11, move RAX to arg1 * arg0 | (any) | arg1 | Move arg11 to RAX, move/reload src3 to arg1 * arg0 | (any) | (any) | Move/reload src3 to arg1, reload src1 to RAX * arg1 | arg0 | arg0 | Swap arg0 and arg1 (and call arg1) * arg1 | arg0 | arg1 | Swap arg0 and arg1 (and call arg0) * arg1 | arg0 | (any) | Swap arg0 and arg1, reload src1 to RAX * arg1 | arg1 | arg0 | Move arg0 to RAX, copy arg1 to arg0 * arg1 | arg1 | (any) | Copy arg1 to arg0, reload src1 to RAX * arg1 | RAX | arg0 | Move arg0 to R11, move arg1 to arg0, move * | | | RAX to arg1 * arg1 | (any) | arg0 | Move arg0 to RAX, move arg1 to arg0, * | | | move/reload src3 to arg1 * arg1 | (any) | arg1 | Move arg1 to arg0, move/reload src3 to arg1 * | | | (and call arg0) * arg1 | (any) | (any) | Move arg1 to arg0, move/reload src3 to arg1, * | | | reload src1 to RAX * (any) | arg0 | arg0 | Move arg0 to arg1, move/reload src2 to arg0 * (any) | arg0 | arg1 | Move/reload src2 to R11, move arg1 to RAX, * | | | move arg0 to arg1, move R11 to arg0 * (any) | arg0 | (any) | Move arg0 to arg1, move/reload src2 to arg0, * | | | reload src1 to RAX * RAX | arg1 | arg0 | Move arg0 to R11, move RAX to arg0 * (any) | arg1 | arg0 | Move arg0 to RAX, move/reload src2 to arg0 * (any) | arg1 | (any) | Move/reload src2 to arg0, reload src1 to 
RAX * RAX | (any) | arg0 | Move/reload src3 to arg1, move arg0 to R11, * | | | move RAX to arg0 * (any) | (any) | arg0 | Move/reload src3 to arg1, move arg0 to RAX, * | | | move/reload src2 to arg0 * (any) | RAX | arg1 | Move/reload src2 to arg0, move arg1 to R11, * | | | move RAX to arg1 * (any) | (any) | arg1 | Move/reload src2 to arg0, move arg1 to RAX, * | | | move/reload src3 to arg1 * (any) | (any) | (any) | Move/reload src2 to arg0, move/reload src3 * | | | to arg1, reload src1 to RAX */ if (src2) { const bool src2_is64 = int_type_is_64(unit->regs[src2].type); const bool src3_is64 = // Safe even if src3 == 0. int_type_is_64(unit->regs[src3].type); const int host_arg0 = host_x86_int_arg_register(ctx, 0); ASSERT(host_arg0 >= 0); const int host_arg1 = host_x86_int_arg_register(ctx, 1); ASSERT(host_arg1 >= 0); /* For simplicity, we omit the reload of src1 if that's the last * operation, since we fall through to a reload of src1 anyway. */ if (!src3) { // 1-argument call if (src2_loc != host_arg0) { if (*src1_loc == host_arg0) { const X86Register src1_target = (src2_loc == X86_AX) ? X86_R11 : X86_AX; append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, src1_target, host_arg0); *src1_loc = src1_target; } append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg0, src2); } } else { // 2-argument call if (src2_loc == host_arg0) { if (src3_loc == host_arg0) { if (*src1_loc == host_arg1) { append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, X86_AX, host_arg1); *src1_loc = X86_AX; } append_insn_ModRM_reg(code, src2_is64, X86OP_MOV_Gv_Ev, host_arg1, host_arg0); } else if (src3_loc != host_arg1) { if (*src1_loc == host_arg1) { const X86Register src1_target = (src3_loc == X86_AX) ? 
X86_R11 : X86_AX; append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, src1_target, host_arg1); *src1_loc = src1_target; } append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg1, src3); } } else if (src2_loc == host_arg1) { if (src3_loc == host_arg0) { append_insn_ModRM_reg(code, src2_is64 || src3_is64, X86OP_XCHG_Ev_Gv, host_arg1, host_arg0); if (*src1_loc == host_arg0) { *src1_loc = host_arg1; } else if (*src1_loc == host_arg1) { *src1_loc = host_arg0; } } else if (src3_loc == host_arg1) { if (*src1_loc == host_arg0) { append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, X86_AX, host_arg0); *src1_loc = X86_AX; } append_insn_ModRM_reg(code, src2_is64, X86OP_MOV_Gv_Ev, host_arg0, host_arg1); } else { if (*src1_loc == host_arg0) { const X86Register src1_target = (src3_loc == X86_AX) ? X86_R11 : X86_AX; append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, src1_target, host_arg0); *src1_loc = src1_target; } append_insn_ModRM_reg(code, src2_is64, X86OP_MOV_Gv_Ev, host_arg0, host_arg1); if (*src1_loc == host_arg1) { *src1_loc = host_arg0; } append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg1, src3); } } else if (src3_loc == host_arg0) { if (*src1_loc == host_arg1) { append_move_or_load_gpr(code, ctx, unit, insn_index, X86_R11, src2); append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, X86_AX, host_arg1); *src1_loc = X86_AX; append_insn_ModRM_reg(code, src3_is64, X86OP_MOV_Gv_Ev, host_arg1, host_arg0); append_insn_ModRM_reg(code, src2_is64, X86OP_MOV_Gv_Ev, host_arg0, X86_R11); } else { append_insn_ModRM_reg(code, src3_is64, X86OP_MOV_Gv_Ev, host_arg1, host_arg0); if (*src1_loc == host_arg0) { *src1_loc = host_arg1; } append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg0, src2); } } else if (src3_loc == host_arg1) { if (*src1_loc == host_arg0) { const X86Register src1_target = (src2_loc == X86_AX) ? 
X86_R11 : X86_AX; append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, src1_target, host_arg0); *src1_loc = src1_target; } append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg0, src2); } else if (*src1_loc == host_arg0) { append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg1, src3); const X86Register src1_target = (src2_loc == X86_AX) ? X86_R11 : X86_AX; append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, src1_target, host_arg0); *src1_loc = src1_target; append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg0, src2); } else { append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg0, src2); if (*src1_loc == host_arg1) { const X86Register src1_target = (src3_loc == X86_AX) ? X86_R11 : X86_AX; append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, src1_target, host_arg1); *src1_loc = src1_target; } append_move_or_load_gpr(code, ctx, unit, insn_index, host_arg1, src3); } } // if (!src3) } // if (src2) } /*-----------------------------------------------------------------------*/ /** * translate_call: Translate a CALL or CALL_TRANSPARENT instruction. * * [Parameters] * ctx: Translation context. * block_index: Index of basic block in ctx->unit->blocks[]. * insn_index: Index of instruction in ctx->unit->insns[]. * [Return value] * True on success, false if out of memory. */ static bool translate_call(HostX86Context *ctx, int block_index, int insn_index) { ASSERT(ctx); ASSERT(ctx->handle); ASSERT(ctx->unit); ASSERT(block_index >= 0); ASSERT(block_index < ctx->unit->num_blocks); ASSERT(insn_index >= 0); ASSERT((uint32_t)insn_index < ctx->unit->num_insns); binrec_t * const handle = ctx->handle; const RTLUnit * const unit = ctx->unit; const RTLBlock * const block = &unit->blocks[block_index]; RTLInsn * const insn = &unit->insns[insn_index]; const int src1 = insn->src1; const int src2 = insn->src2; const int src3 = insn->src3; int src1_loc = (!unit->regs[src1].live || is_spilled(ctx, insn_index, src1) ? 
-1 : ctx->regs[src1].host_reg); const bool is_tail = (insn->host_data_16 != 0); /* Call setup will generally require more space than is reserved by * default, so expand the buffer if needed. */ const int MAX_SETUP_LEN = 3*10; // 3x 64-bit immediate (src1/src2/src3) /* Tail calls: worst case epilogue (107 bytes, see append_epilogue()) + * JMP Ev (without REX, since src1 is loaded to RAX) */ const int MAX_TAIL_CALL_LEN = MAX_SETUP_LEN + 107 + 2; /* Nontail calls: CALL Ev (without REX, since spilled or immediate src1 * is always loaded to RAX) + return value copy (with REX) + MXCSR * save/load (16) + worst case save/restore for System V ABI (9x REX * GPR store, 8x non-REX XMM store, 7x REX XMM store, all doubled) */ const int MAX_NONTAIL_CALL_LEN = MAX_SETUP_LEN + 2 + 3 + 16 + (2 * (9*8 + 8*8 + 7*9)); const int max_len = is_tail ? MAX_TAIL_CALL_LEN : MAX_NONTAIL_CALL_LEN; if (UNLIKELY(handle->code_len + max_len > handle->code_buffer_size) && UNLIKELY(!binrec_ensure_code_space(handle, max_len))) { log_error(handle, "No memory for CALL instruction"); return false; } CodeBuffer code = {.buffer = handle->code_buffer, .buffer_size = handle->code_buffer_size, .len = handle->code_len}; const long initial_len = code.len; /* If this is not a tail call, we need to save any values live in * caller-saved registers to the stack (and we need to do this before * potentially clobbering them with function arguments). The register * allocator will let us know which registers are live via the * host_data_32 field in the CALL instruction. */ if (!is_tail) { if (insn->dest) { /* Make sure we don't overwrite the result after the call! * If the result register is in the save set, it means that * register spilled whatever was previously living there, so * remove it from the save set. 
*/ insn->host_data_32 &= ~(1 << ctx->regs[insn->dest].host_reg); } uint32_t save_regs = insn->host_data_32; while (save_regs) { const int reg = ctz32(save_regs); save_regs ^= 1 << reg; ASSERT(ctx->stack_callsave[reg] >= 0); if (reg >= X86_XMM0) { append_insn_ModRM_mem(&code, false, X86OP_MOVAPS_W_V, reg, X86_SP, -1, ctx->stack_callsave[reg]); } else { append_insn_ModRM_mem(&code, true, X86OP_MOV_Ev_Gv, reg, X86_SP, -1, ctx->stack_callsave[reg]); } } /* We also save MXCSR since its value is volatile in all x86 ABIs. */ ASSERT(ctx->stack_mxcsr >= 0); append_insn_ModRM_mem( &code, false, X86OP_MISC_0FAE, X86OP_MISC0FAE_STMXCSR, X86_SP, -1, ctx->stack_mxcsr); } /* Get arguments into the right place. */ do_call_setup(ctx, &code, insn_index, is_tail, &src1_loc, src2, src3); /* Reload the call target (or copy from constant), if necessary. */ if (src1_loc < 0) { append_move_or_load_gpr(&code, ctx, unit, insn_index, X86_AX, src1); src1_loc = X86_AX; } if (is_tail) { /* If the call target is in a callee-saved register, it'll be * clobbered by the epilogue, so move it out of the way. We use * RAX since it's the return register in both SysV and Windows ABIs * and it doesn't require a REX prefix. */ if (ctx->callee_saved_regs & (1 << src1_loc)) { append_move_gpr(&code, RTLTYPE_ADDRESS, X86_AX, src1_loc); src1_loc = X86_AX; } /* Append the epilogue. We've already reserved space for it, so * this can never fail. */ handle->code_len = code.len; ASSERT(append_epilogue(ctx, false)); ASSERT(handle->code_buffer == code.buffer); ASSERT(handle->code_buffer_size == code.buffer_size); code.len = handle->code_len; /* Do the actual tail call. */ append_insn_ModRM_reg(&code, false, X86OP_MISC_FF, X86OP_MISC_FF_JMP_Ev, src1_loc); /* The following RETURN (if present) is no longer needed. */ if ((uint32_t)insn_index < unit->num_insns - 1) { /* RETURN should never start a new block. 
*/ ASSERT(insn_index < block->last_insn); unit->insns[insn_index+1].opcode = RTLOP_NOP; unit->insns[insn_index+1].src1 = 0; } } else { // not a tail call /* Do the actual call. */ append_insn_ModRM_reg(&code, false, X86OP_MISC_FF, X86OP_MISC_FF_CALL_Ev, src1_loc); /* If the return value is stored to an RTL register, move it to * where it belongs. */ const int dest = insn->dest; if (dest && ctx->regs[dest].host_reg != X86_AX) { append_move_gpr(&code, unit->regs[dest].type, ctx->regs[dest].host_reg, X86_AX); } /* Restore all registers we saved before the call. */ append_insn_ModRM_mem( &code, false, X86OP_MISC_0FAE, X86OP_MISC0FAE_LDMXCSR, X86_SP, -1, ctx->stack_mxcsr); uint32_t save_regs = insn->host_data_32; while (save_regs) { const int reg = ctz32(save_regs); save_regs ^= 1 << reg; ASSERT(ctx->stack_callsave[reg] >= 0); if (reg >= X86_XMM0) { append_insn_ModRM_mem(&code, false, X86OP_MOVAPS_V_W, reg, X86_SP, -1, ctx->stack_callsave[reg]); } else { append_insn_ModRM_mem(&code, true, X86OP_MOV_Gv_Ev, reg, X86_SP, -1, ctx->stack_callsave[reg]); } } } // if (is_tail) ASSERT(code.len - initial_len <= max_len); handle->code_len = code.len; ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; return true; } /*-----------------------------------------------------------------------*/ /** * translate_chain: Translate a CHAIN instruction. * * [Parameters] * ctx: Translation context. * insn_index: Index of instruction in ctx->unit->insns[]. * [Return value] * True on success, false if out of memory. */ static bool translate_chain(HostX86Context *ctx, int insn_index) { ASSERT(ctx); ASSERT(ctx->handle); ASSERT(ctx->unit); ASSERT(insn_index >= 0); ASSERT((uint32_t)insn_index < ctx->unit->num_insns); binrec_t * const handle = ctx->handle; const RTLUnit * const unit = ctx->unit; RTLInsn * const insn = &unit->insns[insn_index]; /* Reserve space as for a tail call (see notes in translate_call()), * plus extra for the chain logic itself. 
We don't attempt to * optimize immediates here (since there should never be a reason to), * and we handle the address separately, so the maximum setup length * from do_call_setup() is 2 REX GPR loads (8 bytes each). */ const int MAX_CHAIN_PREFIX_LEN = 17; // 7 bytes alignment + MOV R15,imm64 const int CHAIN_SUFFIX_LEN = 3; // MOV RAX,R15 const int MAX_TAIL_CALL_LEN = 2*8 + 107 + 2; const int max_len = MAX_CHAIN_PREFIX_LEN + MAX_TAIL_CALL_LEN + CHAIN_SUFFIX_LEN; if (UNLIKELY(handle->code_len + max_len > handle->code_buffer_size) && UNLIKELY(!binrec_ensure_code_space(handle, max_len))) { log_error(handle, "No memory for CHAIN instruction"); return false; } CodeBuffer code = {.buffer = handle->code_buffer, .buffer_size = handle->code_buffer_size, .len = handle->code_len}; const long initial_len = code.len; /* * We start the chain with a long jump over the chain code followed by * five bytes of 0x00. These will be replaced by MOV R15,addr (10 * bytes) once the address is known. We arrange for the jump to be * 64-bit aligned so that in the common case of an address with the * high 16 bits clear, we can write both the MOV R15 opcode and the * low 48 bits of the address in a single store, thus minimizing the * number of store operations that could potentially trigger a * self-modifying code condition. (It's superficially convenient that * the x86 architecture requires the CPU to detect modified code, but * that just ends up constraining the logic for actually modifying the * code.) * * Those initial 10 bytes are followed by the standard call setup to * get arguments into their proper places. After setup, we move the * address from R15 to RAX so it's not clobbered by the function * epilogue, then pop the stack and jump to RAX like a normal tail call. */ append_nops(&code, (8 - code.len) & 7); ASSERT(code.len % 8 == 0); insn->host_data_32 = code.len; // Save code location for CHAIN_RESOLVE. 
append_insn_R(&code, true, X86OP_MOV_rAX_Iv, X86_R15); append_imm64(&code, 0); do_call_setup(ctx, &code, insn_index, true, (int[]){-1}, insn->src1, insn->src2); append_move_gpr(&code, RTLTYPE_ADDRESS, X86_AX, X86_R15); handle->code_len = code.len; ASSERT(append_epilogue(ctx, false)); ASSERT(handle->code_buffer == code.buffer); ASSERT(handle->code_buffer_size == code.buffer_size); code.len = handle->code_len; append_insn_ModRM_reg(&code, false, X86OP_MISC_FF, X86OP_MISC_FF_JMP_Ev, X86_AX); /* Overwrite the MOV R15,imm64 opcode with a jump, now that we know * where it should land. */ const int disp = code.len - (insn->host_data_32 + 5); ASSERT(disp < 256); code.buffer[insn->host_data_32] = X86OP_JMP_Jz; code.buffer[insn->host_data_32 + 1] = (uint8_t)disp; ASSERT(code.len - initial_len <= max_len); handle->code_len = code.len; return true; } /*-----------------------------------------------------------------------*/ /** * translate_chain_resolve: Translate a CHAIN_RESOLVE instruction. * * [Parameters] * ctx: Translation context. * code: Output code buffer. * insn_index: Index of instruction in ctx->unit->insns[]. */ static void translate_chain_resolve(HostX86Context *ctx, CodeBuffer *code, int insn_index) { ASSERT(ctx); ASSERT(ctx->handle); ASSERT(ctx->unit); ASSERT(insn_index >= 0); ASSERT((uint32_t)insn_index < ctx->unit->num_insns); const RTLUnit * const unit = ctx->unit; RTLInsn * const insn = &unit->insns[insn_index]; /* There should never be a reason for the address to be spilled. 
*/
    ASSERT(!is_spilled(ctx, insn_index, insn->src1));
    const X86Register host_src1 = ctx->regs[insn->src1].host_reg;

    /* Locate the CHAIN instruction being resolved and the buffer offset
     * of its patchable 10-byte slot (stored in its host_data_32 field). */
    const int target_index = insn->src_imm;
    ASSERT(target_index >= 0);
    ASSERT(target_index < insn_index);
    const RTLInsn * const target_insn = &unit->insns[target_index];
    ASSERT(target_insn->opcode == RTLOP_CHAIN);
    const long target_offset = target_insn->host_data_32;
    ASSERT(target_offset + 10 <= code->len);
    ASSERT(target_offset % 8 == 0);

    /* Don't resolve the chain if the pointer is null.  The displacement
     * of this jump is filled in at the end of the function. */
    append_insn_ModRM_reg(code, true, X86OP_TEST_Ev_Gv, host_src1, host_src1);
    append_jump_raw(code, X86OP_JZ_Jb, 0);
    const long skip_from = code->len;

    /* If the high 16 bits of the value are nonzero, first store those to
     * the high bits of the immediate.  R15 is guaranteed to be saved by
     * the prologue due to the CHAIN instruction, so we can safely use it
     * as a temporary here. */
    append_insn_ModRM_reg(code, true, X86OP_MOV_Gv_Ev, X86_R15, host_src1);
    append_insn_ModRM_reg(code, true, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHR,
                          X86_R15);
    append_imm8(code, 48);
    /* Skip the 8-byte 16-bit store below if the high bits are zero. */
    append_jump_raw(code, X86OP_JZ_Jb, 8);
    const long low48_from = code->len;
    append_opcode(code, X86OP_OPERAND_SIZE);
    append_insn_ModRM_riprel(code, false, X86OP_MOV_Ev_Gv, X86_R15,
                             target_offset + 8);
    ASSERT(code->len == low48_from + 8);

    /* Shift the MOV R15 opcode into the bottom of the address and store
     * it over the branch.  We can safely destroy the existing value
     * because it dies here by contract (RTL mandates that a CHAIN_RESOLVE
     * instruction is immediately followed by RETURN). */
    append_insn_ModRM_reg(code, true, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHL,
                          host_src1);
    append_imm8(code, 16);
    append_insn_ModRM_reg(code, true, X86OP_IMM_Ev_Iz, X86OP_IMM_OR,
                          host_src1);
    /* Low two bytes of the patched instruction: REX prefix, then the
     * MOV R15,imm64 opcode byte. */
    append_imm32(code, X86OP_REX_WB
                       | (X86OP_MOV_rAX_Iv + (X86_R15 & 7)) << 8);
    append_insn_ModRM_riprel(code, true, X86OP_MOV_Ev_Gv, host_src1,
                             target_offset);

    /* Jump directly to the chain code so we don't force the caller to
     * look up the translated code for the target address a second time. */
    const long disp = target_offset - code->len;
    ASSERT(disp < 0);
    /* disp is measured from the start of the jump, so subtract the length
     * of the jump instruction itself (2 or 5 bytes). */
    if (disp-2 >= -128) {
        append_jump_raw(code, X86OP_JMP_Jb, disp-2);
    } else {
        append_jump_raw(code, X86OP_JMP_Jz, disp-5);
    }

    /* Patch the displacement byte of the null-pointer JZ emitted above. */
    const long skip_to = code->len;
    ASSERT(skip_to - skip_from < 128);
    code->buffer[skip_from - 1] = (uint8_t)(skip_to - skip_from);
}

/*-----------------------------------------------------------------------*/

/**
 * translate_fma:  Translate a fused multiply-add instruction.
 *
 * If the host advertises BINREC_FEATURE_X86_FMA, a single VEX-encoded
 * FMA instruction is emitted, with the 132/213/231 operand-order variant
 * chosen according to which operands are spilled or overlap the
 * destination.  Otherwise the operation is emitted as a separate
 * multiply, optional sign flip, and add or subtract.
 *
 * [Parameters]
 *     code: Output code buffer.
 *     ctx: Translation context.
 *     insn_index: Index of instruction in ctx->unit->insns[].
 *     fma_base_opcode: Base opcode (one of VF*132PS) when using the FMA
 *         instruction set extension.
 *     sub: True if the operation is FMSUB or FNMSUB.
 *     negate: True if the operation is FNMADD or FNMSUB.
 */
static void translate_fma(CodeBuffer *code, HostX86Context *ctx,
                          int insn_index, X86Opcode fma_base_opcode,
                          bool sub, bool negate)
{
    ASSERT(code);
    ASSERT(ctx);
    ASSERT(ctx->handle);
    ASSERT(ctx->unit);
    ASSERT(insn_index >= 0);
    ASSERT((uint32_t)insn_index < ctx->unit->num_insns);

    const RTLUnit * const unit = ctx->unit;
    const RTLInsn * const insn = &unit->insns[insn_index];
    const int dest = insn->dest;
    const int src1 = insn->src1;
    const int src2 = insn->src2;
    const int src3 = insn->src3;
    const X86Register host_dest = ctx->regs[dest].host_reg;
    const RTLDataType type = unit->regs[dest].type;
    const bool is_vector = rtl_type_is_vector(type);
    const RTLDataType scalar_type =
        is_vector ? rtl_vector_element_type(type) : type;
    const bool is64 = (scalar_type == RTLTYPE_FLOAT64);

    if (ctx->handle->setup.host_features & BINREC_FEATURE_X86_FMA) {

        const X86Register host_src1 = ctx->regs[src1].host_reg;
        const X86Register host_src3 = ctx->regs[src3].host_reg;
        ASSERT(ctx->regs[dest].temp_allocated);
        const X86Register host_temp = ctx->regs[dest].host_temp;
        const bool spilled1 = is_spilled(ctx, insn_index, src1);
        const bool spilled3 = is_spilled(ctx, insn_index, src3);

        /* Final opcode for the operation. */
        X86Opcode opcode;
        /* RTL register copied/loaded to the destination XMM register. */
        int dest_reg;
        /* RTL register used as the effective address operand. */
        int ea_reg;
        /* X86Register which is encoded in the VEX.vvvv field. */
        X86Register vex_vvvv;
        /* True if src1 needs to be reloaded to host_temp. */
        bool reload_src1 = false;

        /* Choose the opcode variant, assign RTL registers to XMM operands,
         * and record which registers need to be reloaded. */
        if (spilled3) {
            /* src3 is spilled, so use variant 213 which adds from the
             * effective address operand. */
            opcode = fma_base_opcode
                   + (X86OP_VFMADD213SS - X86OP_VFMADD132SS);
            dest_reg = src2;
            ea_reg = src3;
            if (spilled1) {
                reload_src1 = true;
                vex_vvvv = host_temp;
            } else {
                vex_vvvv = host_src1;
            }
        } else if (host_dest == host_src3) {
            /* dest overlaps src3, so use variant 231 which overwrites
             * the addend. */
            opcode = fma_base_opcode
                   + (X86OP_VFMADD231SS - X86OP_VFMADD132SS);
            dest_reg = src3;
            ea_reg = src2;
            if (spilled1) {
                reload_src1 = true;
                vex_vvvv = host_temp;
            } else {
                vex_vvvv = host_src1;
            }
        } else {
            /* For all other cases, we can use variant 132. */
            opcode = fma_base_opcode;
            dest_reg = src1;
            ea_reg = src2;
            vex_vvvv = host_src3;
        }

        /* Choose between vector and scalar versions of the instruction.
         * The scalar form is selected by setting the low opcode bit, which
         * must therefore be clear in the (vector) opcode chosen above. */
        ASSERT(!(opcode & 1));
        if (!rtl_register_is_vector(&unit->regs[dest])) {
            opcode |= 1;
        }

        /* Reload registers as needed. */
        append_move_or_load(code, ctx, unit, insn_index, host_dest, dest_reg);
        if (reload_src1) {
            append_load(code, type, host_temp, X86_SP, -1,
                        ctx->regs[src1].spill_offset);
        }

        /* Perform the actual operation. */
        append_vex_insn_ModRM_ctx(code, is64, false, opcode, host_dest,
                                  ctx, insn_index, ea_reg, vex_vvvv);

    } else {

        /* Without the FMA extension, we just do separate multiply/add
         * operations and eat the precision loss.  Yum. */

        const X86Register host_src2 = ctx->regs[src2].host_reg;
        /* Prefix byte (if nonzero) to emit ahead of each *PS opcode for
         * this data type. */
        const uint8_t prefix = sse_opcode_prefix_for_type(type);
        const X86Opcode add_opcode = sub ? X86OP_SUBPS : X86OP_ADDPS;

        if (host_dest == host_src2 && !is_spilled(ctx, insn_index, src2)) {
            /* src2 already occupies the destination register, so
             * multiply it by src1 in place. */
            if (prefix) {
                append_imm8(code, prefix);
            }
            append_insn_ModRM_ctx(code, false, X86OP_MULPS, host_dest,
                                  ctx, insn_index, src1);
        } else {
            append_move_or_load(code, ctx, unit, insn_index, host_dest, src1);
            if (prefix) {
                append_imm8(code, prefix);
            }
            append_insn_ModRM_ctx(code, false, X86OP_MULPS, host_dest,
                                  ctx, insn_index, src2);
        }

        if (negate) {
            /* Flip the sign of the product by XORing it with the
             * sign-bit constant of the matching width, which must have
             * been placed in the code buffer (nonzero const_loc entry). */
            const int lc_id =
                is_vector
                    ? (is64 ? LC_V2_FLOAT64_SIGNBIT : LC_V2_FLOAT32_SIGNBIT)
                    : (is64 ? LC_FLOAT64_SIGNBIT : LC_FLOAT32_SIGNBIT);
            const long lc_offset = ctx->const_loc[lc_id];
            ASSERT(lc_offset);
            append_insn_ModRM_riprel(code, false, X86OP_XORPS, host_dest,
                                     lc_offset);
        }

        if (prefix) {
            append_imm8(code, prefix);
        }
        append_insn_ModRM_ctx(code, false, add_opcode, host_dest,
                              ctx, insn_index, insn->src3);

    }
}

/*-----------------------------------------------------------------------*/

/**
 * translate_fzcast:  Translate an FZCAST instruction.
 *
 * [Parameters]
 *     ctx: Translation context.
 *     insn_index: Index of instruction in ctx->unit->insns[].
 * [Return value]
 *     True on success, false if out of memory.
*/
/*
 * Implementation notes for translate_fzcast():
 *
 * The function appends code through a local CodeBuffer copy of the
 * handle's output buffer and stores the new length back to
 * ctx->handle->code_len on each successful return.  A source which is not
 * a 64-bit integer is converted with a single 64-bit CVTSI2Sx.  For a
 * 64-bit source, up to max_len (73) bytes are reserved (measured from the
 * buffer length on entry), and the generated code branches on the sign
 * bit: values with the MSB clear are converted directly, while values
 * with the MSB set are halved, adjusted according to the MXCSR rounding
 * mode, converted, and then doubled by adding the result to itself.
 * The 64-bit path clobbers EFLAGS, so the cached test/compare state in
 * ctx is cleared before returning.
 */
static bool translate_fzcast(HostX86Context *ctx, int insn_index)
{
    ASSERT(ctx);
    ASSERT(ctx->handle);
    ASSERT(ctx->unit);
    ASSERT(insn_index >= 0);
    ASSERT((uint32_t)insn_index < ctx->unit->num_insns);

    CodeBuffer code = {.buffer = ctx->handle->code_buffer,
                       .buffer_size = ctx->handle->code_buffer_size,
                       .len = ctx->handle->code_len};
    const long initial_len = code.len;

    const RTLUnit * const unit = ctx->unit;
    const RTLInsn * const insn = &unit->insns[insn_index];
    const int dest = insn->dest;
    const int src1 = insn->src1;
    const X86Register host_dest = ctx->regs[dest].host_reg;

    /* A spilled source is reloaded into the temporary GPR attached to
     * dest; otherwise the source is read from its own host register. */
    X86Register host_src1;
    if (is_spilled(ctx, insn_index, src1)) {
        ASSERT(ctx->regs[dest].temp_allocated);
        host_src1 = ctx->regs[dest].host_temp;
        append_load_gpr(&code, unit->regs[src1].type, host_src1,
                        X86_SP, ctx->regs[src1].spill_offset);
    } else {
        host_src1 = ctx->regs[src1].host_reg;
    }

    const X86Opcode opcode = (unit->regs[dest].type == RTLTYPE_FLOAT64
                              ? X86OP_CVTSI2SD : X86OP_CVTSI2SS);

    if (!int_type_is_64(unit->regs[src1].type)) {
        /* The INT32 case is easy: 32-bit values in GPRs will always have
         * the high 32 bits clear, so we can just use the value as is,
         * treating it as a 64-bit signed integer. */
        append_insn_ModRM_reg(&code, true, opcode, host_dest, host_src1);
        ctx->handle->code_len = code.len;
        return true;
    }

    /* The x86 instruction set doesn't include an unsigned conversion
     * instruction, but we can take advantage of the fact that both
     * FLOAT32 and FLOAT64 have less than 64 bits of precision and
     * shift the value right if its MSB is set. */

    ASSERT(ctx->regs[dest].temp_allocated);

    /* ctx->handle->code_len has not been touched since entry, so this
     * reserves max_len bytes counted from initial_len (which therefore
     * also covers a spill reload emitted above). */
    const int max_len = 73;
    if (UNLIKELY(ctx->handle->code_len + max_len
                 > ctx->handle->code_buffer_size)) {
        if (UNLIKELY(!binrec_ensure_code_space(ctx->handle, max_len))) {
            log_error(ctx->handle, "No memory for FZCAST instruction");
            return false;
        }
        /* The buffer may have been replaced; refresh the local copy. */
        code.buffer = ctx->handle->code_buffer;
        code.buffer_size = ctx->handle->code_buffer_size;
    }

    /* Check whether the MSB is set. */
    append_insn_ModRM_reg(&code, true, X86OP_TEST_Ev_Gv,
                          host_src1, host_src1);
    const long js_pos = code.len;
    append_jump_raw(&code, X86OP_JS_Jb, 0);
    const long js_end = code.len;
    ASSERT(js_end == js_pos + 2);

    /* MSB not set: simple conversion. */
    append_insn_ModRM_reg(&code, true, opcode, host_dest, host_src1);
    const long msb_set_jmp_pos = code.len;
    append_jump_raw(&code, X86OP_JMP_Jb, 0);
    const long msb_set_jmp_end = code.len;
    ASSERT(msb_set_jmp_end == msb_set_jmp_pos + 2);
    /* Each short jump is emitted with a zero displacement and patched
     * (the byte just before xxx_end) once its target is known. */
    ASSERT(code.len - js_end <= 127);
    code.buffer[js_end-1] = (uint8_t)(code.len - js_end);

    /* MSB set: divide by 2 (with appropriate rounding) before conversion
     * and multiply by 2 afterward. */
    const X86Register host_temp = ctx->regs[dest].host_temp;
    if (host_src1 != host_temp) {
        append_move_gpr(&code, RTLTYPE_INT64, host_temp, host_src1);
    }
    append_insn_ModRM_mem(
        &code, false, X86OP_MISC_0FAE, X86OP_MISC0FAE_STMXCSR,
        X86_SP, -1, ctx->stack_mxcsr);
    append_insn_ModRM_reg(&code, true, X86OP_SHIFT_Ev_1, X86OP_SHIFT_SHR,
                          host_temp);
    /* NOTE(review): the adjustment below does not consult the bit shifted
     * out by SHR (the BT instructions overwrite CF) -- confirm whether
     * that is intentional. */

    /* BT copies the tested MXCSR bit into CF, so "bit is set" must be
     * tested with jump-if-carry.  The rounding-mode branches previously
     * used jump-if-NOT-carry, which dispatched every rounding mode to the
     * wrong path (e.g. incrementing when rounding down).  Toggling the
     * low bit of a Jcc opcode inverts its condition, which gives JC using
     * only an opcode name this function already relied on (equivalent to
     * a JC/JB alias if the opcode table provides one). */
    const X86Opcode jc_opcode = (X86Opcode)(X86OP_JNC_Jb ^ 1);

    /* RC={1,3}: round down */
    append_insn_ModRM_mem(&code, false, X86OP_BTx_Ev_Ib, X86OP_BITTEST_BT,
                          X86_SP, -1, ctx->stack_mxcsr);
    append_imm8(&code, 13);
    const long jc0_pos = code.len;
    append_jump_raw(&code, jc_opcode, 0);
    const long jc0_end = code.len;
    ASSERT(jc0_end == jc0_pos + 2);

    /* RC=2: round up */
    append_insn_ModRM_mem(&code, false, X86OP_BTx_Ev_Ib, X86OP_BITTEST_BT,
                          X86_SP, -1, ctx->stack_mxcsr);
    append_imm8(&code, 14);
    const long jc1_pos = code.len;
    append_jump_raw(&code, jc_opcode, 0);
    const long jc1_end = code.len;
    ASSERT(jc1_end == jc1_pos + 2);

    /* RC=0: round to even */
    maybe_append_empty_rex(&code, host_temp, -1, -1);
    append_insn_ModRM_reg(&code, false, X86OP_UNARY_Eb, X86OP_UNARY_TEST,
                          host_temp);
    append_imm8(&code, 1);
    const long jz_pos = code.len;
    append_jump_raw(&code, X86OP_JZ_Jb, 0);
    const long jz_end = code.len;
    ASSERT(jz_end == jz_pos + 2);

    /* Increment halved value if rounding up.  (Target of the RC=2 branch;
     * the round-to-even path falls through to here when the halved value
     * is odd.) */
    ASSERT(code.len - jc1_end <= 127);
    code.buffer[jc1_end-1] = (uint8_t)(code.len - jc1_end);
    append_insn_ModRM_reg(&code, true, X86OP_IMM_Ev_Ib, X86OP_IMM_ADD,
                          host_temp);
    append_imm8(&code, 1);

    /* Convert rounded value and double.  (Target of the round-down branch
     * and of the round-to-even "already even" branch.) */
    ASSERT(code.len - jz_end <= 127);
    code.buffer[jz_end-1] = (uint8_t)(code.len - jz_end);
    ASSERT(code.len - jc0_end <= 127);
    code.buffer[jc0_end-1] = (uint8_t)(code.len - jc0_end);
    append_insn_ModRM_reg(&code, true, opcode, host_dest, host_temp);
    const X86Opcode add_opcode = (unit->regs[dest].type == RTLTYPE_FLOAT64
                                  ? X86OP_ADDSD : X86OP_ADDSS);
    append_insn_ModRM_reg(&code, false, add_opcode, host_dest, host_dest);

    /* Both conversion paths rejoin here. */
    ASSERT(code.len - msb_set_jmp_end <= 127);
    code.buffer[msb_set_jmp_end-1] = (uint8_t)(code.len - msb_set_jmp_end);

    /* The sequence above modified EFLAGS, so forget any cached
     * test/compare state. */
    ctx->last_test_reg = 0;
    ctx->last_cmp_reg = 0;
    ctx->last_cmp_target = 0;
    ctx->last_cmp_imm = 0;

    ASSERT(code.len - initial_len <= max_len);
    ctx->handle->code_len = code.len;
    return true;
}

/*-----------------------------------------------------------------------*/

/**
 * translate_block:  Translate the given RTL basic block.
 *
 * [Parameters]
 *     ctx: Translation context.
 *     block_index: Index of basic block in ctx->unit->blocks[].
 * [Return value]
 *     True on success, false if out of memory.
 */
static bool translate_block(HostX86Context *ctx, int block_index)
{
    ASSERT(ctx);
    ASSERT(ctx->handle);
    ASSERT(ctx->unit);
    ASSERT(block_index >= 0);
    ASSERT(block_index < ctx->unit->num_blocks);

    binrec_t * const handle = ctx->handle;
    const RTLUnit * const unit = ctx->unit;
    const RTLBlock * const block = &unit->blocks[block_index];
    HostX86BlockInfo * const block_info = &ctx->blocks[block_index];
    CodeBuffer code = {.buffer = handle->code_buffer,
                       .buffer_size = handle->code_buffer_size,
                       .len = handle->code_len};

    block_info->unresolved_branch_offset = -1;
    bool fall_through = true;  // Does code fall through to the next block?
ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; ctx->last_cmp_target = 0; ctx->last_cmp_imm = 0; for (int insn_index = block->first_insn; insn_index <= block->last_insn; insn_index++) { const RTLInsn * const insn = &unit->insns[insn_index]; const int dest = insn->dest; const int src1 = insn->src1; const int src2 = insn->src2; /* Verify (if ENABLE_ASSERT) that all generated code fits within * the space we reserve per instruction here. Currently, the worst * possible case is BFINS with spill of dest, spilled src1 and src2, * all stack offsets >= 128, and a bit count greater than 32 (see * tests/host-x86/insn/bfins-spilled-max-output-len.c). */ const int MAX_INSN_LEN = 45; if (UNLIKELY(code.len + MAX_INSN_LEN > code.buffer_size)) { handle->code_len = code.len; if (UNLIKELY(!binrec_ensure_code_space(handle, MAX_INSN_LEN))) { log_error(handle, "No memory for instruction at %d", insn_index); return false; } code.buffer = handle->code_buffer; code.buffer_size = handle->code_buffer_size; } /* This is not "const" because we rewrite it for instructions which * do their own buffer size management. */ long initial_len = code.len; /* Evict the current occupant of the destination register if needed. 
*/ if (dest) { const X86Register host_dest = ctx->regs[dest].host_reg; const int spill_index = current_reg(ctx, insn_index, host_dest); if (spill_index) { const RTLRegister *spill_reg = &unit->regs[spill_index]; const HostX86RegInfo *spill_info = &ctx->regs[spill_index]; ASSERT(spill_info->spilled); ASSERT(spill_info->spill_insn == insn_index); append_store(&code, spill_reg->type, spill_info->host_reg, X86_SP, -1, spill_info->spill_offset); } ctx->reg_map[host_dest] = dest; } switch ((RTLOpcode)insn->opcode) { case RTLOP_NOP: if (insn->src_imm != 0) { append_opcode(&code, X86OP_NOP_Ev); append_ModRM(&code, X86MOD_DISP0, 0, X86MODRM_RIP_REL); append_imm32(&code, (uint32_t)insn->src_imm); if (insn->src_imm >> 32) { append_opcode(&code, X86OP_NOP_Ev); append_ModRM_SIB(&code, X86MOD_DISP32, 0, 0, X86SIB_NOINDEX, X86_SP); append_imm32(&code, (uint32_t)(insn->src_imm >> 32)); } } break; case RTLOP_SET_ALIAS: { /* We need to store to memory if (1) this is a terminal block, * (2) at least one successor block doesn't both (a) have a * mergeable GET_ALIAS and (b) SET_ALIAS the same alias, or * (3) this block or any successor block contains a non-tail * CALL or CALL_TRANSPARENT. (FIXME: (3) only needs to apply * if the alias has bound storage, but for our purposes it * shouldn't make a difference.) */ bool need_store = (block->exits[0] < 0 || block_info->has_nontail_call); for (int i = 0; !need_store && i < lenof(block->exits); i++) { const int successor = block->exits[i]; if (successor >= 0) { const int reg = ctx->blocks[successor].alias_load[insn->alias]; if (!reg || !ctx->regs[reg].merge_alias || !ctx->blocks[successor].alias_store[insn->alias] || ctx->blocks[successor].has_nontail_call) { need_store = true; } } } if (need_store) { if (is_spilled(ctx, insn_index, src1)) { const RTLRegister *src1_reg = &unit->regs[src1]; const X86Register temp_reg = (rtl_register_is_int(src1_reg) ? 
X86_R15 : X86_XMM15); append_load(&code, src1_reg->type, temp_reg, X86_SP, -1, ctx->regs[src1].spill_offset); append_store_alias(&code, ctx, &unit->aliases[insn->alias], temp_reg); } else { append_store_alias(&code, ctx, &unit->aliases[insn->alias], ctx->regs[src1].host_reg); } } break; } // case RTLOP_SET_ALIAS case RTLOP_GET_ALIAS: /* Register allocation informs us whether we need to load * from memory. */ if (!ctx->regs[dest].merge_alias) { append_load_alias(&code, ctx, &unit->aliases[insn->alias], ctx->regs[dest].host_reg); } else if (ctx->regs[dest].host_merge != ctx->regs[dest].host_reg) { /* The value is already loaded, but we need to move it to * a different register. */ append_move(&code, unit->regs[dest].type, ctx->regs[dest].host_reg, ctx->regs[dest].host_merge); } break; case RTLOP_MOVE: append_move_or_load(&code, ctx, unit, insn_index, ctx->regs[dest].host_reg, src1); break; case RTLOP_SELECT: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; /* Set the x86 condition flags based on the condition register. */ X86CondCode condition; if (insn->host_data_16) { condition = insn->host_data_16 & 0xF; const int32_t cmp2 = insn->host_data_32; const bool cmp2_is_imm = (insn->host_data_16 & 0x10) != 0; if (insn->host_data_16 & 0x40) { /* 0x40 implies FTESTEXC, which always has src1 of type * FPSTATE and an immediate RTLFloatException src2. */ ASSERT(cmp2_is_imm); if (!is_spilled(ctx, insn_index, insn->src3)) { maybe_append_empty_rex( &code, ctx->regs[insn->src3].host_reg, -1, -1); } append_insn_ModRM_ctx( &code, false, X86OP_UNARY_Eb, X86OP_UNARY_TEST, ctx, insn_index, insn->src3); append_imm8(&code, rtlfexc_to_bits(cmp2)); ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; } else { append_compare(ctx, insn_index, &code, insn->src3, cmp2_is_imm ? 
0 : cmp2, cmp2, (insn->host_data_16 >> 8) & 0x1F, (condition & 0xE) == X86CC_Z, (insn->host_data_16 & 0x20) != 0, -1); } } else { condition = X86CC_NZ; append_compare(ctx, insn_index, &code, insn->src3, 0, 0, 0, true, false, -1); } /* Put one of the source values in the destination register, if * necessary. Note that MOV does not alter flags. */ bool dest_is_src1; if (host_dest == host_src1 && !is_spilled(ctx, insn_index, src1)) { dest_is_src1 = true; } else if (host_dest == host_src2 && !is_spilled(ctx, insn_index, src2)) { dest_is_src1 = false; } else { dest_is_src1 = true; append_move_or_load(&code, ctx, unit, insn_index, host_dest, src1); } /* Conditionally move the other value into the register. */ const int other = dest_is_src1 ? src2 : src1; if (rtl_register_is_float(&unit->regs[dest])) { if (!dest_is_src1) { condition ^= 1; } const X86Opcode opcode = X86OP_Jcc_Jb | condition; append_opcode(&code, opcode); append_imm8(&code, 0); const long jump_from = code.len; append_move_or_load(&code, ctx, unit, insn_index, host_dest, other); const long jump_to = code.len; ASSERT(jump_to - jump_from <= 127); code.buffer[jump_from - 1] = jump_to - jump_from; } else { ASSERT(rtl_register_is_int(&unit->regs[dest]) || unit->regs[dest].type == RTLTYPE_FPSTATE); if (dest_is_src1) { condition ^= 1; } const bool is64 = int_type_is_64(unit->regs[dest].type); const X86Opcode opcode = X86OP_CMOVcc | condition; append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, other); } break; } // case RTLOP_SELECT case RTLOP_SCAST: case RTLOP_ZCAST: { const RTLDataType type_dest = unit->regs[dest].type; const RTLDataType type_src1 = unit->regs[src1].type; const X86Register host_dest = ctx->regs[dest].host_reg; X86Register host_src1 = ctx->regs[src1].host_reg; if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, type_src1, host_dest, X86_SP, ctx->regs[src1].spill_offset); host_src1 = host_dest; } if (int_type_is_64(type_dest)) { if (int_type_is_64(type_src1)) { if 
(host_dest != host_src1) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Ev_Gv, host_src1, host_dest); } } else { if (insn->opcode == RTLOP_SCAST) { append_insn_ModRM_reg(&code, true, X86OP_MOVSXD_Gv_Ev, host_dest, host_src1); } else if (host_dest != host_src1) { /* We can skip the MOV if host_dest == host_src1 * because all 32-bit operations clear the high * word of the output register. */ append_insn_ModRM_reg(&code, false, X86OP_MOV_Ev_Gv, host_src1, host_dest); } } } else { /* When converting from 64 to 32 bits, we need a MOV even * if the source and destination are in the same register * in order to clear the high 32 bits of the register. */ if (int_type_is_64(type_src1) || host_dest != host_src1) { append_insn_ModRM_reg(&code, false, X86OP_MOV_Ev_Gv, host_src1, host_dest); } } break; } // case RTLOP_SCAST, RTLOP_ZCAST case RTLOP_SEXT8: case RTLOP_SEXT16: { const X86Register host_dest = ctx->regs[dest].host_reg; const bool is64 = int_type_is_64(unit->regs[dest].type); const X86Opcode opcode = (insn->opcode == RTLOP_SEXT8 ? X86OP_MOVSX_Gv_Eb : X86OP_MOVSX_Gv_Ew); if (insn->opcode == RTLOP_SEXT8 && !is64 && !is_spilled(ctx, insn_index, src1)) { maybe_append_empty_rex(&code, ctx->regs[src1].host_reg, host_dest, -1); } append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src1); break; } // case RTLOP_SEXT8, RTLOP_SEXT16 case RTLOP_NEG: if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; /* We could almost forward the EFLAGS result as an ordered * compare against zero, but two cases would result in * incorrect behavior: * - Negating a zero operand sets CF=1, which breaks the * AE (CF=0) and B (CF=1) tests. * - Negating the smallest negative integer (0x8000_0000 * for INT32) sets OF=1, which breaks the signed tests * (the result would be treated as positive instead of * negative). * So we have to perform an explicit compare for a * subsequent SLT/SGT operation. 
*/ ctx->last_cmp_reg = 0; } /* Fall through to common NEG/NOT handling. */ case RTLOP_NOT: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, unit->regs[src1].type, host_dest, X86_SP, ctx->regs[src1].spill_offset); } else if (host_dest != host_src1) { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Ev_Gv, host_src1, host_dest); } const X86UnaryOpcode opcode = insn->opcode == RTLOP_NOT ? X86OP_UNARY_NOT : X86OP_UNARY_NEG; append_insn_ModRM_reg(&code, is64, X86OP_UNARY_Ev, opcode, host_dest); break; } // case RTLOP_NEG, RTLOP_NOT case RTLOP_ADD: case RTLOP_SUB: case RTLOP_AND: case RTLOP_OR: case RTLOP_XOR: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); const X86Opcode opcode = ( insn->opcode == RTLOP_ADD ? X86OP_ADD_Gv_Ev : insn->opcode == RTLOP_SUB ? X86OP_SUB_Gv_Ev : insn->opcode == RTLOP_AND ? X86OP_AND_Gv_Ev : insn->opcode == RTLOP_OR ? X86OP_OR_Gv_Ev : /* RTLOP_XOR */ X86OP_XOR_Gv_Ev); if (host_dest == host_src2 && !is_spilled(ctx, insn_index, src2)) { append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src1); } else { append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src2); } if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } break; } // case RTLOP_{ADD,SUB,AND,OR,XOR} case RTLOP_MUL: { /* This case is identical to RTLOP_ADD (etc.), but it's * separated out to aid optimization, since the x86 IMUL * instruction is two bytes (0F AF) as opposed to the other * ALU instructions which are one byte. 
*/ const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); const X86Opcode opcode = X86OP_IMUL_Gv_Ev; if (host_dest == host_src2 && !is_spilled(ctx, insn_index, src2)) { append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src1); } else { append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src2); } /* We don't need to test the CONDITION_CODES optimization * flag if we're just clearing the current state. */ ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_MUL case RTLOP_MULHU: case RTLOP_MULHS: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); ASSERT(host_dest != X86_AX); /* If another value is already in rDX, save it away so it's not * clobbered. We save all 64 bits of the register so we don't * have to worry about checking the type of what's there. */ const bool dx_busy = (host_dest != X86_DX && current_reg(ctx, insn_index, X86_DX)); bool swapped_dx = false; if (dx_busy) { /* If dest shares a hardware register with src1 or src2, * we need to preserve its value until the actual multiply; * otherwise, we can use a MOV for potentially less latency. */ if (host_dest == host_src1 || host_dest == host_src2) { swapped_dx = true; append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, host_dest, X86_DX); } else { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, host_dest, X86_DX); } } /* The register allocator gives us a temporary iff rAX is live * across this instruction. 
*/ if (ctx->regs[dest].temp_allocated) { ASSERT(ctx->regs[dest].host_temp != X86_DX); append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, ctx->regs[dest].host_temp, X86_AX); } int multiplier; X86Register host_mult; if (host_src2 == X86_AX && !is_spilled(ctx, insn_index, src2)) { multiplier = src1; host_mult = host_src1; } else { /* Watch out for the input operands being moved around * by XCHG! */ X86Register multiplicand = host_src1; if (swapped_dx) { if (multiplicand == X86_DX) { multiplicand = host_dest; } else if (multiplicand == host_dest) { multiplicand = X86_DX; } } /* Can't use append_move_or_load_gpr() here because of the * possible rDX swap. */ if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, unit->regs[src1].type, X86_AX, X86_SP, ctx->regs[src1].spill_offset); } else if (host_src1 != X86_AX) { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, X86_AX, multiplicand); } multiplier = src2; host_mult = host_src2; } const X86UnaryOpcode opcode = ( insn->opcode == RTLOP_MULHU ? 
X86OP_UNARY_MUL_rAX: /* RTLOP_MULHS */ X86OP_UNARY_IMUL_rAX); if (swapped_dx && !is_spilled(ctx, insn_index, multiplier)) { if (host_mult == X86_DX) { host_mult = host_dest; } else if (host_mult == host_dest) { host_mult = X86_DX; } append_insn_ModRM_reg(&code, is64, X86OP_UNARY_Ev, opcode, host_mult); } else { append_insn_ModRM_ctx(&code, is64, X86OP_UNARY_Ev, opcode, ctx, insn_index, multiplier); } if (host_dest != X86_DX) { if (dx_busy) { append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, X86_DX, host_dest); } else { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, X86_DX); } } if (ctx->regs[dest].temp_allocated) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_AX, ctx->regs[dest].host_temp); } ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_MULH[US] case RTLOP_DIVU: case RTLOP_DIVS: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); ASSERT(host_dest != host_src2); ASSERT(host_dest != X86_DX); X86Register divisor = host_src2; /* As with MULH*, the presence of a temporary register means * we need to save the non-result output register. For * division, that means either the register is live across this * instruction or it's assigned to src2 and we need to move * src2 out of the way of the dividend registers (rDX:rAX). */ if (ctx->regs[dest].temp_allocated) { ASSERT(ctx->regs[dest].host_temp != X86_AX); append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, ctx->regs[dest].host_temp, X86_DX); if (divisor == X86_DX) { divisor = ctx->regs[dest].host_temp; } } ASSERT(divisor != X86_DX); /* As with MULH*, save the result register's value if needed. * For division, we never allocate dest and src2 in the same * register, so we only need to XCHG if host_dest == host_src1. 
*/ X86Register dividend = host_src1; const bool ax_busy = (host_dest != X86_AX && current_reg(ctx, insn_index, X86_AX)); if (host_dest != X86_AX && (divisor == X86_AX || ax_busy)) { if (dividend == host_dest) { append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, host_dest, X86_AX); dividend = X86_AX; } else { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, host_dest, X86_AX); } /* RAX will be destroyed before the divide instruction, so * make sure not to use it as the divisor in any case. */ if (divisor == X86_AX) { divisor = host_dest; } } ASSERT(divisor != X86_AX); if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, unit->regs[src1].type, X86_AX, X86_SP, ctx->regs[src1].spill_offset); } else if (dividend != X86_AX) { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, X86_AX, host_src1); } X86UnaryOpcode opcode; if (insn->opcode == RTLOP_DIVU) { append_insn_ModRM_reg(&code, false, X86OP_XOR_Gv_Ev, X86_DX, X86_DX); opcode = X86OP_UNARY_DIV_rAX; } else { append_insn(&code, is64, X86OP_CWD); opcode = X86OP_UNARY_IDIV_rAX; } if (is_spilled(ctx, insn_index, src2)) { append_insn_ModRM_ctx(&code, is64, X86OP_UNARY_Ev, opcode, ctx, insn_index, src2); } else { append_insn_ModRM_reg(&code, is64, X86OP_UNARY_Ev, opcode, divisor); } if (host_dest != X86_AX) { if (ax_busy) { append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, X86_AX, host_dest); } else { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, X86_AX); } } if (ctx->regs[dest].temp_allocated) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_DX, ctx->regs[dest].host_temp); } ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_DIV[US] case RTLOP_MODU: case RTLOP_MODS: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); ASSERT(host_dest != host_src2); ASSERT(host_dest != X86_AX); X86Register divisor = host_src2; if 
(ctx->regs[dest].temp_allocated) { ASSERT(ctx->regs[dest].host_temp != X86_DX); append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, ctx->regs[dest].host_temp, X86_AX); if (divisor == X86_AX) { divisor = ctx->regs[dest].host_temp; } } ASSERT(divisor != X86_AX); append_move_or_load_gpr(&code, ctx, unit, insn_index, X86_AX, src1); /* Save the result register's value if necesssary. For modulo, * we take care of moving src1 to rAX first and we never * allocate dest and src2 in the same register, so this can * always be a regular MOV. */ const bool dx_busy = (host_dest != X86_DX && current_reg(ctx, insn_index, X86_DX)); if (host_dest != X86_DX) { if (divisor == X86_DX || dx_busy) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, host_dest, X86_DX); } if (divisor == X86_DX) { divisor = host_dest; } } ASSERT(divisor != X86_DX); X86UnaryOpcode opcode; if (insn->opcode == RTLOP_MODU) { append_insn_ModRM_reg(&code, false, X86OP_XOR_Gv_Ev, X86_DX, X86_DX); opcode = X86OP_UNARY_DIV_rAX; } else { append_insn(&code, is64, X86OP_CWD); opcode = X86OP_UNARY_IDIV_rAX; } if (is_spilled(ctx, insn_index, src2)) { append_insn_ModRM_ctx(&code, is64, X86OP_UNARY_Ev, opcode, ctx, insn_index, src2); } else { append_insn_ModRM_reg(&code, is64, X86OP_UNARY_Ev, opcode, divisor); } if (host_dest != X86_DX) { if (dx_busy) { append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, X86_DX, host_dest); } else { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, X86_DX); } } if (ctx->regs[dest].temp_allocated) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_AX, ctx->regs[dest].host_temp); } ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_MOD[US] case RTLOP_SLL: case RTLOP_SRL: case RTLOP_SRA: if (handle->setup.host_features & BINREC_FEATURE_X86_BMI2) { const X86Register host_dest = ctx->regs[dest].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); const X86Opcode opcode = ( insn->opcode == RTLOP_SLL ? X86OP_SHLX : insn->opcode == RTLOP_SRL ? 
X86OP_SHRX : /* RTLOP_SRA */ X86OP_SARX); /* There's also a RORX instruction in BMI2, but it takes an * immediate count instead of a register count because... * uh, I dunno, I guess because Intel wanted to annoy * compiler writers? */ X86Register host_shift; if (is_spilled(ctx, insn_index, src2)) { ASSERT(host_dest != ctx->regs[src1].host_reg || is_spilled(ctx, insn_index, src1)); /* 32-bit load even if src2 is 64 bits wide because we * only need the low 5-6 bits of the value. */ append_load_gpr(&code, RTLTYPE_INT32, host_dest, X86_SP, ctx->regs[src2].spill_offset); host_shift = host_dest; } else { host_shift = ctx->regs[src2].host_reg; } append_vex_insn_ModRM_ctx( &code, is64, false, opcode, host_dest, ctx, insn_index, src1, host_shift); break; } /* Otherwise fall through to non-BMI2 handling. */ case RTLOP_ROL: case RTLOP_ROR: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool src2_spilled = is_spilled(ctx, insn_index, src2); ASSERT(host_dest != X86_CX); const bool is64 = int_type_is_64(unit->regs[src1].type); const X86ShiftOpcode opcode = ( insn->opcode == RTLOP_SLL ? X86OP_SHIFT_SHL : insn->opcode == RTLOP_SRL ? X86OP_SHIFT_SHR : insn->opcode == RTLOP_SRA ? X86OP_SHIFT_SAR : insn->opcode == RTLOP_ROL ? X86OP_SHIFT_ROL : /* RTLOP_ROR */ X86OP_SHIFT_ROR); append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); /* If we couldn't allocate rCX for the second operand, swap * it with whatever's in there now. This has to come after the * src1->dest copy! */ bool swapped_cx = false; if (ctx->regs[dest].temp_allocated) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, ctx->regs[dest].host_temp, X86_CX); } if (src2_spilled) { /* If src2 is spilled but we don't have a temporary, rCX * must be free, so we can just overwrite it. If we do * have a temporary, we just saved rCX away, so again we * can just load straight into it. 
For this specific case, * we always use a 32-bit load even if src2 is a 64-bit * value, since only the low 5-6 bits of the value matter. * (x86 is little-endian, so we don't have to adjust the * load address to do this.) */ append_load_gpr(&code, RTLTYPE_INT32, X86_CX, X86_SP, ctx->regs[src2].spill_offset); } else if (host_src2 != X86_CX) { if (current_reg(ctx, insn_index, X86_CX)) { append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, X86_CX, host_src2); swapped_cx = true; } else { append_insn_ModRM_reg(&code, false, X86OP_MOV_Gv_Ev, X86_CX, host_src2); } } append_insn_ModRM_reg(&code, is64, X86OP_SHIFT_Ev_CL, opcode, host_dest); /* If we had to save or swap rCX, restore the original register * values. But prefer MOV over XCHG (and discard src2) if src2 * dies on this instruction, since MOV can be zero-latency. */ if (ctx->regs[dest].temp_allocated) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_CX, ctx->regs[dest].host_temp); } else if (swapped_cx) { if (unit->regs[src2].death == insn_index) { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_CX, host_src2); } else { append_insn_ModRM_reg(&code, true, X86OP_XCHG_Ev_Gv, X86_CX, host_src2); } } /* The shift instructions don't change any flags if the shift * count is zero, so we can't rely on the state of ZF. */ ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_{ROL,ROR} (and non-BMI2 SLL/SRL/SRA) case RTLOP_CLZ: { const X86Register host_dest = ctx->regs[dest].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); if (handle->setup.host_features & BINREC_FEATURE_X86_LZCNT) { append_insn_ModRM_ctx(&code, is64, X86OP_LZCNT, host_dest, ctx, insn_index, src1); } else { ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; append_insn_ModRM_ctx(&code, is64, X86OP_BSR, host_dest, ctx, insn_index, src1); append_insn_R(&code, false, X86OP_MOV_rAX_Iv, host_temp); append_imm32(&code, is64 ? 
127 : 63); /* This can always be a 32-bit operation regardless of the * input data type. */ append_insn_ModRM_reg(&code, false, X86OP_CMOVZ, host_dest, host_temp); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Ib, X86OP_IMM_XOR, host_dest); append_imm8(&code, is64 ? 63 : 31); } if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } break; } // case RTLOP_CLZ case RTLOP_BSWAP: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, unit->regs[src1].type, host_dest, X86_SP, ctx->regs[src1].spill_offset); } else if (host_dest != host_src1) { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Ev_Gv, host_src1, host_dest); } append_insn_R(&code, is64, X86OP_BSWAP_rAX, host_dest); break; } // case RTLOP_BSWAP case RTLOP_SEQ: case RTLOP_SLTU: case RTLOP_SLTS: case RTLOP_SGTU: case RTLOP_SGTS: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Opcode set_opcode = ( insn->opcode == RTLOP_SLTU ? X86OP_SETB : insn->opcode == RTLOP_SLTS ? X86OP_SETL : insn->opcode == RTLOP_SGTU ? X86OP_SETA : insn->opcode == RTLOP_SGTS ? X86OP_SETG : /* RTLOP_SEQ */ X86OP_SETZ); /* On current-generation Intel processors, XOR reg,reg followed * by SETcc has less latency than SETcc followed by MOVZX, * because the processor recognizes XOR as a zero idiom and * doesn't impose a partial register stall on subsequent use of * the target GPR. However, XOR modifies the EFLAGS register, * so we can only make use of it if we're not omitting the * compare, and in that case we have to do the XOR first. * Naturally, this implies we also can't use XOR if the * destination register overlaps either of the source registers. 
*/ const bool should_clear_dest = (!is_spilled(ctx, insn_index, src1) && host_dest != ctx->regs[src1].host_reg && (is_spilled(ctx, insn_index, src2) || host_dest != ctx->regs[src2].host_reg)); /* We don't bother checking for SEQ since icmp_eq is only used * for register-immediate compares. */ const bool added_compare = append_compare( ctx, insn_index, &code, src1, src2, 0, host_dest, false, false, should_clear_dest ? (int)host_dest : -1); const bool cleared_dest = should_clear_dest && added_compare; /* Registers SP-DI require a REX prefix (even if empty) to * access the low byte as a byte register. */ maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, set_opcode, 0, host_dest); if (!cleared_dest) { maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, X86OP_MOVZX_Gv_Eb, host_dest, host_dest); } break; } // case RTLOP_{SEQ,SLTU,SLTS,SGTU,SGTS} case RTLOP_BFEXT: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); const int operand_size = is64 ? 64 : 32; /* Despite documentation suggesting otherwise, it turns out * BEXTR has slightly less latency than a MOV/SHR/AND sequence * even including the extra instruction to load the control * byte, so we use it if extracting from the middle of a * register. */ if ((handle->setup.host_features & BINREC_FEATURE_X86_BMI1) && insn->bitfield.start != 0 && insn->bitfield.start + insn->bitfield.count < operand_size) { /* For this case, we use dest as a temporary to hold the * control byte, so it needs to be separate from src1. 
*/ ASSERT(host_dest != host_src1 || is_spilled(ctx, insn_index, src1)); append_insn_R(&code, false, X86OP_MOV_rAX_Iv, host_dest); append_imm32(&code, insn->bitfield.start | insn->bitfield.count << 8); append_vex_insn_ModRM_ctx( &code, is64, false, X86OP_BEXTR, host_dest, ctx, insn_index, src1, host_dest); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } break; } X86Register host_shifted; if (insn->bitfield.start != 0) { append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, is64, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHR, host_dest); append_imm8(&code, insn->bitfield.start); host_shifted = host_dest; } else if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, unit->regs[src1].type, host_dest, X86_SP, ctx->regs[src1].spill_offset); host_shifted = host_dest; } else { host_shifted = host_src1; } if (insn->bitfield.start + insn->bitfield.count < operand_size) { if (insn->bitfield.count < 8) { if (host_shifted != host_dest) { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, host_shifted); } append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Ib, X86OP_IMM_AND, host_dest); append_imm8(&code, (1 << insn->bitfield.count) - 1); } else if (insn->bitfield.count == 8) { maybe_append_empty_rex(&code, host_shifted, host_dest, -1); append_insn_ModRM_reg(&code, is64, X86OP_MOVZX_Gv_Eb, host_dest, host_shifted); } else if (insn->bitfield.count == 16) { append_insn_ModRM_reg(&code, is64, X86OP_MOVZX_Gv_Ew, host_dest, host_shifted); } else if (insn->bitfield.count < 32) { if (host_shifted != host_dest) { append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, host_shifted); } append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_dest); append_imm32(&code, (1 << insn->bitfield.count) - 1); } else if (insn->bitfield.count == 32) { append_insn_ModRM_reg(&code, false, X86OP_MOV_Gv_Ev, host_dest, host_shifted); } else { X86Register 
host_andsrc; if (host_shifted != host_dest) { host_andsrc = host_shifted; append_insn_R(&code, true, X86OP_MOV_rAX_Iv, host_dest); } else { ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; host_andsrc = host_temp; append_insn_R(&code, true, X86OP_MOV_rAX_Iv, host_temp); } append_imm64(&code, (UINT64_C(1) << insn->bitfield.count) - 1); append_insn_ModRM_reg(&code, is64, X86OP_AND_Gv_Ev, host_dest, host_andsrc); } } else if (host_dest != host_shifted) { /* This implies that the instruction is "extracting" the * entire register contents. */ append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, host_shifted); } /* Whether flags are set depends on the location of the * bitfield in the source value. There probably aren't many * cases in which we'll want to save flags from a BFEXT anyway, * so we don't bother with the details and just clear the * cached state unconditionally. */ ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_BFEXT case RTLOP_BFINS: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const X86Register host_src2 = ctx->regs[src2].host_reg; const bool src2_spilled = is_spilled(ctx, insn_index, src2); ASSERT(host_dest != host_src2 || src2_spilled); const bool is64 = int_type_is_64(unit->regs[src1].type); const int operand_size = is64 ? 64 : 32; if (UNLIKELY(insn->bitfield.count == operand_size)) { /* Handle this case specially not so much for optimization * purposes (since it should normally be optimized to a * simple move at the RTL level) but because handling it * correctly in the normal path takes extra effort. */ append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, host_src2); break; } /* Copy the first source into the destination, masking off the * bits to be overwritten. 
*/ if (is64 && insn->bitfield.start + insn->bitfield.count > 31) { const uint64_t src2_mask = (UINT64_C(1) << insn->bitfield.count) - 1; const uint64_t src1_mask = ~(src2_mask << insn->bitfield.start); if (host_dest == host_src1 && !is_spilled(ctx, insn_index, src1)) { ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; ASSERT(host_temp != host_src2); append_insn_R(&code, true, X86OP_MOV_rAX_Iv, host_temp); append_imm64(&code, src1_mask); append_insn_ModRM_reg(&code, true, X86OP_AND_Gv_Ev, host_dest, host_temp); } else { append_insn_R(&code, true, X86OP_MOV_rAX_Iv, host_dest); append_imm64(&code, src1_mask); append_insn_ModRM_ctx(&code, true, X86OP_AND_Gv_Ev, host_dest, ctx, insn_index, src1); } } else { const uint32_t src2_mask = (1 << insn->bitfield.count) - 1; const uint32_t src1_mask = ~(src2_mask << insn->bitfield.start); if (src1_mask == 0x000000FF) { if (!is_spilled(ctx, insn_index, src1)) { maybe_append_empty_rex(&code, host_src1, host_dest, -1); } append_insn_ModRM_ctx(&code, is64, X86OP_MOVZX_Gv_Eb, host_dest, ctx, insn_index, src1); } else if (src1_mask == 0x0000FFFF) { append_insn_ModRM_ctx(&code, is64, X86OP_MOVZX_Gv_Ew, host_dest, ctx, insn_index, src1); } else { append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); if (src1_mask >= 0xFFFFFF80) { append_insn_ModRM_reg(&code, is64, X86OP_IMM_Ev_Ib, X86OP_IMM_AND, host_dest); append_imm8(&code, (uint8_t)src1_mask); } else { append_insn_ModRM_reg(&code, is64, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_dest); append_imm32(&code, src1_mask); } } } /* Copy the bits to be inserted to the temporary register, * shifting them to the appropriate place. But reuse src2 as * the temporary register if it's not spilled and it dies on * this instruction. 
*/ X86Register host_newbits; if (!src2_spilled && unit->regs[src2].death == insn_index) { host_newbits = host_src2; } else { ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; /* This assertion will hold even if src2 is currently * spilled, because whatever register is occupying * host_src2 must be live past this instruction. */ ASSERT(host_temp != host_src2); host_newbits = host_temp; } if (insn->bitfield.count > 32) { /* We can't use a 64-bit immediate value with AND, so * shift the value left and (if necessary) right again. */ ASSERT(is64); if (host_newbits != host_src2) { append_insn_ModRM_ctx(&code, true, X86OP_MOV_Gv_Ev, host_newbits, ctx, insn_index, src2); } append_insn_ModRM_reg(&code, true, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHL, host_newbits); append_imm8(&code, 64 - insn->bitfield.count); const int shr_count = 64 - (insn->bitfield.start + insn->bitfield.count); if (shr_count > 0) { append_insn_ModRM_reg(&code, true, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHR, host_newbits); append_imm8(&code, shr_count); } } else { if (insn->bitfield.start + insn->bitfield.count == operand_size) { if (host_newbits != host_src2) { append_insn_ModRM_ctx( &code, is64, X86OP_MOV_Gv_Ev, host_newbits, ctx, insn_index, src2); } } else if (insn->bitfield.count < 8) { if (host_newbits != host_src2) { /* This can safely be a 32-bit move even if src2 * is a spilled 64-bit value, since x86 is * little-endian. 
*/ append_insn_ModRM_ctx( &code, false, X86OP_MOV_Gv_Ev, host_newbits, ctx, insn_index, src2); } append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Ib, X86OP_IMM_AND, host_newbits); append_imm8(&code, (1 << insn->bitfield.count) - 1); } else if (insn->bitfield.count == 8) { if (!src2_spilled) { maybe_append_empty_rex(&code, host_src2, host_newbits, -1); } append_insn_ModRM_ctx( &code, false, X86OP_MOVZX_Gv_Eb, host_newbits, ctx, insn_index, src2); } else if (insn->bitfield.count == 16) { append_insn_ModRM_ctx( &code, false, X86OP_MOVZX_Gv_Ew, host_newbits, ctx, insn_index, src2); } else if (insn->bitfield.count == 32) { append_insn_ModRM_ctx( &code, false, X86OP_MOV_Gv_Ev, host_newbits, ctx, insn_index, src2); } else { if (host_newbits != host_src2) { append_insn_ModRM_ctx( &code, false, X86OP_MOV_Gv_Ev, host_newbits, ctx, insn_index, src2); } append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_newbits); append_imm32(&code, (1 << insn->bitfield.count) - 1); } if (insn->bitfield.start > 0) { append_insn_ModRM_reg(&code, is64, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHL, host_newbits); append_imm8(&code, insn->bitfield.start); } } /* OR the new bits into the destination. */ append_insn_ModRM_reg(&code, is64, X86OP_OR_Gv_Ev, host_dest, host_newbits); ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_BFINS case RTLOP_ANDI: /* AND with 255 or 65535 can be translated to a zero-extend. 
*/ if (insn->src_imm == 0xFF) { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; if (!is_spilled(ctx, insn_index, src1)) { maybe_append_empty_rex(&code, host_src1, host_dest, -1); } append_insn_ModRM_ctx(&code, false, X86OP_MOVZX_Gv_Eb, host_dest, ctx, insn_index, src1); break; } else if (insn->src_imm == 0xFFFF) { const X86Register host_dest = ctx->regs[dest].host_reg; append_insn_ModRM_ctx(&code, false, X86OP_MOVZX_Gv_Ew, host_dest, ctx, insn_index, src1); break; } /* Fall through to common ALU-immediate handling. */ case RTLOP_ADDI: case RTLOP_ORI: case RTLOP_XORI: { const X86Register host_dest = ctx->regs[dest].host_reg; /* The immediate value is actually signed, but we treat it as * unsigned here to simplify range testing. */ const uint32_t imm = (uint32_t)insn->src_imm; const bool is64 = int_type_is_64(unit->regs[src1].type); const X86ImmOpcode opcode = ( insn->opcode == RTLOP_ADDI ? X86OP_IMM_ADD : insn->opcode == RTLOP_ANDI ? X86OP_IMM_AND : insn->opcode == RTLOP_ORI ? X86OP_IMM_OR : /* RTLOP_XORI */ X86OP_IMM_XOR); append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); if (imm + 128 < 256) { append_insn_ModRM_reg(&code, is64, X86OP_IMM_Ev_Ib, opcode, host_dest); append_imm8(&code, (uint8_t)imm); } else { append_insn_ModRM_reg(&code, is64, X86OP_IMM_Ev_Iz, opcode, host_dest); append_imm32(&code, imm); } if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } break; } // case RTLOP_{ADDI,ANDI,ORI,XORI} case RTLOP_MULI: { const X86Register host_dest = ctx->regs[dest].host_reg; const uint32_t imm = (uint32_t)insn->src_imm; // As for ADDI etc. 
const bool is64 = int_type_is_64(unit->regs[src1].type); if (imm + 128 < 256) { append_insn_ModRM_ctx(&code, is64, X86OP_IMUL_Gv_Ev_Ib, host_dest, ctx, insn_index, src1); append_imm8(&code, (uint8_t)imm); } else { append_insn_ModRM_ctx(&code, is64, X86OP_IMUL_Gv_Ev_Iz, host_dest, ctx, insn_index, src1); append_imm32(&code, imm); } ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; break; } // case RTLOP_MULI case RTLOP_SLLI: case RTLOP_SRLI: case RTLOP_SRAI: case RTLOP_RORI: { const X86Register host_dest = ctx->regs[dest].host_reg; const uint8_t shift_count = (uint8_t)insn->src_imm; const bool is64 = int_type_is_64(unit->regs[src1].type); const X86ShiftOpcode opcode = ( insn->opcode == RTLOP_SLLI ? X86OP_SHIFT_SHL : insn->opcode == RTLOP_SRLI ? X86OP_SHIFT_SHR : insn->opcode == RTLOP_SRAI ? X86OP_SHIFT_SAR : /* RTLOP_RORI */ X86OP_SHIFT_ROR); append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, is64, X86OP_SHIFT_Ev_Ib, opcode, host_dest); append_imm8(&code, shift_count); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { if ((shift_count & 0xFF) != 0) { ctx->last_test_reg = dest; } else { ctx->last_test_reg = 0; } ctx->last_cmp_reg = 0; } break; } // case RTLOP_{SLLI,SRLI,SRAI,RORI} case RTLOP_SEQI: case RTLOP_SLTUI: case RTLOP_SLTSI: case RTLOP_SGTUI: case RTLOP_SGTSI: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Opcode set_opcode = ( insn->opcode == RTLOP_SLTUI ? X86OP_SETB : insn->opcode == RTLOP_SLTSI ? X86OP_SETL : insn->opcode == RTLOP_SGTUI ? (insn->src_imm == 0 ? X86OP_SETNZ : X86OP_SETA) : insn->opcode == RTLOP_SGTSI ? X86OP_SETG : /* RTLOP_SEQI */ X86OP_SETZ); /* See comments in the SEQ/SLTU/SLTS/SGTU/SGTS case for why we * conditionally use XOR to clear dest. 
*/ const bool should_clear_dest = (is_spilled(ctx, insn_index, src1) || host_dest != ctx->regs[src1].host_reg); const bool added_compare = append_compare( ctx, insn_index, &code, src1, 0, insn->src_imm, 0, set_opcode==X86OP_SETZ || set_opcode==X86OP_SETNZ, false, should_clear_dest ? (int)host_dest : -1); const bool cleared_dest = should_clear_dest && added_compare; maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, set_opcode, 0, host_dest); if (!cleared_dest) { maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, X86OP_MOVZX_Gv_Eb, host_dest, host_dest); } break; } // case RTLOP_{SEQI,SLTUI,SLTSI,SGTUI,SGTSI} case RTLOP_BITCAST: { const X86Register host_dest = ctx->regs[dest].host_reg; switch (unit->regs[src1].type) { case RTLTYPE_INT32: if (is_spilled(ctx, insn_index, src1)) { append_load(&code, RTLTYPE_FLOAT32, host_dest, X86_SP, -1, ctx->regs[src1].spill_offset); } else { append_insn_ModRM_ctx(&code, false, X86OP_MOVD_V_E, host_dest, ctx, insn_index, src1); } break; case RTLTYPE_INT64: if (is_spilled(ctx, insn_index, src1)) { append_load(&code, RTLTYPE_FLOAT64, host_dest, X86_SP, -1, ctx->regs[src1].spill_offset); } else { append_insn_ModRM_ctx(&code, true, X86OP_MOVD_V_E, host_dest, ctx, insn_index, src1); } break; case RTLTYPE_FLOAT32: if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, RTLTYPE_INT32, host_dest, X86_SP, ctx->regs[src1].spill_offset); } else { append_insn_ModRM_reg(&code, false, X86OP_MOVD_E_V, ctx->regs[src1].host_reg, host_dest); } break; case RTLTYPE_FLOAT64: if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, RTLTYPE_INT64, host_dest, X86_SP, ctx->regs[src1].spill_offset); } else { append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, ctx->regs[src1].host_reg, host_dest); } break; default: log_error(handle, "Invalid data type %s in BITCAST at %d", rtl_type_name(unit->regs[src1].type), insn_index); } break; } // case RTLOP_BITCAST case RTLOP_FCVT: { 
const X86Register host_dest = ctx->regs[dest].host_reg; if (unit->regs[dest].type == RTLTYPE_FLOAT64) { ASSERT(unit->regs[src1].type == RTLTYPE_FLOAT32); append_insn_ModRM_ctx(&code, false, X86OP_CVTSS2SD, host_dest, ctx, insn_index, src1); } else { ASSERT(unit->regs[dest].type == RTLTYPE_FLOAT32); ASSERT(unit->regs[src1].type == RTLTYPE_FLOAT64); append_insn_ModRM_ctx(&code, false, X86OP_CVTSD2SS, host_dest, ctx, insn_index, src1); } break; } // case RTLOP_FCVT case RTLOP_FZCAST: handle->code_len = code.len; if (!translate_fzcast(ctx, insn_index)) { return false; } code.buffer = handle->code_buffer; code.buffer_size = handle->code_buffer_size; code.len = handle->code_len; initial_len = code.len; // Suppress output length check. break; case RTLOP_FSCAST: { const X86Register host_dest = ctx->regs[dest].host_reg; const bool is64 = int_type_is_64(unit->regs[src1].type); const X86Opcode opcode = (unit->regs[dest].type == RTLTYPE_FLOAT64 ? X86OP_CVTSI2SD : X86OP_CVTSI2SS); append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src1); break; } // case RTLOP_FSCAST case RTLOP_FROUNDI: { const X86Register host_dest = ctx->regs[dest].host_reg; const bool is64 = int_type_is_64(unit->regs[dest].type); const X86Opcode opcode = (unit->regs[src1].type == RTLTYPE_FLOAT64 ? X86OP_CVTSD2SI : X86OP_CVTSS2SI); append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src1); break; } // case RTLOP_FROUNDI case RTLOP_FTRUNCI: { const X86Register host_dest = ctx->regs[dest].host_reg; const bool is64 = int_type_is_64(unit->regs[dest].type); const X86Opcode opcode = (unit->regs[src1].type == RTLTYPE_FLOAT64 ? X86OP_CVTTSD2SI : X86OP_CVTTSS2SI); append_insn_ModRM_ctx(&code, is64, opcode, host_dest, ctx, insn_index, src1); break; } // case RTLOP_FTRUNCI case RTLOP_FNEG: case RTLOP_FABS: case RTLOP_FNABS: { const X86Register host_dest = ctx->regs[dest].host_reg; const bool isvec = rtl_register_is_vector(&unit->regs[dest]); const RTLDataType base_type = isvec ? 
rtl_vector_element_type(unit->regs[dest].type) : unit->regs[dest].type; const bool is64 = (base_type == RTLTYPE_FLOAT64); const X86Opcode opcode = ( insn->opcode == RTLOP_FNEG ? X86OP_XORPS : insn->opcode == RTLOP_FABS ? X86OP_ANDPS : /* RTLOP_FNABS */ X86OP_ORPS); const int lc_id = (insn->opcode == RTLOP_FABS ? (isvec ? (is64 ? LC_V2_FLOAT64_INV_SIGNBIT : LC_V2_FLOAT32_INV_SIGNBIT) : (is64 ? LC_FLOAT64_INV_SIGNBIT : LC_FLOAT32_INV_SIGNBIT)) : (isvec ? (is64 ? LC_V2_FLOAT64_SIGNBIT : LC_V2_FLOAT32_SIGNBIT) : (is64 ? LC_FLOAT64_SIGNBIT : LC_FLOAT32_SIGNBIT))); const long lc_offset = ctx->const_loc[lc_id]; ASSERT(lc_offset); append_move_or_load(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_riprel(&code, false, opcode, host_dest, lc_offset); break; } // case RTLOP_FNEG, RTLOP_FABS, RTLOP_FNABS case RTLOP_FADD: case RTLOP_FSUB: case RTLOP_FMUL: case RTLOP_FDIV: { const X86Register host_dest = ctx->regs[dest].host_reg; X86Register host_src2 = ctx->regs[src2].host_reg; bool src2_loaded = !is_spilled(ctx, insn_index, src2); const X86Opcode base_opcode = ( insn->opcode==RTLOP_FADD ? X86OP_ADDPS : insn->opcode==RTLOP_FSUB ? X86OP_SUBPS : insn->opcode==RTLOP_FMUL ? X86OP_MULPS : /* RTLOP_FDIV */ X86OP_DIVPS); const uint8_t prefix = sse_opcode_prefix_for_type(unit->regs[dest].type); if (insn->opcode == RTLOP_FDIV && unit->regs[dest].type == RTLTYPE_V2_FLOAT32) { /* If we do a DIVPS directly on the register values, the * two high elements of the XMM vector will trigger * invalid-operation exceptions since they're always zero * for V2_FLOAT32. To avoid this, we copy src2 into a * temporary register and insert 1.0f in the two high * elements, then divide by that temporary register * instead of the original src2. This also conveniently * leaves zeroes in the high words of the output. 
*/ ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; append_move_or_load(&code, ctx, unit, insn_index, host_temp, src2); const long lc_offset = ctx->const_loc[LC_V2_FLOAT32_HIGH_ONES]; ASSERT(lc_offset); append_insn_ModRM_riprel(&code, false, X86OP_ORPS, host_temp, lc_offset); host_src2 = host_temp; src2_loaded = true; } if (host_dest == host_src2 && src2_loaded) { if (prefix) { append_imm8(&code, prefix); } append_insn_ModRM_ctx(&code, false, base_opcode, host_dest, ctx, insn_index, src1); } else { append_move_or_load(&code, ctx, unit, insn_index, host_dest, src1); if (prefix) { append_imm8(&code, prefix); } /* We can't use append_insn_ModRM_ctx() because src2 might * be in a different register due to the FDIV hack. */ if (src2_loaded) { append_insn_ModRM_reg(&code, false, base_opcode, host_dest, host_src2); } else { append_insn_ModRM_mem( &code, false, base_opcode, host_dest, X86_SP, -1, ctx->regs[src2].spill_offset); } } break; } // case RTLOP_FADD, RTLOP_FSUB, RTLOP_FMUL, RTLOP_FDIV case RTLOP_FSQRT: { const X86Register host_dest = ctx->regs[dest].host_reg; const uint8_t prefix = sse_opcode_prefix_for_type(unit->regs[dest].type); if (prefix) { append_imm8(&code, prefix); } append_insn_ModRM_ctx(&code, false, X86OP_SQRTPS, host_dest, ctx, insn_index, src1); break; } // case RTLOP_FSQRT case RTLOP_FCMP: { const X86Register host_dest = ctx->regs[dest].host_reg; RTLFloatCompare cmpsel = insn->fcmp & 7; /* LT/LE are converted to GT/GE during the first pass. */ ASSERT(cmpsel != RTLFCMP_LT && cmpsel != RTLFCMP_LE); const bool do_compare = !(ctx->last_cmp_reg == src1 && ctx->last_cmp_target == src2); const bool invert = (insn->fcmp & RTLFCMP_INVERT) != 0; /* Reload src1 (if necessary) before any XOR to increase code * parallelism. 
*/ X86Register host_src1; if (do_compare) { if (!is_spilled(ctx, insn_index, src1)) { host_src1 = ctx->regs[src1].host_reg; } else { ASSERT(ctx->regs[dest].temp_allocated); host_src1 = ctx->regs[dest].host_temp; append_load(&code, unit->regs[src1].type, host_src1, X86_SP, -1, ctx->regs[src1].spill_offset); } } /* For EQ, we need to set dest to a default value since the * test requires two steps (P=0 and Z=1). XOR reg,reg is * faster than MOV reg,imm for this, but if using XOR we have * to do it before the test so as not to clobber the test * result. As long as we're at it, we also use XOR here for * non-EQ tests if the compare won't be omitted, since * XOR+SETcc is faster than SETcc+MOVZX. */ bool dest_initted = false; if (do_compare && !(cmpsel == RTLFCMP_EQ && invert)) { append_insn_ModRM_reg(&code, false, X86OP_XOR_Gv_Ev, host_dest, host_dest); dest_initted = true; } if (do_compare) { const bool is64 = (unit->regs[src1].type == RTLTYPE_FLOAT64); const bool ordered = (insn->fcmp & RTLFCMP_ORDERED) != 0; const X86Opcode cmp_opcode = is64 ? (ordered ? X86OP_COMISD : X86OP_UCOMISD) : (ordered ? X86OP_COMISS : X86OP_UCOMISS); append_insn_ModRM_ctx(&code, false, cmp_opcode, host_src1, ctx, insn_index, src2); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = 0; ctx->last_cmp_reg = src1; ctx->last_cmp_target = src2; } } if (cmpsel == RTLFCMP_EQ) { if (!dest_initted) { append_insn_R(&code, false, X86OP_MOV_rAX_Iv, host_dest); append_imm32(&code, invert ? 1 : 0); } const int jump_disp = (host_dest >= X86_SP ? 4 : 3); append_jump_raw(&code, X86OP_JP_Jb, jump_disp); const long jump_from = code.len; const X86Opcode set_opcode = invert ? X86OP_SETNZ : X86OP_SETZ; maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, set_opcode, 0, host_dest); const long jump_to = code.len; ASSERT(jump_to - jump_from == jump_disp); } else { const X86Opcode set_opcode = (cmpsel==RTLFCMP_GT ? (invert ? 
X86OP_SETBE : X86OP_SETA) : cmpsel==RTLFCMP_GE ? (invert ? X86OP_SETB : X86OP_SETAE) : /*RTLFCMP_UN*/ (invert ? X86OP_SETNP : X86OP_SETP)); maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, set_opcode, 0, host_dest); if (!dest_initted) { maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, X86OP_MOVZX_Gv_Eb, host_dest, host_dest); } } break; } // case RTLOP_FCMP case RTLOP_FMADD: translate_fma(&code, ctx, insn_index, X86OP_VFMADD132PS, false, false); break; case RTLOP_FMSUB: translate_fma(&code, ctx, insn_index, X86OP_VFMSUB132PS, true, false); break; case RTLOP_FNMADD: translate_fma(&code, ctx, insn_index, X86OP_VFNMADD132PS, false, true); break; case RTLOP_FNMSUB: translate_fma(&code, ctx, insn_index, X86OP_VFNMSUB132PS, true, true); break; case RTLOP_FGETSTATE: append_insn_ModRM_mem( &code, false, X86OP_MISC_0FAE, X86OP_MISC0FAE_STMXCSR, X86_SP, -1, ctx->stack_mxcsr); append_insn_ModRM_mem( &code, false, X86OP_MOV_Gv_Ev, ctx->regs[dest].host_reg, X86_SP, -1, ctx->stack_mxcsr); break; case RTLOP_FSETSTATE: if (is_spilled(ctx, insn_index, src1)) { append_insn_ModRM_mem( &code, false, X86OP_MISC_0FAE, X86OP_MISC0FAE_LDMXCSR, X86_SP, -1, ctx->regs[src1].spill_offset); } else { append_store(&code, RTLTYPE_INT32, ctx->regs[src1].host_reg, X86_SP, -1, ctx->stack_mxcsr); append_insn_ModRM_mem( &code, false, X86OP_MISC_0FAE, X86OP_MISC0FAE_LDMXCSR, X86_SP, -1, ctx->stack_mxcsr); } break; case RTLOP_FTESTEXC: { const X86Register host_dest = ctx->regs[dest].host_reg; const X86Register host_src1 = ctx->regs[src1].host_reg; const uint8_t bits = rtlfexc_to_bits(insn->src_imm); if (UNLIKELY(!bits)) { log_error(handle, "Invalid FP exception %d in FTESTEXC at %d", (int)insn->src_imm, insn_index); break; } if (bits == 0x01) { /* In this case, we can just AND the value with 1. 
*/ append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Ib, X86OP_IMM_AND, host_dest); append_imm8(&code, 1); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } } else { bool cleared_dest = false; if (is_spilled(ctx, insn_index, src1) || host_dest != ctx->regs[src1].host_reg) { append_insn_ModRM_reg(&code, false, X86OP_XOR_Gv_Ev, host_dest, host_dest); cleared_dest = true; } if (is_spilled(ctx, insn_index, src1)) { append_insn_ModRM_mem(&code, false, X86OP_UNARY_Eb, X86OP_UNARY_TEST, X86_SP, -1, ctx->regs[src1].spill_offset); } else { maybe_append_empty_rex(&code, host_src1, -1, -1); append_insn_ModRM_reg(&code, false, X86OP_UNARY_Eb, X86OP_UNARY_TEST, host_src1); } append_imm8(&code, bits); ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; ctx->last_cmp_target = 0; ctx->last_cmp_imm = 0; maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, X86OP_SETNZ, 0, host_dest); if (!cleared_dest) { maybe_append_empty_rex(&code, host_dest, -1, -1); append_insn_ModRM_reg(&code, false, X86OP_MOVZX_Gv_Eb, host_dest, host_dest); } } break; } // case RTLOP_FTESTEXC case RTLOP_FCLEAREXC: { const X86Register host_dest = ctx->regs[dest].host_reg; append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Ib, X86OP_IMM_AND, host_dest); append_imm8(&code, -64); break; } // case RTLOP_FCLEAREXC case RTLOP_FSETROUND: { const X86Register host_dest = ctx->regs[dest].host_reg; append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_dest); append_imm32(&code, 0x9FFF); if (insn->src_imm != RTLFROUND_NEAREST) { append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_OR, host_dest); append_imm32( &code, ((const uint8_t[]){0,3,1,2})[insn->src_imm & 3] << 13); } break; } // case 
RTLOP_FSETROUND case RTLOP_FCOPYROUND: { const X86Register host_dest = ctx->regs[dest].host_reg; ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; if (!is_spilled(ctx, insn_index, src2) && host_dest == ctx->regs[src2].host_reg) { append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_dest); append_imm32(&code, 0x6000); append_move_or_load_gpr(&code, ctx, unit, insn_index, host_temp, src1); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_temp); append_imm32(&code, 0x9FFF); } else { append_move_or_load_gpr(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_dest); append_imm32(&code, 0x9FFF); append_move_or_load_gpr(&code, ctx, unit, insn_index, host_temp, src2); append_insn_ModRM_reg(&code, false, X86OP_IMM_Ev_Iz, X86OP_IMM_AND, host_temp); append_imm32(&code, 0x6000); } append_insn_ModRM_reg(&code, false, X86OP_OR_Gv_Ev, host_dest, host_temp); break; } // case RTLOP_FCOPYROUND case RTLOP_VBUILD2: { const X86Register host_dest = ctx->regs[dest].host_reg; append_move_or_load(&code, ctx, unit, insn_index, host_dest, src1); if (unit->regs[dest].type == RTLTYPE_V2_FLOAT32) { X86Register host_src2; if (is_spilled(ctx, insn_index, src2)) { ASSERT(ctx->regs[dest].temp_allocated); host_src2 = ctx->regs[dest].host_temp; append_load(&code, RTLTYPE_FLOAT32, host_src2, X86_SP, -1, ctx->regs[src2].spill_offset); } else { host_src2 = ctx->regs[src2].host_reg; } append_insn_ModRM_reg(&code, false, X86OP_UNPCKLPS, host_dest, host_src2); /* Ensure that the high half of the register is clear, * so that later floating-point operations don't raise * unnecessary exceptions. 
*/ append_insn_ModRM_reg(&code, false, X86OP_MOVQ_V_W, host_dest, host_dest); } else { ASSERT(unit->regs[dest].type == RTLTYPE_V2_FLOAT64); append_insn_ModRM_ctx(&code, false, X86OP_MOVHPS_V_M, host_dest, ctx, insn_index, src2); } break; } // case RTLOP_VBUILD2 case RTLOP_VBROADCAST: { const X86Register host_dest = ctx->regs[dest].host_reg; if (unit->regs[src1].source == RTLREG_CONSTANT && unit->regs[src1].value.i64 == 0) { append_insn_ModRM_reg(&code, false, X86OP_XORPS, host_dest, host_dest); } else if (unit->regs[dest].type == RTLTYPE_V2_FLOAT32) { append_move_or_load(&code, ctx, unit, insn_index, host_dest, src1); append_insn_ModRM_reg(&code, false, X86OP_UNPCKLPS, host_dest, host_dest); /* If we loaded src1 from memory, the rest of the register * will have been cleared, so we don't need to manually * clear the high half here. */ if (!is_spilled(ctx, insn_index, src1)) { append_insn_ModRM_reg(&code, false, X86OP_MOVQ_V_W, host_dest, host_dest); } } else { ASSERT(unit->regs[dest].type == RTLTYPE_V2_FLOAT64); append_insn_ModRM_ctx(&code, false, X86OP_MOVDDUP, host_dest, ctx, insn_index, src1); } break; } // case RTLOP_VBROADCAST case RTLOP_VEXTRACT: { const X86Register host_dest = ctx->regs[dest].host_reg; if (insn->elem == 0) { /* Can't use append_move_or_load() since we change the type. 
*/ const RTLDataType element_type = rtl_vector_element_type(unit->regs[src1].type); if (is_spilled(ctx, insn_index, src1)) { append_load(&code, element_type, host_dest, X86_SP, -1, ctx->regs[src1].spill_offset); } else if (ctx->regs[src1].host_reg != host_dest) { append_move(&code, element_type, host_dest, ctx->regs[src1].host_reg); } } else if (unit->regs[src1].type == RTLTYPE_V2_FLOAT32) { if (is_spilled(ctx, insn_index, src1)) { append_load(&code, RTLTYPE_FLOAT32, host_dest, X86_SP, -1, ctx->regs[src1].spill_offset + 4); } else { if (ctx->regs[src1].host_reg != host_dest) { append_move(&code, RTLTYPE_V2_FLOAT32, host_dest, ctx->regs[src1].host_reg); } append_insn_ModRM_reg(&code, false, X86OP_PSHIFTQ_U_I, X86OP_PSHIFT_SRLDQ, host_dest); append_imm8(&code, 4); } } else { ASSERT(unit->regs[src1].type == RTLTYPE_V2_FLOAT64); /* We can't just call append_insn_ModRM_ctx() unconditionally * with MOVLPS (a.k.a. MOVHLPS) because that reads from the * second doubleword of an XMM register but the _first_ * doubleword at a memory address. */ if (is_spilled(ctx, insn_index, src1)) { append_load(&code, RTLTYPE_FLOAT64, host_dest, X86_SP, -1, ctx->regs[src1].spill_offset + 8); } else { append_insn_ModRM_reg(&code, false, X86OP_MOVHLPS, host_dest, ctx->regs[src1].host_reg); } } break; } // case RTLOP_VEXTRACT case RTLOP_VINSERT: { const X86Register host_dest = ctx->regs[dest].host_reg; append_move_or_load(&code, ctx, unit, insn_index, host_dest, src1); if (unit->regs[dest].type == RTLTYPE_V2_FLOAT32) { X86Register host_src2; if (is_spilled(ctx, insn_index, src2)) { ASSERT(ctx->regs[dest].temp_allocated); host_src2 = ctx->regs[dest].host_temp; append_load(&code, RTLTYPE_FLOAT32, host_src2, X86_SP, -1, ctx->regs[src2].spill_offset); } else { host_src2 = ctx->regs[src2].host_reg; } if (insn->elem == 0) { /* PSLLQ instead of PSLLDQ to keep the high half of the * register clear. 
*/ append_insn_ModRM_reg(&code, false, X86OP_PSHIFTQ_U_I, X86OP_PSHIFT_SLL, host_dest); append_imm8(&code, 32); append_insn_ModRM_reg( &code, false, X86OP_MOVSS_V_W, host_dest, host_src2); } else { /* UNPCKLPS pushes the second element of the old vector * into the high half of the register, so we need an * extra MOVQ to clear it even if src2 was reloaded * (and therefore has a zero second word). */ append_insn_ModRM_reg(&code, false, X86OP_UNPCKLPS, host_dest, host_src2); append_insn_ModRM_reg(&code, false, X86OP_MOVQ_V_W, host_dest, host_dest); } } else { ASSERT(unit->regs[dest].type == RTLTYPE_V2_FLOAT64); if (insn->elem == 0) { if (is_spilled(ctx, insn_index, src2)) { append_insn_ModRM_mem( &code, false, X86OP_MOVLPS_V_M, host_dest, X86_SP, -1, ctx->regs[src2].spill_offset); } else { append_insn_ModRM_reg( &code, false, X86OP_MOVSD_V_W, host_dest, ctx->regs[src2].host_reg); } } else { append_insn_ModRM_ctx(&code, false, X86OP_MOVHPS_V_M, host_dest, ctx, insn_index, src2); } } break; } // case RTLOP_VINSERT case RTLOP_VFCVT: { const X86Register host_dest = ctx->regs[dest].host_reg; if (unit->regs[dest].type == RTLTYPE_V2_FLOAT64) { ASSERT(unit->regs[src1].type == RTLTYPE_V2_FLOAT32); append_insn_ModRM_ctx(&code, false, X86OP_CVTPS2PD, host_dest, ctx, insn_index, src1); } else { ASSERT(unit->regs[dest].type == RTLTYPE_V2_FLOAT32); ASSERT(unit->regs[src1].type == RTLTYPE_V2_FLOAT64); append_insn_ModRM_ctx(&code, false, X86OP_CVTPD2PS, host_dest, ctx, insn_index, src1); } break; } // case RTLOP_VFCVT case RTLOP_VFCMP: { // FIXME: This is mostly a hack to speed up vector NaN checking. // Need to implement other comparison types and/or find a better // way to do this. ASSERT(insn->fcmp == RTLFCMP_UN); const X86Register host_dest = ctx->regs[dest].host_reg; ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; const bool is64 = (unit->regs[src1].type == RTLTYPE_V2_FLOAT64); const X86Opcode cmp_opcode = is64 ? 
X86OP_CMPPD : X86OP_CMPPS; append_move_or_load(&code, ctx, unit, insn_index, host_temp, src1); append_insn_ModRM_ctx(&code, false, cmp_opcode, host_temp, ctx, insn_index, src2); append_imm8(&code, X86XMMCMP_UNORD); if (is64) { append_insn_ModRM_reg(&code, false, X86OP_PSHIFTQ_U_I, X86OP_PSHIFT_SRLDQ, host_temp); append_imm8(&code, 4); } append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, host_temp, host_dest); break; } // case RTLOP_VFCMP case RTLOP_LOAD_IMM: { const uint64_t imm = insn->src_imm; const X86Register host_dest = ctx->regs[dest].host_reg; switch (unit->regs[dest].type) { case RTLTYPE_FLOAT32: if (imm == 0) { append_insn_ModRM_reg(&code, false, X86OP_XORPS, host_dest, host_dest); } else { ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; append_insn_R(&code, false, X86OP_MOV_rAX_Iv, host_temp); append_imm32(&code, (uint32_t)imm); append_insn_ModRM_reg(&code, false, X86OP_MOVD_V_E, host_dest, host_temp); } break; case RTLTYPE_FLOAT64: if (imm == 0) { append_insn_ModRM_reg(&code, false, X86OP_XORPS, host_dest, host_dest); } else { ASSERT(ctx->regs[dest].temp_allocated); const X86Register host_temp = ctx->regs[dest].host_temp; append_insn_R(&code, true, X86OP_MOV_rAX_Iv, host_temp); append_imm64(&code, imm); append_insn_ModRM_reg(&code, true, X86OP_MOVD_V_E, host_dest, host_temp); } break; default: ASSERT(rtl_type_is_int(unit->regs[dest].type)); if (imm == 0) { append_insn_ModRM_reg(&code, false, X86OP_XOR_Gv_Ev, host_dest, host_dest); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } } else { append_load_imm_gpr(&code, host_dest, imm); } break; } break; } // case RTLOP_LOAD_IMM case RTLOP_LOAD_ARG: { const X86Register host_dest = ctx->regs[dest].host_reg; const int host_src_i = host_x86_int_arg_register(ctx, insn->arg_index); ASSERT(host_src_i >= 0); // Checked during register allocation. 
const X86Register host_src = (X86Register)host_src_i; if (host_dest != host_src) { const bool is64 = int_type_is_64(unit->regs[dest].type); append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, host_src); } break; } // case RTLOP_LOAD_ARG case RTLOP_LOAD: { const HostX86RegInfo * const dest_info = &ctx->regs[dest]; const X86Register host_dest = dest_info->host_reg; const X86Register host_temp = (dest_info->temp_allocated ? dest_info->host_temp : host_dest); X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, host_temp, &host_base, &host_index); RTLDataType load_type = unit->regs[dest].type; if (load_type == RTLTYPE_V2_FLOAT32) { load_type = RTLTYPE_FLOAT64; // Only load 8 bytes. } const int32_t offset = insn->host_data_32 ? (int32_t)insn->host_data_32 : insn->offset; append_load(&code, load_type, host_dest, host_base, host_index, offset); break; } // case RTLOP_LOAD case RTLOP_LOAD_U8: case RTLOP_LOAD_S8: { const HostX86RegInfo * const dest_info = &ctx->regs[dest]; const X86Register host_dest = dest_info->host_reg; const X86Register host_temp = (dest_info->temp_allocated ? dest_info->host_temp : host_dest); X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, host_temp, &host_base, &host_index); const int32_t offset = insn->host_data_32 ? (int32_t)insn->host_data_32 : insn->offset; const X86Opcode opcode = (insn->opcode == RTLOP_LOAD_U8 ? X86OP_MOVZX_Gv_Eb : X86OP_MOVSX_Gv_Eb); append_insn_ModRM_mem(&code, false, opcode, host_dest, host_base, host_index, offset); break; } // case RTLOP_LOAD_U8, RTLOP_LOAD_S8 case RTLOP_LOAD_U16: case RTLOP_LOAD_S16: { const HostX86RegInfo * const dest_info = &ctx->regs[dest]; const X86Register host_dest = dest_info->host_reg; const X86Register host_temp = (dest_info->temp_allocated ? 
dest_info->host_temp : host_dest); X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, host_temp, &host_base, &host_index); const int32_t offset = insn->host_data_32 ? (int32_t)insn->host_data_32 : insn->offset; const X86Opcode opcode = (insn->opcode == RTLOP_LOAD_U16 ? X86OP_MOVZX_Gv_Ew : X86OP_MOVSX_Gv_Ew); append_insn_ModRM_mem(&code, false, opcode, host_dest, host_base, host_index, offset); break; } // case RTLOP_LOAD_U16, RTLOP_LOAD_S16 case RTLOP_STORE: { X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, X86_R15, &host_base, &host_index); const int32_t offset = (int32_t)insn->host_data_32; const RTLRegister * const src2_reg = &unit->regs[src2]; RTLDataType type = src2_reg->type; if (src2_reg->source == RTLREG_CONSTANT && !src2_reg->live) { /* Constant store optimized to memory-immediate operation. */ const bool is64 = int_type_is_64(type); append_insn_ModRM_mem(&code, is64, X86OP_MOV_Ev_Iz, 0, host_base, host_index, offset); append_imm32(&code, (uint32_t)src2_reg->value.i64); } else { X86Register host_value = ctx->regs[src2].host_reg; if (is_spilled(ctx, insn_index, src2)) { /* src3 is our value temporary (see register allocation). * For plain stores, if we run out of GPRs we'll just use * XMM15 instead, so there's no collision with the base * or index register. */ host_value = insn->src3; ASSERT(host_value != host_base && (int)host_value != host_index); if (host_value >= X86_XMM0 && rtl_type_is_int(type)) { type = (int_type_is_64(type) ? RTLTYPE_FLOAT64 : RTLTYPE_FLOAT32); } append_load(&code, type, host_value, X86_SP, -1, ctx->regs[src2].spill_offset); } const RTLDataType store_type = (type == RTLTYPE_V2_FLOAT32) ? 
RTLTYPE_FLOAT64 : type; append_store(&code, store_type, host_value, host_base, host_index, offset); } break; } // case RTLOP_STORE case RTLOP_STORE_I8: { X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, X86_R15, &host_base, &host_index); const int32_t offset = (int32_t)insn->host_data_32; const RTLRegister * const src2_reg = &unit->regs[src2]; if (src2_reg->source == RTLREG_CONSTANT && !src2_reg->live) { /* Constant store optimized to memory-immediate operation. */ append_insn_ModRM_mem(&code, false, X86OP_MOV_Eb_Ib, 0, host_base, host_index, offset); append_imm8(&code, (uint8_t)src2_reg->value.i64); } else { X86Register host_value; const bool saved_ax = reload_store_source_gpr( &code, ctx, insn_index, &host_base, &host_index, &host_value); maybe_append_empty_rex( &code, host_value, host_base, host_index); append_insn_ModRM_mem(&code, false, X86OP_MOV_Eb_Gb, host_value, host_base, host_index, offset); if (saved_ax) { append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, X86_XMM15, X86_AX); } } break; } // case RTLOP_STORE_I8 case RTLOP_STORE_I16: { X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, X86_R15, &host_base, &host_index); const int32_t offset = (int32_t)insn->host_data_32; const RTLRegister * const src2_reg = &unit->regs[src2]; if (src2_reg->source == RTLREG_CONSTANT && !src2_reg->live) { /* Constant store optimized to memory-immediate operation. 
*/ append_opcode(&code, X86OP_OPERAND_SIZE); append_insn_ModRM_mem(&code, false, X86OP_MOV_Ev_Iz, 0, host_base, host_index, offset); append_imm16(&code, (uint16_t)src2_reg->value.i64); } else { X86Register host_value; const bool saved_ax = reload_store_source_gpr( &code, ctx, insn_index, &host_base, &host_index, &host_value); append_opcode(&code, X86OP_OPERAND_SIZE); append_insn_ModRM_mem(&code, false, X86OP_MOV_Ev_Gv, host_value, host_base, host_index, offset); if (saved_ax) { append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, X86_XMM15, X86_AX); } } break; } // case RTLOP_STORE_I16 case RTLOP_LOAD_BR: { const HostX86RegInfo * const dest_info = &ctx->regs[dest]; const X86Register host_dest = dest_info->host_reg; const X86Register host_temp = (dest_info->temp_allocated ? dest_info->host_temp : host_dest); X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, host_temp, &host_base, &host_index); const int32_t offset = insn->host_data_32 ? (int32_t)insn->host_data_32 : insn->offset; switch (unit->regs[dest].type) { case RTLTYPE_INT32: case RTLTYPE_INT64: case RTLTYPE_ADDRESS: { const bool is64 = int_type_is_64(unit->regs[dest].type); if (handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) { append_insn_ModRM_mem(&code, is64, X86OP_MOVBE_Gy_My, host_dest, host_base, host_index, offset); } else { append_insn_ModRM_mem(&code, is64, X86OP_MOV_Gv_Ev, host_dest, host_base, host_index, offset); append_insn_R(&code, is64, X86OP_BSWAP_rAX, host_dest); } break; } // case RTLTYPE_{INT32,INT64,ADDRESS} case RTLTYPE_FLOAT32: case RTLTYPE_FLOAT64: { const bool is64 = (unit->regs[dest].type == RTLTYPE_FLOAT64); if (handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) { append_insn_ModRM_mem(&code, is64, X86OP_MOVBE_Gy_My, host_temp, host_base, host_index, offset); } else { append_insn_ModRM_mem(&code, is64, X86OP_MOV_Gv_Ev, host_temp, host_base, host_index, offset); append_insn_R(&code, is64, X86OP_BSWAP_rAX, host_temp); } 
append_insn_ModRM_reg(&code, is64, X86OP_MOVD_V_E, host_dest, host_temp); break; } // case RTLTYPE_{FLOAT32,FLOAT64} default: log_error(handle, "Invalid data type %s in LOAD_BR", rtl_type_name(unit->regs[dest].type)); } break; } // case RTLOP_LOAD_BR case RTLOP_LOAD_U16_BR: case RTLOP_LOAD_S16_BR: { const HostX86RegInfo * const dest_info = &ctx->regs[dest]; const X86Register host_dest = dest_info->host_reg; const X86Register host_temp = (dest_info->temp_allocated ? dest_info->host_temp : host_dest); X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, host_temp, &host_base, &host_index); const int32_t offset = insn->host_data_32 ? (int32_t)insn->host_data_32 : insn->offset; if (handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) { append_insn_ModRM_mem(&code, false, X86OP_MOVBE_Gw_Mw, host_dest, host_base, host_index, offset); } else { /* MOVZX instead of plain MOV (which would leave the high * bits of the destination register unchanged) to avoid a * false dependency on the previous value of the register. */ append_insn_ModRM_mem(&code, false, X86OP_MOVZX_Gv_Ew, host_dest, host_base, host_index, offset); /* rorw $8,%reg would be slightly more compact, but that * incurs both a rotate penalty and a partial register * stall when subsequently using the full 32-bit register. * The byte-XCHG idiom (e.g. XCHG AH,AL) seems similarly * likely to cause a partial register stall, and we could * only use it with AX through DX anyway, so we don't do * that either. Modern processors should all support * MOVBE anyway. */ append_insn_R(&code, false, X86OP_BSWAP_rAX, host_dest); append_insn_ModRM_reg(&code, false, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHR, host_dest); append_imm8(&code, 16); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } if (insn->opcode == RTLOP_LOAD_U16_BR) { break; // Already zero-extended. } } const X86Opcode opcode = (insn->opcode == RTLOP_LOAD_U16_BR ? 
X86OP_MOVZX_Gv_Ew : X86OP_MOVSX_Gv_Ew); append_insn_ModRM_reg(&code, false, opcode, host_dest, host_dest); break; } // case RTLOP_LOAD_U16_BR, RTLOP_LOAD_S16_BR case RTLOP_STORE_BR: { X86Register host_base; int host_index; reload_base_and_index(&code, ctx, insn_index, X86_R15, &host_base, &host_index); const int32_t offset = (int32_t)insn->host_data_32; X86Register host_value; const bool saved_ax = reload_store_source_gpr( &code, ctx, insn_index, &host_base, &host_index, &host_value); switch (unit->regs[src2].type) { case RTLTYPE_INT32: case RTLTYPE_INT64: case RTLTYPE_ADDRESS: { const bool is64 = int_type_is_64(unit->regs[src2].type); if (handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) { append_insn_ModRM_mem(&code, is64, X86OP_MOVBE_My_Gy, host_value, host_base, host_index, offset); } else { append_insn_R(&code, is64, X86OP_BSWAP_rAX, host_value); append_insn_ModRM_mem(&code, is64, X86OP_MOV_Ev_Gv, host_value, host_base, host_index, offset); if (!is_spilled(ctx, insn_index, src2) && host_value == ctx->regs[src2].host_reg && unit->regs[src2].death > insn_index) { append_insn_R(&code, is64, X86OP_BSWAP_rAX, host_value); } } break; } // case RTLTYPE_{INT32,INT64,ADDRESS} case RTLTYPE_FLOAT32: case RTLTYPE_FLOAT64: { const bool is64 = (unit->regs[src2].type == RTLTYPE_FLOAT64); if (handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) { append_insn_ModRM_mem(&code, is64, X86OP_MOVBE_My_Gy, host_value, host_base, host_index, offset); } else { append_insn_R(&code, is64, X86OP_BSWAP_rAX, host_value); append_insn_ModRM_mem(&code, is64, X86OP_MOV_Ev_Gv, host_value, host_base, host_index, offset); } break; } // case RTLTYPE_{FLOAT32,FLOAT64} default: log_error(handle, "Invalid data type %s in STORE_BR", rtl_type_name(unit->regs[src2].type)); } if (saved_ax) { append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, X86_XMM15, X86_AX); } break; } // case RTLOP_STORE_BR case RTLOP_STORE_I16_BR: { X86Register host_base; int host_index; reload_base_and_index(&code, ctx, 
insn_index, X86_R15, &host_base, &host_index); const int32_t offset = (int32_t)insn->host_data_32; X86Register host_value; const bool saved_ax = reload_store_source_gpr( &code, ctx, insn_index, &host_base, &host_index, &host_value); if (handle->setup.host_features & BINREC_FEATURE_X86_MOVBE) { append_insn_ModRM_mem(&code, false, X86OP_MOVBE_Mw_Gw, host_value, host_base, host_index, offset); } else if (is_spilled(ctx, insn_index, src2) || host_value != ctx->regs[src2].host_reg || unit->regs[src2].death <= insn_index) { append_insn_R(&code, false, X86OP_BSWAP_rAX, host_value); append_insn_ModRM_reg(&code, false, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_SHR, host_value); append_imm8(&code, 16); append_opcode(&code, X86OP_OPERAND_SIZE); append_insn_ModRM_mem(&code, false, X86OP_MOV_Ev_Gv, host_value, host_base, host_index, offset); /* We can't treat this as a test of the register because * there might be data in the high 16 bits. */ ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; } else { append_opcode(&code, X86OP_OPERAND_SIZE); append_insn_ModRM_reg(&code, false, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_ROR, host_value); append_imm8(&code, 8); append_opcode(&code, X86OP_OPERAND_SIZE); append_insn_ModRM_mem(&code, false, X86OP_MOV_Ev_Gv, host_value, host_base, host_index, offset); append_opcode(&code, X86OP_OPERAND_SIZE); append_insn_ModRM_reg(&code, false, X86OP_SHIFT_Ev_Ib, X86OP_SHIFT_ROR, host_value); append_imm8(&code, 8); ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; } if (saved_ax) { append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, X86_XMM15, X86_AX); } break; } // case RTLOP_STORE_I16_BR case RTLOP_ATOMIC_INC: { const X86Register host_dest = ctx->regs[dest].host_reg; X86Register host_base; int host_index; /* A temporary register is only allocated if there are any * spill reloads, but if there are no reloads then the fallback * register isn't used anyway, so it's safe to pass host_temp * without checking temp_allocated. 
*/ reload_base_and_index(&code, ctx, insn_index, ctx->regs[dest].host_temp, &host_base, &host_index); const bool is64 = int_type_is_64(unit->regs[dest].type); append_insn_R(&code, false, X86OP_MOV_rAX_Iv, host_dest); append_imm32(&code, 1); append_opcode(&code, X86OP_LOCK); append_insn_ModRM_mem( &code, is64, X86OP_XADD_Ev_Gv, host_dest, host_base, host_index, insn->host_data_32); if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = dest; ctx->last_cmp_reg = 0; } break; } // case RTLOP_ATOMIC_INC case RTLOP_CMPXCHG: { const X86Register host_dest = ctx->regs[dest].host_reg; ASSERT(host_dest != X86_AX); X86Register host_src1 = ctx->regs[src1].host_reg; X86Register host_src3 = ctx->regs[insn->src3].host_reg; X86Register host_temp; int temp_index = 0; const bool is64 = int_type_is_64(unit->regs[dest].type); /* If we have a temporary, we need to save RAX. However, we * want to save the allocated temporary (a GPR) for a CMPXCHG * operand, so we save RAX to R15, or to XMM15 if R15 was * allocated as the temporary. */ if (ctx->regs[dest].temp_allocated) { if (ctx->regs[dest].host_temp == X86_R15) { append_insn_ModRM_reg(&code, true, X86OP_MOVD_V_E, X86_XMM15, X86_AX); } else { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_R15, X86_AX); } host_temp = ctx->regs[dest].host_temp; } else { /* RAX is free (or already in use by an operand), so our * temporary is R15. */ host_temp = X86_R15; } /* Reload src1 and src3, if needed. */ if (is_spilled(ctx, insn_index, src1)) { append_load_gpr(&code, RTLTYPE_ADDRESS, host_temp, X86_SP, ctx->regs[src1].spill_offset); host_src1 = host_temp; host_temp = host_dest; /* Make sure we're not about to overwrite src2 in case src3 * is also spilled (the register allocator guarantees this). 
*/ ASSERT(!(!is_spilled(ctx, insn_index, src2) && host_dest == ctx->regs[src2].host_reg)); temp_index++; } if (is_spilled(ctx, insn_index, insn->src3)) { append_load_gpr(&code, unit->regs[insn->src3].type, host_temp, X86_SP, ctx->regs[insn->src3].spill_offset); host_src3 = host_temp; temp_index++; } /* If we have an index register and it's spilled, "reload" it * by adding it to the address register and subtracting it * again afterward. This is obviously slow, but it should * probably be uncommon since address generation for atomic * operations is normally fairly localized. */ int host_index = -1; bool index_spilled = false; if (insn->host_data_16) { HostX86RegInfo *index_info = &ctx->regs[insn->host_data_16]; if (is_spilled(ctx, insn_index, insn->host_data_16)) { log_warning(handle, "Slow reload of spilled CMPXCHG" " index register"); index_spilled = true; ASSERT(unit->regs[insn->host_data_16].type == RTLTYPE_ADDRESS); append_insn_ModRM_mem( &code, true, X86OP_ADD_Gv_Ev, host_src1, X86_SP, -1, index_info->spill_offset); } else { host_index = index_info->host_reg; } } /* If we have an index register in RAX and both src1 and src3 * were spilled, add the index to the reloaded src1 so it's not * in the way of src2. Since host_src1 is a temporary in this * case, we don't have to worry about restoring its old value * later. We don't try to use the rAX save register because * (1) it won't exist if the index dies on this instruction * and (2) we can't use it as an index if it's XMM15. */ if (host_index == X86_AX && temp_index == 2) { ASSERT(is_spilled(ctx, insn_index, src1)); append_insn_ModRM_reg(&code, true, X86OP_ADD_Gv_Ev, host_src1, host_index); host_index = -1; } /* Load src2 (the compare value) into rAX. If any other * operand is in rAX, save it in the destination register; * note that we don't need to restore it from dest later, * since if it's live past this instruction, it will already * have been saved in (and be restored from) R15 or XMM15. 
* We also don't have to worry about clobbering anything * that's already in dest, since the register allocator avoids * reusing the register of any unspilled input operand. */ if (host_src1 == X86_AX || host_src3 == X86_AX || host_index == X86_AX) { ASSERT(temp_index < 2); /* If we saved RAX to R15 above, this MOV is technically * unnecessary, but the logic to use R15 in that specific * case (which will probably be fairly rare) is more * complex than it's worth, especially since MOVs are * potentially zero-latency. Likewise, we don't try to * omit this MOV if src2 is also in rAX (due to being the * same RTL register as src1 or src3). */ const bool is64_ax = (is64 || host_src1 == X86_AX || host_index == X86_AX); append_insn_ModRM_reg(&code, is64_ax, X86OP_MOV_Gv_Ev, host_dest, X86_AX); if (host_src1 == X86_AX) { host_src1 = host_dest; } if (host_src3 == X86_AX) { host_src3 = host_dest; } if (host_index == X86_AX) { host_index = host_dest; } } append_move_or_load_gpr(&code, ctx, unit, insn_index, X86_AX, src2); /* Do the actual compare-and-swap. */ append_opcode(&code, X86OP_LOCK); append_insn_ModRM_mem( &code, is64, X86OP_CMPXCHG_Ev_Gv, host_src3, host_src1, host_index, insn->host_data_32); /* Undo the ADD from index reloading if necessary. */ if (index_spilled) { append_insn_ModRM_mem( &code, true, X86OP_SUB_Gv_Ev, host_src1, X86_SP, -1, ctx->regs[insn->host_data_16].spill_offset); } /* Move the result to the destination. The instruction * description says that the value of the compare target is * only written to the result register (rAX) if the compare * fails, but if the compare succeeds, rAX already has the * correct value, so we can use it unconditionally. */ append_insn_ModRM_reg(&code, is64, X86OP_MOV_Gv_Ev, host_dest, X86_AX); /* Restore RAX if necessary. 
*/ if (ctx->regs[dest].temp_allocated) { if (ctx->regs[dest].host_temp == X86_R15) { append_insn_ModRM_reg(&code, true, X86OP_MOVD_E_V, X86_XMM15, X86_AX); } else { append_insn_ModRM_reg(&code, true, X86OP_MOV_Gv_Ev, X86_AX, X86_R15); } } if (handle->host_opt & BINREC_OPT_H_X86_CONDITION_CODES) { ctx->last_test_reg = 0; ctx->last_cmp_reg = dest; ctx->last_cmp_target = src2; ctx->last_cmp_imm = 0; } break; } // case RTLOP_CMPXCHG case RTLOP_LABEL: ASSERT(insn->label > 0); ASSERT(insn->label < unit->next_label); ASSERT(ctx->label_offsets[insn->label] < 0); if (handle->host_opt & BINREC_OPT_H_X86_BRANCH_ALIGNMENT) { /* * Intel's documentation recommends aligning all branch * targets to a multiple of 16 byte so that the instruction * decoder (which fetches aligned 16-byte blocks) can read * as many instructions as possible. However, we have to * balance that with the fact that all NOPs which appear * in the actual code path have to be decoded and executed * just like other instructions. So we use the following * heuristic to decide whether to align a label: * * - If the label follows an unconditional branch, so that * execution never falls into the block, always align * the label since there's no penalty (other than * increased code size) for doing so. * * - Otherwise, if the label is the target of a backward * branch, align it if there are less than 10 bytes left * in the current 16-byte line, since backward branches * generally indicate loops and we can thus expect them * to be reached more often by branching than by falling * through. * * - Otherwise, only align the label if there are less * than 5 bytes left in the current 16-byte line. 
*/ bool follows_uncond = false; if (block->prev_block >= 0) { const RTLBlock *prev_block = &unit->blocks[block->prev_block]; const RTLInsn *prev_insn = &unit->insns[prev_block->last_insn]; follows_uncond = (prev_insn->opcode == RTLOP_GOTO || prev_insn->opcode == RTLOP_RETURN); } const int align_distance = (16 - code.len) & 15; bool should_align; if (follows_uncond) { should_align = true; } else if (insn->host_data_16) { /* host_data_16 is set to nonzero if the label is * targeted by a backward branch. */ should_align = (align_distance < 10); } else { should_align = (align_distance < 5); } if (should_align) { append_nops(&code, align_distance); ASSERT((code.len & 15) == 0); } } ctx->label_offsets[insn->label] = code.len; break; case RTLOP_GOTO: ASSERT(insn->label > 0); ASSERT(insn->label < unit->next_label); ASSERT(block_info->unresolved_branch_offset < 0); ASSERT(unit->label_blockmap[insn->label] >= 0); if (!reload_regs_for_block(&code, ctx, block_index, unit->label_blockmap[insn->label])) { return false; } initial_len = code.len; // Don't include setup in length check. append_jump(&code, block_info, X86OP_JMP_Jb, X86OP_JMP_Jz, insn->label, ctx->label_offsets[insn->label]); fall_through = false; break; case RTLOP_GOTO_IF_Z: case RTLOP_GOTO_IF_NZ: { ASSERT(insn->label > 0); ASSERT(insn->label < unit->next_label); ASSERT(block_info->unresolved_branch_offset < 0); const int target_block = unit->label_blockmap[insn->label]; ASSERT(target_block >= 0); uint8_t jump_condition; if (insn->host_data_16) { jump_condition = insn->host_data_16 & 0xF; } else { jump_condition = (insn->opcode == RTLOP_GOTO_IF_Z ? 
X86CC_Z : X86CC_NZ); } const X86Opcode short_opcode = X86OP_Jcc_Jb | jump_condition; const X86Opcode long_opcode = X86OP_Jcc_Jz | jump_condition; if (insn->host_data_16 & 0x40) { // FTESTEXC ASSERT(insn->host_data_16 & 0x10); if (!is_spilled(ctx, insn_index, src1)) { maybe_append_empty_rex( &code, ctx->regs[src1].host_reg, -1, -1); } append_insn_ModRM_ctx( &code, false, X86OP_UNARY_Eb, X86OP_UNARY_TEST, ctx, insn_index, src1); append_imm8(&code, rtlfexc_to_bits(insn->host_data_32)); ctx->last_test_reg = 0; ctx->last_cmp_reg = 0; } else { append_compare(ctx, insn_index, &code, src1, src2, insn->host_data_32, (insn->host_data_16 >> 8) & 0x1F, (jump_condition & 0xE) == X86CC_Z, (insn->host_data_16 & 0x20) != 0, -1); } /* If we have any aliases or spills to reload that would * conflict with live registers, we have to invert the sense of * the branch here and set up the registers conditionally. */ if (check_reload_conflicts(ctx, block_index, insn_index)) { uint8_t reload_buffer[RELOAD_REGS_SIZE]; CodeBuffer reload_code = {.buffer = reload_buffer, .buffer_size = sizeof(reload_buffer), .len = 0}; ASSERT(reload_regs_for_block(&reload_code, ctx, block_index, target_block)); /* Write this jump as though the next one (to the target * label) will have a 32-bit displacement. If it ends up * having an 8-bit displacement, we'll fix up this * instruction afterward. */ const long reload_jump = code.len; /* Flipping the low bit of the opcode will invert the sense * of the branch. */ const int jump_disp = reload_code.len + 5; const int jump_opcode = (jump_disp < 128 ? 
short_opcode ^ 1 : long_opcode ^ 1); append_jump_raw(&code, jump_opcode, jump_disp); const long reload_start = code.len; const long needed_space = reload_code.len + 5; if (UNLIKELY(code.len + needed_space > code.buffer_size)) { handle->code_len = code.len; if (UNLIKELY(!binrec_ensure_code_space( handle, needed_space))) { log_error(handle, "No memory for alias conflict" " resolution code"); return false; } code.buffer = handle->code_buffer; code.buffer_size = handle->code_buffer_size; } memcpy(&code.buffer[code.len], reload_code.buffer, reload_code.len); code.len += reload_code.len; const long final_jump = code.len; append_jump(&code, block_info, X86OP_JMP_Jb, X86OP_JMP_Jz, insn->label, ctx->label_offsets[insn->label]); if (code.len == final_jump + 2) { /* In order for the initial (conditional) jump over the * setup code to have a 32-bit displacement, the setup * code must have been at least 123 bytes long. But in * that case, the displacement for the final jump will * be -6 (for the initial jump) - 123 - at least 2 for * this jump, which is less than -128 so it can't be * encoded in one byte. Thus, if the final jump has an * 8-bit displacement, the initial jump must also have * had an 8-bit displacement. */ ASSERT(reload_start == reload_jump + 2); code.buffer[reload_start - 1] -= 3; } initial_len = code.len; // Suppress output length check. } else { if (!reload_regs_for_block(&code, ctx, block_index, target_block)) { return false; } initial_len = code.len; // Don't include setup in length check. append_jump(&code, block_info, short_opcode, long_opcode, insn->label, ctx->label_offsets[insn->label]); } break; } // case RTLOP_GOTO_IF_Z, RTLOP_GOTO_IF_NZ case RTLOP_CALL: case RTLOP_CALL_TRANSPARENT: handle->code_len = code.len; if (!translate_call(ctx, block_index, insn_index)) { return false; } code.buffer = handle->code_buffer; code.buffer_size = handle->code_buffer_size; code.len = handle->code_len; initial_len = code.len; // Suppress output length check. 
break; case RTLOP_RETURN: ASSERT(block_info->unresolved_branch_offset < 0); if (src1) { append_move_or_load_gpr(&code, ctx, unit, insn_index, X86_AX, src1); } /* If this instruction terminates the last block in the unit, * we don't need an explicit jump to the epilogue. */ ASSERT(insn_index == block->last_insn); if (block->next_block >= 0) { /* We use label 0 (normally invalid) to indicate a jump to * the function epilogue. */ append_jump(&code, block_info, X86OP_JMP_Jb, X86OP_JMP_Jz, 0, -1); } fall_through = false; break; case RTLOP_CHAIN: handle->code_len = code.len; if (!translate_chain(ctx, insn_index)) { return false; } code.buffer = handle->code_buffer; code.buffer_size = handle->code_buffer_size; code.len = handle->code_len; initial_len = code.len; // Suppress output length check. break; case RTLOP_CHAIN_RESOLVE: translate_chain_resolve(ctx, &code, insn_index); break; case RTLOP_ILLEGAL: append_opcode(&code, X86OP_UD2); break; } // switch (insn->opcode) ASSERT(code.len - initial_len <= MAX_INSN_LEN); } if (fall_through && block->next_block >= 0) { if (!reload_regs_for_block(&code, ctx, block_index, block->next_block)) { return false; } } handle->code_len = code.len; return true; } /*************************************************************************/ /************************* Other local routines **************************/ /*************************************************************************/ /** * resolve_branches: Resolve forward branches in the generated code. * * When generating a forward branch, we don't yet know what the offset to * the target instruction will be, so we use this function to fill it in * after code generation is complete. * * [Parameters] * ctx: Translation context. 
*/ static void resolve_branches(HostX86Context *ctx) { ASSERT(ctx); ASSERT(ctx->handle); for (int i = 0; i >= 0; i = ctx->unit->blocks[i].next_block) { HostX86BlockInfo *block_info = &ctx->blocks[i]; const long branch_offset = block_info->unresolved_branch_offset; if (branch_offset >= 0) { const int label = block_info->unresolved_branch_target; ASSERT(label >= 0 && label < ctx->unit->next_label); ASSERT(ctx->label_offsets[label] >= 0); int64_t offset = ctx->label_offsets[label] - branch_offset; ASSERT(offset > 0); // Or else it would have been resolved. ASSERT(offset < INT64_C(0x80000000)); // Sanity check. uint8_t *ptr = &ctx->handle->code_buffer[branch_offset]; offset -= 4; ptr[0] = (uint8_t)(offset >> 0); ptr[1] = (uint8_t)(offset >> 8); ptr[2] = (uint8_t)(offset >> 16); ptr[3] = (uint8_t)(offset >> 24); } } } /*-----------------------------------------------------------------------*/ /** * translate_unit: Translate the RTLUnit associated with the given * translation context. * * [Parameters] * ctx: Translation context. * [Return value] * True on success, false if out of memory. */ static bool translate_unit(HostX86Context *ctx) { ASSERT(ctx); ASSERT(ctx->unit); const RTLUnit * const unit = ctx->unit; /* Push all stack frame offsets forward by the callee reserve amount * (if any) so the low-level translation logic doesn't have to worry * about it. */ if (ctx->frame_callee_reserve > 0) { for (int reg_index = 1; reg_index < unit->next_reg; reg_index++) { if (ctx->regs[reg_index].spilled) { ctx->regs[reg_index].spill_offset += ctx->frame_callee_reserve; } } for (int reg = 0; reg < 32; reg++) { if (ctx->stack_callsave[reg] >= 0) { ctx->stack_callsave[reg] += ctx->frame_callee_reserve; } } /* Currently, we always allocate a frame slot for MXCSR when * translating a call-type instruction. 
*/ ASSERT(ctx->stack_mxcsr >= 0); ctx->stack_mxcsr += ctx->frame_callee_reserve; } if (!append_prologue(ctx)) { return false; } memset(ctx->reg_map, 0, sizeof(ctx->reg_map)); for (int i = 0; i >= 0; i = unit->blocks[i].next_block) { if (!translate_block(ctx, i)) { return false; } } if (!append_epilogue(ctx, true)) { return false; } resolve_branches(ctx); return true; } /*-----------------------------------------------------------------------*/ /** * destroy_context: Free all resources used by the given context. The * context is assumed to have been initialized. * * [Parameters] * ctx: Context to clear. */ static void destroy_context(HostX86Context *ctx) { ASSERT(ctx); ASSERT(ctx->handle); binrec_free(ctx->handle, ctx->blocks); binrec_free(ctx->handle, ctx->regs); binrec_free(ctx->handle, ctx->label_offsets); binrec_free(ctx->handle, ctx->alias_buffer); } /*-----------------------------------------------------------------------*/ /** * init_context: Set up the given context for translation. * * [Parameters] * ctx: Context to initialize. * handle: Translation handle. * unit: RTLUnit to be translated. * [Return value] * True on success, false on error. 
*/ static bool init_context(HostX86Context *ctx, binrec_t *handle, RTLUnit *unit) { ASSERT(ctx); memset(ctx, 0, sizeof(*ctx)); ctx->handle = handle; ctx->unit = unit; ctx->blocks = binrec_malloc( handle, sizeof(*ctx->blocks) * unit->num_blocks); ctx->regs = binrec_malloc(handle, sizeof(*ctx->regs) * unit->next_reg); ctx->label_offsets = binrec_malloc( handle, sizeof(*ctx->label_offsets) * unit->next_label); ctx->alias_buffer = binrec_malloc( handle, ((4 * unit->next_alias) * unit->num_blocks)); if (!ctx->blocks || !ctx->regs || !ctx->label_offsets || !ctx->alias_buffer) { log_error(handle, "No memory for output translation context"); destroy_context(ctx); return false; } memset(ctx->blocks, 0, sizeof(*ctx->blocks) * unit->num_blocks); for (int i = 0; i < unit->num_blocks; i++) { ctx->blocks[i].unresolved_branch_offset = -1; } memset(ctx->regs, 0, sizeof(*ctx->regs) * unit->next_reg); memset(ctx->label_offsets, -1, sizeof(*ctx->label_offsets) * unit->next_label); memset(ctx->alias_buffer, 0, ((4 * unit->next_alias) * unit->num_blocks)); memset(ctx->stack_callsave, -1, sizeof(ctx->stack_callsave)); ctx->stack_mxcsr = -1; return true; } /*************************************************************************/ /************************ Translation entry point ************************/ /*************************************************************************/ bool host_x86_translate(binrec_t *handle, struct RTLUnit *unit) { ASSERT(handle); ASSERT(unit); if (!unit->num_blocks || !unit->num_insns) { log_error(handle, "No code to translate"); goto error_return; } HostX86Context ctx; if (!init_context(&ctx, handle, unit)) { goto error_return; } if (!host_x86_allocate_registers(&ctx)) { goto error_destroy_context; } if (!translate_unit(&ctx)) { log_error(handle, "Out of memory while generating code"); goto error_destroy_context; } destroy_context(&ctx); return true; error_destroy_context: destroy_context(&ctx); error_return: return false; } 
/*************************************************************************/ /*************************************************************************/