/* * libbinrec: a recompiling translator for machine code * Copyright (c) 2016 Andrew Church * * This software may be copied and redistributed under certain conditions; * see the file "COPYING" in the source code distribution for details. * NO WARRANTY is provided with this software. */ #ifndef BINREC_H #define BINREC_H #include #include #ifdef __cplusplus extern "C" { #endif /* * General notes * ============= * * In this header (and the library source code in general), "guest" refers * to the source CPU or architecture, i.e. the input to the translator, * and "host" or "native" refers to the target CPU or architecture, i.e. * the output of the translator. (In a few function names, such as * binrec_native_arch(), "native" is used to refer to the runtime * environment when considered as a translator output architecture.) * A program which calls into code generated by this library is referred * to as a "client program". * * The numeric values of constants and the layout of structures in this * header are _not_ part of the public API; always use the symbolic names * rather than the numeric values or offsets when writing code which * interfaces to the library. The library ABI (including those values and * offsets) will be kept consistent through revisions of a major version, * such that a program compiled against library version x.y (x >= 1) will * also run correctly using library version x.z (if z < y, features such * as optimization flags only defined later than version x.z may be * ignored, but the generated code will behave in a compatible manner). * However, the ABI may change between major versions, or at any time * before version 1.0. * * * Guest-specific notes * ==================== * * The abbreviation "PSB" below refers to the processor state block, i.e. * the structure containing the current state of the guest processor. * * libbinrec is designed to allow a certain amount of flexibility in the * structure of the PSB. 
The binrec_setup_t structure includes * "state_offsets_*" sub-structures for specifying the byte offset of * various members of the PSB; the client program should fill in the * structure appropriate to the selected guest architecture with the * relevant offsets within its PSB structure. * * PowerPC 32-bit architectures (BINREC_ARCH_PPC_7XX) * -------------------------------------------------- * * Floating-point registers in the PSB are assumed to be stored as pairs * of 64-bit floating point ("double"-type) values, with the first value * of each pair holding the value of that register as a scalar or the ps0 * slot of the register for paired-single mode, and the second value of * each pair holding the value of the ps1 slot for paired-single mode. * The translated code will take care of converting between single and * double precision as needed. Note that the FPR array must be aligned to * a multiple of 16 bytes to avoid crashes due to misaligned accesses. * * Full floating-point exception handling bloats the translated code * significantly; in particular, paired-single arithmetic instructions * require several hundred host instructions each to correctly identify * all possible combinations of exception conditions, though most of * those instructions will be skipped for any particular execution of the * operation. Client programs should enable as many of the floating-point * optimization flags as possible for best performance; even those marked * UNSAFE are in fact safe for the vast majority of real-world code. In * particular, if the guest code does not read FPSCR or set any of the * FPSCR exception enable bits, enabling BINREC_OPT_G_PPC_NO_FPSCR_STATE * can improve performance of floating-point code by a factor of 10 or * more. If necessary, specific optimizations can be safely disabled for * individual blocks of guest code; changing optimization flags for one * translation unit will have no effect on the behavior of any other * translated code. 
* * The FEX and VX bits in FPSCR are not written to the copy of FPSCR * stored in the PSB, but are instead generated when needed by a * floating-point instruction with Rc=1 or the mffs or mcrfs instructions. * (This mimics the implementation of the bits on PowerPC CPUs: they have * no associated physical storage, and instead are hardwired to the * appropriate function of other FPSCR bits.) Client code which needs to * check the state of FPSCR[FEX] or FPSCR[VX] should manually compute them * based on the relevant exception and mask bits. * * Translated code assumes that the host's floating-point rounding mode is * set based on FPSCR[RN] and all host floating-point exception flags are * clear on entry. The code will maintain these invariants on all * outbound control transfers, so a client program which does not perform * any floating-point operations or modify FPSCR on its own only needs to * set host floating-point state once, before first calling translated code. * * The FPSCR[FR] bit is not set by any floating-point instructions, though * it can be written as usual by instructions which directly manipulate * FPSCR. * * The overflow and underflow exception enable bits (OE and UE) in FPSCR * are ignored; floating-point operations are performed as if both * exceptions are masked (FPSCR[OE]=0 and FPSCR[UE]=0). However, * FPSCR[FEX] will still reflect the state of the OE and UE bits actually * stored in FPSCR, so (for example) an Rc=1 instruction that generates an * overflow exception with FPSCR[OE]=1 will set cr1.FEX to 1. * * The "non-IEEE" (NI) flag in FPSCR is ignored; floating-point operations * will always be performed in full precision. 
* * libbinrec implements the implementation-specific behavior of 32-bit * PowerPC processors (at least the 750CL) that single-precision * floating-point instructions accept double-precision operands, including * the quirk that a double-precision frC operand to a single-precision * multiply or multiply-add instruction is rounded to 24 bits (this latter * behavior can be suppressed with the BINREC_OPT_G_PPC_FAST_FMULS * optimization flag). * * The conditional load/store instructions (lwarx and stwcx.) rely on * hardware support for their behavior. Since such hardware support is * not necessarily available in the host environment, and since a correct * implementation requires knowledge of that environment which is not * available to libbinrec, these instructions are implemented using a * compare-and-swap heuristic: an stwcx. after a lwarx succeeds if the * value at the target address is unchanged from the value it had when the * lwarx was executed. Translated code for lwarx and stwcx makes use of * two PSB fields: reserve_flag (an 8-bit Boolean value), which records * whether an lwarx is pending, and reserve_state (a 32-bit integer value), * which stores the value loaded by the most recent lwarx. This can result * in incorrect behavior if the guest code expects the store to fail due to * writes of an identical value or writes a new value using a regular store * between the conditional load and store, but this heuristic should * suffice for typical programs. Note that the address of the lwarx is * _not_ saved; a stwcx. to a different address will still succeed if that * address contains the same value as was loaded by the lwarx instruction. * (This matches the behavior of actual PowerPC CPUs in the sense that * address mismatches between lwarx and stwcx. are ignored.) 
* * Access to the time base registers via the mftb instruction is * implemented by calling a host-side callback function, a pointer to * which should be stored in the PSB at the offset indicated by * state_offsets_ppc.timebase_handler. The signature of the function is * "uint64_t handler(PSB *)", taking the pointer to the PSB which was * passed to the translated code and returning the current 64-bit time * base value. If the function pointer is NULL, reads of the time base * registers will always return zero. * * Exceptions generated by the system call (sc) and trap (tw/twi) * instructions are implemented as control transfers to host-side * functions, pointers to which should be stored in the PSB at the offsets * indicated by state_offsets_ppc.sc_handler and .trap_handler * respectively. The signature of the functions is: * void sc_handler(PSB *, uint32_t insn); * void trap_handler(PSB *); * taking the pointer to the PSB which was passed to the translated code * and, for the sc handler, the instruction word which caused the call. * The value of the NIA field in the PSB is set as the SRR0 register would * be set on a true PowerPC processor: to the address of the trap * instruction for trap exceptions, and to the address of the instruction * _following_ the sc instruction for system call exceptions (but see also * the BINREC_OPT_G_PPC_SC_BLR optimization flag). The translated code * will return immediately to its caller when the handler returns, and the * call to the handler may in fact be translated as a tail call. The * translated code does not check for NULL function pointers, so it will * crash if an exception occurs and the associated function pointer is * not set. * * All instruction words with the primary opcode of the sc instruction * (0x11) are decoded as that instruction. 
This deviates from the PowerPC * specification, in which only the instruction 0x4400_0002 is a valid sc * instruction, but is done to allow the use of that instruction as (for * example) a callout to native code in a PowerPC system emulator in which * the specific function to call is encoded in the instruction word. If * this behavior is not desired, the system call handler can simply treat * any instruction word other than 0x4400_0002 as an illegal instruction. * * libbinrec ignores the icbi instruction, except in that execution will * always return to the caller after icbi. If the guest program generates * and executes new (guest) code on the fly, the libbinrec client should * check after each block of translated code returns whether the * instruction immediately preceding NIA is an icbi instruction, and take * appropriate action if so. * * If a D-form (immediate offset) load or store instruction has an offset * which causes the final address to wrap around the 32-bit address space, * such as lwz rD,16(rA) where the value of rA is 0xFFFF_FFF0 or greater, * the access will improperly "leak" outside the guest memory region. * However, accesses to the top 32k of memory using single-register (not * lmw/stmw) D-form instructions with rA = 0 are handled correctly. * * Some obscure hardware quirks are not emulated by the translated code; * see the list of expected failures in tests/guest-ppc/exec/750cl-common.i * for details. * * * Host-specific notes * =================== * * Intel/AMD x86 64-bit architecture (BINREC_ARCH_X86_64_*) * -------------------------------------------------------- * * Translated code assumes support for all instruction set extensions * through SSE3. (More specifically, the following CPUID feature bits are * assumed to be set: CMOV, SSE, SSE2, and SSE3.) Use of later extensions * can be enabled by setting appropriate feature bits (BINREC_FEATURE_X86_*) * in the host_features field of binrec_setup_t. 
* * Translated code must be located at a 16-byte-aligned address for correct * behavior. If the code is not correctly aligned, certain floating-point * operations may raise exceptions (specifically the general-protection * exception, "#GP" as described in Intel documentation). libbinrec will * always request 16-byte alignment if a code_malloc() callback is supplied. * * Translated code maintains the host stack at 16-byte alignment, as * required by both System V and Windows ABIs. If the client program calls * translated code with a misaligned stack, floating-point code may raise * exceptions due to unaligned stack accesses. * * Loads of 64-bit floating point ("double") 2-element vectors must be * 16-byte aligned, or an exception will be raised. Other data types can * be loaded from any alignment, though values not aligned to a multiple of * the value size may take additional CPU cycles to load. * * Full (non-rounding) support for fused multiply-add operations is only * implemented for CPUs which support the FMA3 instruction set. If the * BINREC_FEATURE_X86_FMA feature flag is not set in the setup structure * passed to binrec_create_handle(), fused multiply-add operations will be * translated to separate multiply and add instructions, which will cause * the intermediate result to be rounded; this in turn may cause * floating-point exceptions to be incorrectly raised in certain edge cases * (such as HUGE_VAL * HUGE_VAL - inf, in which the multiplication rounds * to infinity and the resulting subtraction of infinities triggers an * exception). * * The prohibition on tail calls in the Windows SEH ABI also prevents the * use of dynamic chaining, so calling binrec_enable_chaining() has no * effect when the host architecture is BINREC_ARCH_X86_64_WINDOWS_SEH. * If a logging callback function is provided, a warning to this effect * will be emitted if binrec_enable_chaining() is called with a true value * for the enable parameter.
*/ /*************************************************************************/ /*********************** Data types and constants ************************/ /*************************************************************************/ /*----------------------------- Basic types -----------------------------*/ /** * binrec_t: Type of a translation handle. This handle stores global * translation settings, such as optimization flags and functions to use * for memory allocation. */ typedef struct binrec_t binrec_t; /** * binrec_arch_t: Enumeration of architectures and variants supported by * the library. All currently supported architectures are either * guest-only or host-only; see the inline comments at each enumerator. * * As a general rule, libbinrec assumes that its input is a program * designed to run on the selected guest architecture, and therefore all * instructions encountered in the program will be valid instruction * encodings. Consequently, this enumeration only includes coarse * architecture families which encompass a group of compatible processors; * for example, the PPC_7XX architecture covers all PowerPC CPUs through * the 750CL, and the input program is assumed to use only instructions * which are valid on the architecture it was written for. See also the * note on library limitations in the README file. */ typedef enum binrec_arch_t { /* Constant used by binrec_native_arch() to indicate an unsupported * architecture. */ BINREC_ARCH_INVALID = 0, /* PowerPC 32-bit architecture as implemented in PowerPC 7xx * processors, including all other instruction set extensions through * the PowerPC 750CL. Also supports programs written for PowerPC 6xx * CPUs, with the exception of non-PowerPC instructions (such as ABS) * specific to the PowerPC 601. */ BINREC_ARCH_PPC_7XX, // Guest only. /* Intel/AMD x86 64-bit architecture, using the SysV ABI. */ BINREC_ARCH_X86_64_SYSV, // Host only. /* Intel/AMD x86 64-bit architecture, using the Windows ABI.
*/ BINREC_ARCH_X86_64_WINDOWS, // Host only. /* Variant of BINREC_ARCH_X86_64_WINDOWS which prepends unwind * information to the returned function. The offset to the generated * code is stored as a 64-bit value at the returned code address, and * the unwind information is found immediately after that value. See * the host-specific notes at the top of this file for caveats when * using this architecture variant (in particular, dynamic chaining is * not available: binrec_enable_chaining() has no effect). */ BINREC_ARCH_X86_64_WINDOWS_SEH, // Host only. } binrec_arch_t; /** * binrec_loglevel_t: Enumeration of log levels which can be passed to * the log function specified in binrec_setup_t. */ typedef enum binrec_loglevel_t { BINREC_LOGLEVEL_INFO, // Informational messages. BINREC_LOGLEVEL_WARNING, // Messages indicating a potential problem. BINREC_LOGLEVEL_ERROR, // Messages indicating failure of some operation. } binrec_loglevel_t; /*--------------------- Architecture feature flags ----------------------*/ /* * These flags indicate the presence of specific features (such as optional * instructions) within a particular architecture. These are used in the * "host_features" field of binrec_setup_t. */ /** * BINREC_FEATURE_X86_*: Feature flags for the x86 architecture. Translated * code always assumes support for instruction set extensions through SSE3 * (CMOV, SSE, SSE2, and SSE3); these flags enable the use of later * extensions. */ #define BINREC_FEATURE_X86_FMA (1U << 0) // FMA3 only (FMA4 unsupported). #define BINREC_FEATURE_X86_MOVBE (1U << 1) #define BINREC_FEATURE_X86_LZCNT (1U << 2) // Also known as ABM. #define BINREC_FEATURE_X86_BMI1 (1U << 3) #define BINREC_FEATURE_X86_BMI2 (1U << 4) /*-------------------------- Setup structures ---------------------------*/ /** * binrec_setup_ppc_t: Structure which defines processor state block * offsets for PowerPC guests. Contained in binrec_setup_t. * * Each block of registers is assumed to be contiguous; for example, GPR 1 * is accessed by loading a 32-bit value from gpr + 4. All multi-byte * values are assumed to be stored in host endian order.
*/ typedef struct binrec_setup_ppc_t { /* General-purpose registers (32 * uint32_t) */ int gpr; /* Floating-point registers (32 * double[2]); the array must be aligned * to a multiple of 16 bytes */ int fpr; /* Paired-single quantization registers (8 * uint32_t) */ int gqr; /* Miscellaneous registers (each uint32_t) */ int lr; int ctr; int cr; int xer; int fpscr; int pvr; // Processor Version Register (SPR 287) int pir; // Processor Identification Register (SPR 1023) /* lwarx/stwcx. reservation flag (uint8_t) */ int reserve_flag; /* lwarx/stwcx. reservation state (uint32_t) */ int reserve_state; /* Next instruction address (updated on return from translated code) */ int nia; /* Pointer to function to handle time base reads (mftb instruction). * If the pointer is NULL, time base reads return zero. Signature: * uint64_t timebase_handler(void *state) */ int timebase_handler; /* Pointer to function to handle system calls (sc instruction). * Signature: void sc_handler(void *state, uint32_t insn) * where insn is the instruction word which caused the call. The * pointer must be set if the guest code can execute sc; translated * code does not check it for NULL. * NOTE(review): this comment formerly gave the signature as * "void *sc_handler(void *state)" returning the state block pointer, * which contradicts the guest-specific notes at the top of this file; * the text here follows those notes -- confirm against the translator. */ int sc_handler; /* Pointer to function to handle trap exceptions (tw/twi instructions). * Signature: void trap_handler(void *state) * The pointer must be set if the guest code can trap; translated code * does not check it for NULL. * NOTE(review): formerly documented as "void *trap_handler(void *state)"; * changed to match the guest-specific notes at the top of this file -- * confirm against the translator. */ int trap_handler; /* Pointers to lookup tables (of type uint16_t[64]) for the fres and * frsqrte instructions. See the BINREC_OPT_G_PPC_NATIVE_RECIPROCAL * optimization flag documentation for details. */ int fres_lut; int frsqrte_lut; } binrec_setup_ppc_t; /** * binrec_setup_t: Structure which defines various parameters used by the * translator. Used by binrec_create_handle(). */ typedef struct binrec_setup_t { /** * guest, host: BINREC_ARCH_* values indicating the architecture and * variant to translate from (guest) and to (host). binrec_translate() * will fail if the library cannot perform the requested translation. */ binrec_arch_t guest; binrec_arch_t host; /** * host_features: Bitwise-OR of feature flags (BINREC_FEATURE_*) for * the selected host architecture, indicating which features should be * assumed to be present when generating host code.
*/ unsigned int host_features; /** * guest_memory_base: Pointer to a region of host memory reserved as * the address space of the guest code. binrec_translate() calls will * read source machine instructions and constant data from this region. * Memory accesses within the translated code itself will use the * address passed as a parameter to the code. */ void *guest_memory_base; /** * state_offsets_*: Offsets from the beginning of the processor state * block (as passed to the generated native code) to the various guest * registers and other processor state. Use the structure appropriate * to the selected guest architecture; see the definition of each * structure for details. */ union { binrec_setup_ppc_t state_offsets_ppc; }; /** * state_offset_chain_lookup: PSB offset to a pointer to a function * which looks up translated blocks for chaining (see * binrec_enable_chaining()). Signature: * void *chain_lookup(void *state, uint32_t target_address) */ int state_offset_chain_lookup; /** * state_offset_branch_exit_flag: PSB offset to a 32-bit value to * check at intra-unit branches (see binrec_enable_branch_exit_test()). */ int state_offset_branch_exit_flag; /** * userdata: Opaque pointer which is passed to all callback functions * below. */ void *userdata; /** * malloc: Pointer to a function which allocates memory, like malloc(). * If NULL, the system's malloc() will be used. * * Like standard malloc(), this function may return either NULL or a * pointer to a zero-size memory block if passed a size of zero. * * [Parameters] * userdata: User data pointer from setup structure. * size: Size of block to allocate, in bytes. * [Return value] * Pointer to allocated memory, or NULL on error or if size == 0. */ void *(*malloc)(void *userdata, size_t size); /** * realloc: Pointer to a function which resizes a block of allocated * memory, like realloc(). If NULL, the system's realloc() will be used.
* * Like standard realloc(), this function may return either NULL or a * pointer to a zero-size memory block if passed a size of zero. * * [Parameters] * userdata: User data pointer from setup structure. * ptr: Block to resize, or NULL to allocate a new block. * size: New size of block, in bytes, or 0 to free the block. * [Return value] * Pointer to allocated memory, or NULL on error or if size == 0. */ void *(*realloc)(void *userdata, void *ptr, size_t size); /** * free: Pointer to a function which frees a block of allocated * memory, like free(). If NULL, the system's free() will be used. * This function does not return a value. * * [Parameters] * userdata: User data pointer from setup structure. * ptr: Block to free (may be NULL). */ void (*free)(void *userdata, void *ptr); /** * code_malloc: Pointer to a function which allocates a block of * memory for output machine code. If NULL, the malloc() callback (or * the system's malloc(), if that callback is also NULL) will be used * and no alignment will be performed. Since some hosts require * aligned code (x86-64 translated code must be 16-byte aligned; see * the host-specific notes at the top of this file), this callback * should be supplied unless the fallback allocator is known to return * suitably aligned memory. * * [Parameters] * userdata: User data pointer from setup structure. * size: Size of block to allocate, in bytes (guaranteed to be * nonzero). * alignment: Desired address alignment, in bytes (guaranteed to * be a power of 2). * [Return value] * Pointer to allocated memory, or NULL on error. */ void *(*code_malloc)(void *userdata, size_t size, size_t alignment); /** * code_realloc: Pointer to a function which resizes a block of memory * allocated with the code_malloc() callback. If NULL, the realloc() * callback (or the system's realloc(), if that callback is also NULL) * will be used. * * [Parameters] * userdata: User data pointer from setup structure. * ptr: Block to resize (guaranteed to be non-NULL). * old_size: Current size of block, in bytes. * new_size: New size of block, in bytes (guaranteed to be nonzero).
* alignment: Required address alignment, in bytes (guaranteed to * be equal to the value used for initial allocation). * [Return value] * Pointer to allocated memory, or NULL on error. */ void *(*code_realloc)(void *userdata, void *ptr, size_t old_size, size_t new_size, size_t alignment); /** * code_free: Pointer to a function which frees a block of memory * allocated with the code_malloc() callback. If NULL, the free() * callback (or the system's free(), if that callback is also NULL) * will be used. * * [Parameters] * userdata: User data pointer from setup structure. * ptr: Block to free (may be NULL). */ void (*code_free)(void *userdata, void *ptr); /** * log: Pointer to a function to log messages from the library. * If NULL, no logging will be performed. * * [Parameters] * userdata: User data pointer from setup structure. * level: Log level (BINREC_LOGLEVEL_*). * message: Log message. */ void (*log)(void *userdata, binrec_loglevel_t level, const char *message); } binrec_setup_t; /*--------------------- General optimization flags ----------------------*/ /* * Optimizations performed by the library can generally be classified into * three types: * * - Behavior-safe: optimizations which purely affect the size or speed of * the generated code and have no effect on behavior. Optimizations * such as constant folding and deconditioning fall into this category. * Optimizations can be assumed to fall under this category where not * otherwise documented. * * - Specification-safe: optimizations which may change the behavior of * the generated code, but only within limits prescribed by the relevant * specification.
For example, the NATIVE_IEEE_UNDERFLOW optimization * may change the results of certain floating-point operations relative * to the results returned by guest code running on its native hardware, * but the IEEE floating-point specification allows either of two * behaviors, so with respect to that specification, the optimized code * is no less correct than the original. As long as the guest code was * written to follow the specifications rather than the precise behavior * of the guest hardware, it will still behave correctly under these * optimizations. * * - Unsafe: optimizations which can materially impact the behavior of the * generated code, such as stack frame optimization. These optimizations * can benefit code which rigorously adheres to the relevant assumptions, * such as code produced by a high-level language compiler, but they can * cause nonconformant code to misbehave or even crash. Documentation * for an unsafe optimization will clearly indicate that fact. */ /** * BINREC_OPT_BASIC: Enable basic optimization of translated code. This * includes the following transformations: * * - Branches to other (unconditional or same-conditioned) branch * instructions will be threaded through to the final branch destination. * * - Unreachable basic blocks will be eliminated from the code stream. * * - Branches to the next instruction will be eliminated. */ #define BINREC_OPT_BASIC (1<<0) /** * BINREC_OPT_DECONDITION: Convert conditional branches and moves with * constant conditions to unconditional instructions or NOPs. This is * most useful in conjunction with constant folding * (BINREC_OPT_FOLD_CONSTANTS). */ #define BINREC_OPT_DECONDITION (1<<1) /** * BINREC_OPT_DEEP_DATA_FLOW: Perform extended data flow analysis on * values associated with guest architecture registers to find dead stores. * This optimization by itself only finds dead stores; enable BINREC_OPT_DSE * to remove them from the code stream.
*/ #define BINREC_OPT_DEEP_DATA_FLOW (1<<2) /** * BINREC_OPT_DSE: Perform dead store elimination (DSE) on the translated * code, removing instructions whose outputs are not used. * * Instructions with side effects, such as floating-point operations (which * could raise exceptions) or atomic read-modify-write instructions, are * not eliminated. However, floating-point instructions will be eliminated * if the BINREC_OPT_DSE_FP optimization is also enabled. */ #define BINREC_OPT_DSE (1<<3) /** * BINREC_OPT_DSE_FP: Allow elimination of floating-point operations when * performing dead store elimination. * * This optimization is UNSAFE: if an eliminated operation would have * raised a floating-point exception which the guest code checks for, the * translated code will not behave correctly. */ #define BINREC_OPT_DSE_FP (1<<4) /** * BINREC_OPT_FOLD_CONSTANTS: Look for computations whose operands are all * constant and load operations which load from a constant address within * memory marked read-only (see binrec_add_readonly_region()), and convert * them to load-immediate operations. The computed values are themselves * treated as constant, so constantness can be propagated through multiple * instructions. Intermediate values in a computation sequence which end * up being unused due to constant folding, as well as any other * instructions whose outputs are not used elsewhere, are removed * from the code stream if BINREC_OPT_DSE is also enabled. * * Floating-point operations will not be folded unless the * BINREC_OPT_FOLD_FP_CONSTANTS optimization is also enabled. */ #define BINREC_OPT_FOLD_CONSTANTS (1<<5) /** * BINREC_OPT_FOLD_FP_CONSTANTS: Fold floating-point as well as integer * constants, performing the floating-point operations in the runtime * environment. Any floating-point exceptions generated by the operation * are discarded. * * This flag is ignored if BINREC_OPT_FOLD_CONSTANTS is not also enabled.
* * This optimization is UNSAFE: if the floating-point behavior of the * runtime environment differs materially from that of the guest * architecture (such as by not complying with IEEE 754) or if a folded * operation generates a floating-point exception and the guest code * expects to detect that exception, the translated code will not behave * correctly. */ #define BINREC_OPT_FOLD_FP_CONSTANTS (1<<6) /** * BINREC_OPT_FOLD_VECTORS: Attempt to eliminate vector registers whose * values are only used as scalars. For example, if two scalar values are * merged into a vector but are immediately extracted to scalars again, * the vector register is not needed and the initial scalar values can be * forwarded to the later computations. * * This optimization only finds removable vector registers and forwards * the associated scalar values; the BINREC_OPT_DSE optimization flag is * required to eliminate the vectors from the code stream. */ #define BINREC_OPT_FOLD_VECTORS (1<<7) /** * BINREC_OPT_NATIVE_IEEE_NAN: Use the host's rules for NaN results of * floating-point operations, even when those rules differ from the guest * architecture, and allow the host to reorder operands to a floating-point * operation even if doing so would change the NaN returned for an invalid * operation. * * The IEEE specification allows differing behavior between implementations * in the following cases: * * - If more than one operand to an operation is a NaN, the returned NaN * may be any of those values. * * - If a NaN is generated due to an invalid operation, its bit pattern * may be any quiet NaN bit pattern. * * This optimization allows the translator to translate floating-point * operations directly to their host equivalents without manually checking * for NaNs, which can require several additional host instructions per * guest instruction.
* * This optimization is specification-safe: as long as guest code follows * the IEEE 754 specifications, it will behave correctly under this * optimization. */ #define BINREC_OPT_NATIVE_IEEE_NAN (1<<8) /** * BINREC_OPT_NATIVE_IEEE_UNDERFLOW: Use the host's definition of * underflow for IEEE floating-point arithmetic, even when that differs * from the guest's definition. * * When translating between architectures which use different definitions * of underflow (IEEE allows two different behaviors: tiny before rounding * and tiny after rounding), this optimization allows floating-point * operations to be translated directly to their equivalent host * instructions, at the cost of different exception states for operations * with a result which is treated as underflowing on one architecture and * not the other. If this optimization is disabled, floating-point * operations must check explicitly for underflow, which can require * several additional host instructions per guest instruction. * * If the host and guest use the same "tiny" rules, floating-point * operations can always be translated directly to native instructions * (at least with regard to tininess), and this flag has no effect on * translation. * * Enabling the BINREC_OPT_DSE and BINREC_OPT_DSE_FP optimizations will * typically have the effect of enabling this optimization as well. * * This optimization is specification-safe: as long as guest code follows * the IEEE 754 specifications, it will behave correctly under this * optimization. */ #define BINREC_OPT_NATIVE_IEEE_UNDERFLOW (1<<9) /*----------- Guest-architecture-specific optimization flags ------------*/ /* * Note that the numeric values of these flags overlap those of the general * BINREC_OPT_* flags above (both sets start at 1<<0), so flags from the two * sets cannot be told apart once combined into a single bitmask. */ /** * BINREC_OPT_G_PPC_ASSUME_NO_SNAN: Do not attempt to preserve the * signaling/quiet state of floating-point NaN (not-a-number) values.
* * The single-precision lfs and stfs instructions preserve the state of the * "quiet" bit in a floating-point NaN (the high bit of the mantissa) when * converting it to or from double precision, but the host instructions * used to implement such a format conversion may treat it as an arithmetic * operation which quiets any incoming signaling NaN, and detecting that * such a change took place can require many more host instructions than * the conversion itself. If this optimization is enabled, the translator * will use the fastest possible method to convert between single and * double precision, ignoring the possibility of signaling NaNs. If a * signaling NaN is in fact loaded, its quiet bit will be set; this may * cause a later floating-point instruction to fail to raise an expected * invalid-operation exception, or have other unpredictable effects if the * bitwise contents of the value are used in non-floating-point operations. * * This optimization is UNSAFE: code which relies on being able to load a * signaling NaN will not behave correctly. But see the * BINREC_OPT_G_PPC_FORWARD_LOADS optimization for a way to avoid the * impact of this optimization on code which loads a signaling NaN (or * non-floating-point data which looks like one) and immediately stores it * back to memory. */ #define BINREC_OPT_G_PPC_ASSUME_NO_SNAN (1<<0) /** * BINREC_OPT_G_PPC_CONSTANT_GQRS: Assume that the values of the GQRs * (graphics quantization registers, used with paired-single load and * store instructions) are constant with respect to the entry point of a * translation unit. * * Ordinarily, translated code for a psq_l* or psq_st* instruction must * read the referenced GQR at runtime and choose the appropriate load or * store operation, which both adds several instructions' worth of latency * and significantly increases code size.
If this optimization is enabled, * the translator will instead read (at translation time) the value of each * GQR referenced by guest code and translate paired-single load and store * instructions based on those values. * * If this optimization is enabled, an mtspr instruction which writes to a * GQR will cause the translated code to immediately return to its caller. * This ensures that any following load or store instructions will be * translated using the value written by the mtspr instruction. * * This optimization is UNSAFE: if the value of a GQR is not constant with * respect to any paired-single load or store instruction, the translated * code will not behave correctly. */ #define BINREC_OPT_G_PPC_CONSTANT_GQRS (1<<1) /** * BINREC_OPT_G_PPC_DETECT_FCFI_EMUL: Detect code sequences which convert * an integer to a floating-point number using bit manipulation and local * stack frame storage. * * The 32-bit PowerPC architecture does not include the fcfi (floating * convert-from-integer) instructions of the 64-bit architecture, so * programs must implement this conversion in software. One idiom used * with 32-bit integer input is to create a 64-bit floating-point value on * the stack in which the lowest bit of the mantissa has the value 1.0 * (the bit pattern of this value is 0x43300000_00000000), write the * integer to be converted into the low 32 bits of the value, then load * the value as double-precision floating point and subtract the base * value. (For signed integer input, the high bit of the integer and the * corresponding bit in the base floating-point value are inverted.) * * If this optimization is enabled, the translator will check for a * sequence of instructions matching the above pattern and translate it to * a direct int-to-float conversion instruction.
Currently, the input * instruction sequence must match the following: * lis rX,0x4330 * stw rX,N(r1) // N >= 8 * stw rINPUT,N+4(r1) // signed: xoris rY,rINPUT,0x8000; stw rY,N+4(r1) * lfd fP,N(r1) * lfd fQ,const_43300000_00000000 // signed: const_43300000_80000000 * fsub fOUTPUT,fP,fQ * [optional: frsp fOUTPUT,fOUTPUT] * where "const_..." is a memory reference to a constant address registered * with libbinrec as read-only memory containing the listed 64-bit constant. * The conversion operation will be detected even if the instructions are * reordered (preserving dependencies), but detection may fail if other * operations (such as a register move) are interspersed in the dependency * chain. * * Intermediate operations in the sequence are still translated, so that * other instructions which use their values still behave correctly. * (However, the load of fP or fQ will be omitted if fOUTPUT refers to the * same register, and the writes to rX and rY may subsequently be deleted * by dead store elimination.) * * This optimization is UNSAFE: if the floating-point value generated on * the stack is overwritten by an indexed, multi-word, or non-r1-relative * store, or if another agent (such as a separate thread) modifies the * function's stack frame during the optimized instruction sequence, the * translated code will not behave correctly. (However, these situations * are not expected to arise in real-world code.) */ #define BINREC_OPT_G_PPC_DETECT_FCFI_EMUL (1<<2) /** * BINREC_OPT_G_PPC_FAST_FCTIW: Leave the high word of the destination * FPR of an fctiw or fctiwz instruction unspecified. * * The PowerPC architecture specification states that the high 32 bits of * the destination register of an fctiw or fctiwz instruction are * unspecified.
However, by default libbinrec will follow the actual * implementation used on the 750CL processor, which sets the high word to * 0xFFF8_0000 (thus turning the entire 64-bit register into a NaN when * interpreted as a double-precision value). * * Enabling this optimization will allow the translator to use whatever * method is fastest on the host for converting to integer. This will not * affect the value stored in the low 32 bits of the register, but the * high 32 bits may be left unchanged or changed to some other value. * * This optimization is specification-safe: as long as guest code follows * the PowerPC architecture specification, it will behave correctly under * this optimization. */ #define BINREC_OPT_G_PPC_FAST_FCTIW (1<<3) /** * BINREC_OPT_G_PPC_FAST_FMADDS: Use simple arithmetic conversion when * rounding a double-precision multiply-add result to single precision. * * When using double-precision inputs to a single-precision fused * multiply-add instruction like fmadds or ps_madd, the result must be * rounded to single precision before storing it in the output register. * Using simple arithmetic conversion for this rounding can change the * result in certain cases, specifically under the following conditions: * the rounding mode is set to round-to-nearest, the product is exactly * halfway between two single-precision values (the low 29 bits of the * double-precision mantissa are 0x1000_0000), and the addend is tiny with * respect to the product. In this case, the infinitely precise result is * not exactly between two single-precision values, so it should round to * the nearer one; but if the output of the double-precision operation is * used for rounding, the addend will have been already rounded off, so * the rounding input will be treated as a tie and may round in the wrong * direction. 
* * Enabling this optimization allows the translator to ignore the * possibility of the inaccuracy described above and round double-precision * results with a simple arithmetic operation, which is significantly * faster than checking for and correcting rounding error. * * This optimization is UNSAFE: if enabled, results of single-precision * multiply-add operations with certain operands will differ in the lowest * bit from the correct values. */ #define BINREC_OPT_G_PPC_FAST_FMADDS (1<<4) /** * BINREC_OPT_G_PPC_FAST_FMULS: Do not attempt to round the second * multiplicand (frC) to a single-precision multiply or multiply-add * instruction. * * According to the PowerPC architecture specification, the result of * using non-single-precision values with single-precision instructions is * undefined. Real 32-bit PowerPC chips (at least the 750CL) just perform * the operation in double precision and round the result to single * precision -- with one exception: the second operand (frC) to a multiply * operation has its mantissa rounded to 24 bits before the multiplication * is performed. libbinrec implements this rounding on an frC operand * which is not known to already be in single-precision format; since the * library does not perform the deep analysis required to carry knowledge * of data format across branches (such as in loops), rounding may have to * be performed frequently even in guest code which properly converts all * values to single precision before using single-precision instructions. * This rounding is fairly expensive because of the various edge cases * that need to be handled. * * If this optimization is enabled, libbinrec will assume that the frC * operand to an fmuls, fmadds, fmsubs, fnmadds, or fnmsubs instruction is * representable in single precision even if it is not known to be in * single-precision format, and will skip the rounding step.
* * In the more general case of double-precision operands used with * single-precision instructions, libbinrec always performs the operation * in double precision, since it will generally be no slower (and often * faster) to round the result after the operation than to round all input * operands beforehand, and as long as the input values are representable * in single precision, rounding a double-precision result gives the same * output as performing the operation in single precision. * * This optimization is specification-safe: as long as guest code follows * the PowerPC architecture specification, it will behave correctly under * this optimization. */ #define BINREC_OPT_G_PPC_FAST_FMULS (1<<5) /** * BINREC_OPT_G_PPC_FAST_STFS: Use mathematical rather than bitwise * conversion when storing double-precision values as single precision. * * The stfs instruction (as well as stfsx, stfsu, and stfsux) is defined * to have a specific behavior with respect to double-precision values, * which has the effect of converting the value to single precision in * round-toward-zero mode if the value is in the range of values * representable in single precision but does not treat overflow or * underflow conditions specially -- thus, for example, storing an FPR * containing the double-precision value 2^256 with stfs stores the bit * pattern 0x3F80_0000, equal to 1.0 in single precision. Properly * implementing this behavior is significantly more expensive than simply * converting the value to single precision as an arithmetic operation * and storing that result, which can make this behavior a bottleneck for * programs which process large amounts of single-precision data. * * Enabling this optimization causes the translator to use ordinary * arithmetic conversion when storing double-precision values with the * stfs group of instructions.
This deviates from the PowerPC * specification, but (particularly if used with the ASSUME_NO_SNAN * optimization) allows single-precision stores to be implemented with * many fewer host instructions. * * This optimization is UNSAFE: if the guest code relies on the precise * conversion behavior of stfs-group instructions, the translated code * will not behave correctly. */ #define BINREC_OPT_G_PPC_FAST_STFS (1<<6) /** * BINREC_OPT_G_PPC_FNMADD_ZERO_SIGN: Do not attempt to return the correct * sign on the result of an fnmadd[s] or fnmsub[s] instruction. * * The PowerPC fnmadd and fnmsub instructions calculate -(frA*frC+frB) and * -(frA*frC-frB), respectively. These differ from the fused multiply-add * instructions of some other architectures, such as x86, in which the * operations of the same names only negate the product, such that fnmadd * calculates -(frA*frC)+frB. While this difference can be mostly covered * by translating to the opposite operation (PowerPC fnmadd -> x86 fnmsub), * this gives the wrong sign on some zero results, so a correct translation * requires using a positive fmadd/fmsub and manually negating the result. * * Enabling this optimization allows the translator to translate these * instructions to single fnmsub/fnmadd operations, at the cost of * returning zero with an incorrect sign in the cases mentioned above. * * This optimization is UNSAFE for obvious reasons, though it is believed * that most real-life PowerPC code does not differentiate between positive * and negative zero. */ #define BINREC_OPT_G_PPC_FNMADD_ZERO_SIGN (1<<7) /** * BINREC_OPT_G_PPC_FORWARD_LOADS: Save the raw value read from memory for * each load instruction, and if the same value is stored back to memory, * store the raw value instead of reading back the register. * * On little-endian hosts, this avoids the need to byte-swap values an * extra time when storing them, as well as a copy between integer and * floating-point registers for floating-point values.
This optimization * can also reduce the impact of the ASSUME_NO_SNAN optimization for guest * code which uses lfs/stfs to copy non-floating-point data (cases have * been observed which, for example, copy byte-reversed floating-point data * using lfs/stfs before fixing the byte order). On the flip side, this * optimization may increase register pressure for loads which are in fact * forwarded, and this can in turn negate the benefits of the optimization * due to register spills. * * If the optimizations BINREC_OPT_G_PPC_PS_STORE_DENORMALS and * BINREC_OPT_G_PPC_CONSTANT_GQRS are active, this optimization also * allows forwarding of floating-point paired-single loads. * * Note that BINREC_OPT_DSE should always be used with this optimization * so that speculative loads for forwarding are eliminated if they are not * forwarded. */ #define BINREC_OPT_G_PPC_FORWARD_LOADS (1<<8) /** * BINREC_OPT_G_PPC_IGNORE_FPSCR_VXFOO: Do not set FPSCR exception bits * for specific invalid exception types (the "VXFOO" bits). * * The PowerPC architecture includes several FPSCR bits which indicate * specific types of floating-point invalid operation exceptions, such as * subtraction of infinities (VXISI) or use of a signaling NaN (VXSNAN). * Detecting these cases on a host architecture which does not expose such * information requires additional manual checks on the operands to each * floating-point operation and can have a severe impact on performance. * Enabling this optimization allows the translator to skip these checks, * treating any invalid-operation exception as VXSNAN whether or not any * operand was in fact a signaling NaN. (Other VXFOO exception bits are * still set in cases where doing so does not affect performance.) * * Instructions which directly manipulate FPSCR, such as mtfsf, are not * affected by this optimization and continue to behave normally. 
* * This optimization is UNSAFE for obvious reasons, though it is believed * that most real-life PowerPC code does not make use of the VXFOO bits. * * This optimization has no effect if BINREC_OPT_G_PPC_NO_FPSCR_STATE is * enabled. */ #define BINREC_OPT_G_PPC_IGNORE_FPSCR_VXFOO (1<<9) /** * BINREC_OPT_G_PPC_NATIVE_RECIPROCAL: Translate guest PowerPC * reciprocal-estimate instructions (fres and frsqrte) directly to their * host equivalents, maintaining compliance with the PowerPC architecture * specification but disregarding the precise behavior of the guest * architecture. * * The PowerPC architecture specifies bounds within which the results of * these instructions will fall relative to the true (mathematical) result. * Programs written to be compliant with the architecture will work * correctly regardless of the exact output of the instruction, though the * precise behavior of the program (for example, the low-end bits of the * result) may change. This flag allows the translator to choose faster * host instructions which may not give exactly the same result but still * satisfy the PowerPC architecture constraints. * * If this optimization is disabled, the translator will attempt to match * the precise behavior of the guest architecture by using lookup tables * referenced by pointers in the processor state block (see the fres_lut * and frsqrte_lut fields in binrec_setup_ppc_t). The translated code will * crash if it executes an fres or frsqrte instruction, this optimization * is not enabled, and the appropriate pointer in the state block is not set. 
* * The tables for the 750CL processor are as follows: * * fres: 0x3FFC,0x3E1, 0x3C1C,0x3A7, 0x3875,0x371, 0x3504,0x340, * 0x31C4,0x313, 0x2EB1,0x2EA, 0x2BC8,0x2C4, 0x2904,0x2A0, * 0x2664,0x27F, 0x23E5,0x261, 0x2184,0x245, 0x1F40,0x22A, * 0x1D16,0x212, 0x1B04,0x1FB, 0x190A,0x1E5, 0x1725,0x1D1, * 0x1554,0x1BE, 0x1396,0x1AC, 0x11EB,0x19B, 0x104F,0x18B, * 0x0EC4,0x17C, 0x0D48,0x16E, 0x0BD7,0x15B, 0x0A7C,0x15B, * 0x0922,0x143, 0x07DF,0x143, 0x069C,0x12D, 0x056F,0x12D, * 0x0442,0x11A, 0x0328,0x11A, 0x020E,0x108, 0x0106,0x106 * * frsqrte: 0x7FF4,0x7A4, 0x7852,0x700, 0x7154,0x670, 0x6AE4,0x5F2, * 0x64F2,0x584, 0x5F6E,0x524, 0x5A4C,0x4CC, 0x5580,0x47E, * 0x5102,0x43A, 0x4CCA,0x3FA, 0x48D0,0x3C2, 0x450E,0x38E, * 0x4182,0x35E, 0x3E24,0x332, 0x3AF2,0x30A, 0x37E8,0x2E6, * 0x34FD,0x568, 0x2F97,0x4F3, 0x2AA5,0x48D, 0x2618,0x435, * 0x21E4,0x3E7, 0x1DFE,0x3A2, 0x1A5C,0x365, 0x16F8,0x32E, * 0x13CA,0x2FC, 0x10CE,0x2D0, 0x0DFE,0x2A8, 0x0B57,0x283, * 0x08D4,0x261, 0x0673,0x243, 0x0431,0x226, 0x020B,0x20B * * Depending on the performance details of the host CPU and the types of * input values used by the guest code, enabling this optimization may * actually result in slower code (particularly for frsqrte), though it * will always reduce code size. * * This optimization is specification-safe: as long as guest code follows * the PowerPC architecture specification, it will behave correctly under * this optimization. */ #define BINREC_OPT_G_PPC_NATIVE_RECIPROCAL (1<<10) /** * BINREC_OPT_G_PPC_NO_FPSCR_STATE: Do not write any state bits (exception * bits, FR, FI, or FPRF) in FPSCR based on floating-point operation results. * * Enabling this optimization causes the translated code to ignore all * host FPU exception conditions and skip setting FPRF to reflect the * value type. For guest code which does not enable floating-point * exceptions or check the FPSCR status bits, this results in significantly * faster and smaller translated code with no effect on program behavior. 
* * The control mode bits are honored as usual, though only FPSCR[RN] has * any effect on program behavior in this case; the exception enable bits * are meaningless since exceptions are not detected, and nonzero FPSCR[NI] * is not currently supported by the translator. * * Instructions which directly manipulate FPSCR, such as mtfsf, are not * affected by this optimization and continue to behave normally. If any * of the FR/FI/FPRF bits are set by such an instruction, they will remain * set even after floating-point instructions which would normally * overwrite them. * * Floating-point instructions with the Rc bit set will copy the high 4 * bits of FPSCR to the cr1 field of CR as usual, though the bit values * will naturally not reflect the result of any floating-point operations. * The library will log a warning (once per translation unit) if such an * instruction is encountered when this optimization is enabled. * * This optimization implicitly enables BINREC_OPT_NATIVE_IEEE_UNDERFLOW. * * This optimization is UNSAFE: code which relies on any of the FPSCR * state bits will behave incorrectly if this optimization is enabled. */ #define BINREC_OPT_G_PPC_NO_FPSCR_STATE (1<<11) /** * BINREC_OPT_G_PPC_PAIRED_LWARX_STWCX: Optimize the sequence of lwarx * followed by stwcx. if there are no intervening branches or branch * targets. * * This optimization allows the translator to forward data from a paired * lwarx to its associated stwcx., avoiding unnecessary accesses to the * processor state block. * * Additionally, if the data operand to stwcx. is the same as the value * loaded with lwarx, the store itself is omitted and the code behaves as * if the store succeeded. (This case appears to arise from compilers * which always generate lwarx when reading from an atomic variable; in * such a case, a dummy stwcx. is required to clear the reservation state.) 
* Since conditionally storing back the loaded value will never cause a * change to memory, and since we translate lwarx/stwcx. using a compare- * and-exchange model rather than precisely emulating the reserve-and-snoop * behavior of PowerPC hardware, this transformation is safe. */ #define BINREC_OPT_G_PPC_PAIRED_LWARX_STWCX (1<<12) /** * BINREC_OPT_G_PPC_PS_STORE_DENORMALS: Do not flush denormals to zero * when storing floating-point values with the paired-single store * instructions (psq_st[u][x]). * * Normally, the paired-single store instructions flush denormal values * to zero before writing them to memory. Enabling this optimization * allows the translator to skip the expensive denormal check and write * the values straight to memory. * * Even when this optimization is enabled, psq_st instructions will flush * denormals to zero if the value is read in double precision (this depends * on the internal translator state at the particular instruction) and the * BINREC_OPT_G_PPC_FAST_STFS optimization is not enabled, since in that * case, flushing to zero is faster than producing a correct denormal value. * * This optimization is UNSAFE: code which relies on denormals being * flushed to zero by paired-single store instructions will behave * incorrectly if this optimization is enabled. */ #define BINREC_OPT_G_PPC_PS_STORE_DENORMALS (1<<13) /** * BINREC_OPT_G_PPC_SC_BLR: Optimize an instruction sequence of "sc; blr" * by setting NIA to the value of the LR register rather than the address * of the instruction following the "sc" when calling the sc handler. * This avoids the need to translate and call a block containing a single * blr after returning from the sc handler. * * This optimization is UNSAFE: the sc handler cannot recover the original * address of the instruction which triggered the exception when this * optimization is triggered. 
*/ #define BINREC_OPT_G_PPC_SC_BLR (1<<14) /** * BINREC_OPT_G_PPC_SINGLE_PREC_INPUTS: Assume that the inputs to a * single-precision floating-point instruction are in single precision. * * The PowerPC specification requires inputs to a single-precision * floating-point instruction to be in single precision (either the result * of lfs or another single-precision load instruction, or the output of * a previous frsp or single-precision floating-point operation). However, * actual 32-bit PowerPC CPUs ignore the precision of floating-point * mathematical instructions and always perform the operation using the * full precision of the inputs, and by default libbinrec emulates this * behavior to accommodate code which takes advantage of this quirk (in * violation of the PowerPC spec). * * If this optimization is enabled, libbinrec will assume that inputs to a * single-precision instruction are proper single-precision values, and * will perform the operation in single instead of double precision when * beneficial. This improves performance when one input to such an * instruction is the output of a previous single-precision instruction in * the same basic block and the other input is an FPR which has not yet * been accessed in the block; in that case, this optimization allows the * translator to convert the second input to single precision and perform * the operation in single precision, instead of converting the first * operand to double precision, performing the operation in double * precision, then converting the result back to single precision. * * This optimization also causes the ps_merge{00,01,10,11} instructions to * perform a simple 64-to-32 bit arithmetic conversion when loading input * values from the processor state block, rather than emulating the 750CL * quirk of truncating an excess-precision value loaded into the PS1 slot * (which requires a pair of host FPU rounding mode changes).
* * This optimization is specification-safe: as long as guest code follows * the PowerPC architecture specification, it will behave correctly under * this optimization. */ #define BINREC_OPT_G_PPC_SINGLE_PREC_INPUTS (1<<15) /** * BINREC_OPT_G_PPC_TRIM_CR_STORES: Analyze the data flow through each * CR bit and eliminate stores which are not visible outside the * translated code. * * If the branch exit test is enabled, then when translated code returns * to its caller due to the branch test, this optimization may leave stale * values in CR bits which would have been overwritten by later code in * the same translation unit. This will not result in a change in behavior * as long as execution is eventually restarted with the same processor * state at the branch target address, but if the client program relies on * (or transfers execution to guest code which relies on) the value of CR, * it may not behave as expected. * * This optimization has no effect unless BINREC_OPT_G_PPC_USE_SPLIT_FIELDS * is also enabled. */ #define BINREC_OPT_G_PPC_TRIM_CR_STORES (1<<16) /** * BINREC_OPT_G_PPC_USE_SPLIT_FIELDS: Treat subfields of certain registers * as separate values, rather than directly modifying the associated bits * in the register. * * Enabling this optimization causes individual bits of CR and the FPRF * field of FPSCR to be treated as separate "variables" in their own right; * the translated code will extract their values on entry and recombine * them into the full register on exit. This allows data flow analysis to * find dead stores to specific fields, which otherwise would be obscured * by the dependency on the full register's previous state. * * If this optimization is enabled, pre- and post-instruction callbacks * and timebase handlers may see incorrect values of CR and FPSCR[FPRF] * in the processor state block. System call and trap handlers are not * affected.
*/ #define BINREC_OPT_G_PPC_USE_SPLIT_FIELDS (1<<17) /*------------ Host-architecture-specific optimization flags ------------*/ /** * BINREC_OPT_H_X86_ADDRESS_OPERANDS: Encode certain address calculations * directly in a load, store, or atomic instruction when feasible. If the * address operand of such an instruction is not referenced by any other * instruction (after the one that sets it), then: * * - If the address is the sum of a register and a constant, and the sum of * that constant and the offset encoded in the instruction is within the * range of a 32-bit signed integer, eliminate the addition and use the * combined offset as the access offset in the instruction. * * - If the address is the sum of two registers, eliminate the addition and * encode the access using the base-plus-index format. */ #define BINREC_OPT_H_X86_ADDRESS_OPERANDS (1<<0) /** * BINREC_OPT_H_X86_BRANCH_ALIGNMENT: Align branch targets to a multiple * of 16 bytes by inserting NOP instructions at appropriate points in the * generated code stream, when doing so seems likely to improve performance. */ #define BINREC_OPT_H_X86_BRANCH_ALIGNMENT (1<<1) /** * BINREC_OPT_H_X86_CONDITION_CODES: Track the state of the condition * codes in the EFLAGS register, and avoid adding an explicit TEST or CMP * instruction for a register if the condition codes already reflect the * value of that register. */ #define BINREC_OPT_H_X86_CONDITION_CODES (1<<2) /** * BINREC_OPT_H_X86_FIXED_REGS: When an instruction requires an operand to * be in a specific hardware register (shift counts must be in CL, for * example), try harder to allocate that hardware register for the operand. * This requires an extra pass over the translated machine code during * register allocation. 
*/ #define BINREC_OPT_H_X86_FIXED_REGS (1<<3) /** * BINREC_OPT_H_X86_FORWARD_CONDITIONS: When a register used as the * condition for a conditional branch or move is the result of a comparison * instruction and that register is not used elsewhere, eliminate the * register and forward the comparison condition to the branch or move * instruction. * * Floating-point comparisons will not be forwarded unless the * BINREC_OPT_DSE_FP common optimization flag is enabled. */ #define BINREC_OPT_H_X86_FORWARD_CONDITIONS (1<<4) /** * BINREC_OPT_H_X86_MERGE_REGS: Try harder to avoid moving values * between registers. * * Enabling this optimization causes the register allocator to attempt to * keep values associated with guest architecture registers in the same * host register across basic blocks. Typically this will result in * faster and more compact code, but with certain code patterns it may * hurt performance by causing more-frequently-accessed values to be * spilled to memory. */ #define BINREC_OPT_H_X86_MERGE_REGS (1<<5) /** * BINREC_OPT_H_X86_STORE_IMMEDIATE: When an immediate value is used only * as the data for a store operation, encode the immediate value directly * in the instruction instead of using a register. */ #define BINREC_OPT_H_X86_STORE_IMMEDIATE (1<<6) /*************************************************************************/ /******** Interface: Library and runtime environment information *********/ /*************************************************************************/ /** * binrec_version: Return the version number of the library as a string * (for example, "1.2.3"). * * [Return value] * Library version number. */ extern const char *binrec_version(void); /** * binrec_native_arch: Return a BINREC_ARCH_* constant representing the * architecture of the runtime environment, or 0 (BINREC_ARCH_INVALID) if * the runtime environment does not correspond to a supported host * architecture. 
(If a nonzero value is returned, it will always be valid * as a host architecture for translation.) * * [Return value] * Runtime environment architecture (BINREC_ARCH_*), or 0 if unsupported. */ extern binrec_arch_t binrec_native_arch(void); /** * binrec_native_features: Return a bitmask of architecture features * (BINREC_FEATURE_*) supported by the runtime environment, or 0 if the * runtime environment does not correspond to a supported architecture. * * [Return value] * Runtime environment feature bitmap, or 0 if unsupported. */ extern unsigned int binrec_native_features(void); /** * binrec_guest_supported: Return whether the given architecture is * supported as a guest architecture for translation. * * [Parameters] * arch: Architecture to check. * [Return value] * True (nonzero) if the given architecture is supported as a guest * architecture, false (zero) if not. */ extern int binrec_guest_supported(binrec_arch_t arch); /** * binrec_host_supported: Return whether the given architecture is * supported as a host architecture for translation. * * [Parameters] * arch: Architecture to check. * [Return value] * True (nonzero) if the given architecture is supported as a host * architecture, false (zero) if not. */ extern int binrec_host_supported(binrec_arch_t arch); /*************************************************************************/ /*************** Interface: Translation handle management ****************/ /*************************************************************************/ /** * binrec_create_handle: Create a new translation handle. * * [Parameters] * setup: Pointer to a binrec_setup_t structure that defines the * translation parameters to use. * [Return value] * Newly created handle, or NULL on error. */ extern binrec_t *binrec_create_handle(const binrec_setup_t *setup); /** * binrec_destroy_handle: Destroy a translation handle.
*
 * This function only destroys the translation handle itself; blocks of
 * translated code generated through the handle remain valid even after
 * the handle is destroyed.
 *
 * [Parameters]
 *     handle: Handle to destroy (may be NULL).
 */
extern void binrec_destroy_handle(binrec_t *handle);

/**
 * binrec_set_code_range: Set the minimum and maximum addresses from which
 * to read source machine instructions. Branch instructions which attempt
 * to jump outside this range will terminate the translation unit, and if
 * the source machine code runs off the end of the range, the unit will be
 * terminated at the final instruction completely contained within the
 * range. The range must not wrap around the end of the address space.
 *
 * By default, the entire address space is considered valid for reading
 * instructions.
 *
 * Note that binrec_translate() also allows specifying an address range for
 * translation. This function is intended more for the purpose of avoiding
 * translation of memory areas known to be outside the bounds of the input
 * program code, such as data segments or undefined memory, similar to the
 * "execute" permission bit in hardware memory management units. (It is
 * not currently possible to specify multiple disjoint code ranges.)
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     start: First address (in guest memory) of code range.
 *     end: Last address (in guest memory) of code range (inclusive).
 *         Since the range must not wrap around, this must not be less
 *         than start.
 */
extern void binrec_set_code_range(binrec_t *handle, uint32_t start,
                                  uint32_t end);

/**
 * binrec_set_optimization_flags: Set which optimizations should be
 * performed on translated blocks. Enabling more optimizations will
 * improve the performance of translated code but increase the overhead
 * of translation; see the documentation on each optimization flag for
 * details.
 *
 * The set of enabled optimizations may be changed at any time without
 * impacting the behavior of already-translated blocks.
 *
 * By default, no optimizations are enabled.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     common_opt: Bitmask of common optimizations to apply (BINREC_OPT_*).
 *     guest_opt: Bitmask of guest-specific optimizations to apply
 *         (BINREC_OPT_G_*).
 *     host_opt: Bitmask of host-specific optimizations to apply
 *         (BINREC_OPT_H_*).
 */
extern void binrec_set_optimization_flags(
    binrec_t *handle, unsigned int common_opt, unsigned int guest_opt,
    unsigned int host_opt);

/**
 * binrec_set_max_inline_length: Set the maximum length (number of source
 * instructions, including the final return instruction) of subroutines to
 * inline. The default is zero, meaning no subroutines will be inlined.
 *
 * If a nonzero length limit is set with this function, then when the
 * translator encounters a subroutine call instruction to a fixed address,
 * it will scan ahead up to this many instructions for a return
 * instruction. If one is found, and if there are no branch instructions
 * that branch past the return, the subroutine will be inlined into the
 * current translation unit, saving the cost of jumping to a different
 * unit (which can be significant depending on how many guest registers
 * need to be spilled).
 *
 * If an inlined subroutine contains a further call instruction, that
 * subroutine will not be inlined regardless of its length. (But see
 * binrec_set_max_inline_depth() to enable such recursive inlining.)
 *
 * Note that if a nonzero length limit is set, inlining may be performed
 * regardless of whether any optimization flags are set.
 *
 * Inlining is not currently implemented; calling this function has no
 * effect in this version of the library.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     length: Maximum inline length, counted in source instructions
 *         including the final return instruction (must be at least 0;
 *         0 means no subroutines will be inlined).
 */
extern void binrec_set_max_inline_length(binrec_t *handle, int length);

/**
 * binrec_set_max_inline_depth: Set the maximum depth of subroutines to
 * inline. The default is 1.
 *
 * If a depth limit greater than 1 is set with this function, then when a
 * call instruction is encountered during inlining, the translator will
 * perform the same inlining check on the called subroutine, up to the
 * specified depth. For example, when translating at A in the following
 * pseudocode:
 *     A: call B
 *        ret
 *     B: call C
 *        ret
 *     C: call D
 *        ret
 *     D: call E
 *        ret
 *     E: nop
 *        ret
 * if the maximum inline depth is set to 2 (and assuming the maximum length
 * is set to at least 2), both B and C will be inlined, but D will not, and
 * the A routine will be translated as if it was written:
 *     A: call D
 *        ret
 *
 * Setting a value of zero disables inlining regardless of the maximum
 * inline length set with binrec_set_max_inline_length().
 *
 * Inlining is not currently implemented; calling this function has no
 * effect in this version of the library.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     depth: Maximum inline depth (must be at least 0; 0 disables
 *         inlining).
 */
extern void binrec_set_max_inline_depth(binrec_t *handle, int depth);

/**
 * binrec_add_readonly_region: Mark the given region of memory as
 * read-only. Instructions which are known to load from read-only memory
 * will be translated into load-constant operations if enabled by the
 * BINREC_OPT_FOLD_CONSTANTS optimization flag.
 *
 * When determining whether a load operation addresses read-only memory,
 * only the first (lowest) address of the referenced value is checked.
 * Thus, a multi-byte load operation which crosses the end of a read-only
 * region will still be translated to a load-constant operation, and any
 * subsequent changes to the bytes outside the read-only region will not
 * be seen by the translated code.
 *
 * If the guest code performs a store operation into a region marked as
 * read-only, subsequent behavior of the program is undefined.
 *
 * This function may fail if too many misaligned regions are added; in
 * that case, rebuild the library with different values of the
 * READONLY_PAGE_BITS and MAX_PARTIAL_READONLY constants in src/common.h.
 *
 * The address range specified by binrec_set_code_range() is treated as
 * read-only with respect to the instruction stream, regardless of whether
 * it is explicitly marked read-only via this function. However, any data
 * interspersed within the instruction stream will only be treated as
 * constant data if the address of that data has been explicitly marked
 * read-only.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     base: Base address (in guest memory) of read-only region.
 *     size: Size of read-only region, in bytes.
 * [Return value]
 *     True (nonzero) on success; false (zero) if the region could not be
 *     added because the partial-page table is full.
 */
extern int binrec_add_readonly_region(binrec_t *handle, uint32_t base,
                                      uint32_t size);

/**
 * binrec_clear_readonly_regions: Clear all read-only memory regions
 * added with binrec_add_readonly_region().
 *
 * [Parameters]
 *     handle: Handle to operate on.
 */
extern void binrec_clear_readonly_regions(binrec_t *handle);

/**
 * binrec_enable_chaining: Set whether the translated code should include
 * logic to chain directly to other blocks of translated code on exit,
 * rather than returning to its caller.
 *
 * If chaining is enabled, then when the translated code would normally
 * return to its caller with a known next instruction address, it will
 * instead look up (via a function pointer stored in the PSB at the offset
 * given by the state_offset_chain_lookup field in binrec_setup_t) the
 * address of translated code corresponding to that guest address. If the
 * function returns a non-NULL pointer, the translated code will rewrite
 * itself to jump directly to that pointer on subsequent calls. (The code
 * will then return to its caller whether it successfully resolved the
 * chain or not.) This naturally requires that the code be stored in a
 * host memory region which allows writing.
 *
 * Chain resolution (rewriting of the code to jump to the resolved target
 * address) works correctly in a multithreaded environment. If two threads
 * try to resolve the same chain at the same time and get two different
 * target addresses, one of the addresses will be used for the target, but
 * it is undefined which one.
 *
 * The code produced by libbinrec when chaining is enabled is still
 * position-independent, but once a chain has been resolved, the code will
 * be dependent on the target address of the chain.
 *
 * Calling this function has no effect on already-translated code.
 *
 * By default, chaining is disabled.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     enable: True (nonzero) to enable chaining, false (zero) to disable.
 */
extern void binrec_enable_chaining(binrec_t *handle, int enable);

/**
 * binrec_enable_branch_exit_test: Set whether to check a 32-bit value in
 * the processor state block (pointed to by state_offset_branch_exit_flag)
 * immediately before a branch to another instruction within the same
 * translation unit. If enabled, the value is tested immediately before
 * taking a branch within the translated code; if the value is nonzero,
 * the translated code will return to its caller rather than continuing
 * execution at the branch target. This can be used to safely interrupt
 * execution of the guest code ("safely" in the sense of the PSB being
 * fully updated) at a finer granularity than an entire translation unit.
 * The flag is not tested for conditional branches which are not taken or
 * for branches which would return from the translated code in any case
 * (such as indirect branches).
 *
 * Note that enabling the branch exit test can significantly reduce the
 * performance of translated code; the requirement for the code to be able
 * to exit at any branch point limits the amount of optimization that can
 * be performed across branches.
 *
 * Calling this function has no effect on already-translated code.
 *
 * By default, the branch exit test is disabled.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     enable: True (nonzero) to enable the branch exit test, false (zero)
 *         to disable it.
 */
extern void binrec_enable_branch_exit_test(binrec_t *handle, int enable);

/**
 * binrec_set_pre_insn_callback: Set a callback function which will be
 * called immediately before executing each guest instruction. This can
 * be used, for example, to log an execution trace or to record the state
 * of the guest processor at a particular point. Pass NULL to disable an
 * existing callback.
 *
 * The callback receives two parameters: the processor state block pointer
 * passed to the translated code and the address of the instruction about
 * to be executed.
 *
 * Calling this function has no effect on already-translated code.
 *
 * The pre- and post-instruction callbacks are generally useful only for
 * debugging or analysis of code at runtime, so they are set directly as
 * pointers in the runtime environment under the assumption that the host
 * architecture is that of the runtime environment. These callbacks should
 * not be enabled when cross-compiling to a different architecture. The
 * callbacks also should not attempt to modify the processor state block;
 * doing so results in undefined behavior after the callback returns.
 *
 * Enabling the pre- or post-instruction callback can have a significant
 * negative impact on performance.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     callback: Callback to install, or NULL to clear any installed
 *         callback. Its first argument is the processor state block
 *         pointer passed to the translated code; its second argument is
 *         the address of the instruction about to be executed.
 */
extern void binrec_set_pre_insn_callback(binrec_t *handle,
                                         void (*callback)(void *, uint32_t));

/**
 * binrec_set_post_insn_callback: Set a callback function which will be
 * called immediately after executing each guest instruction. The state
 * of the guest processor will be the same as if translation had ended
 * immediately after the just-executed instruction. Pass NULL to disable
 * an existing callback.
 *
 * The callback receives two parameters: the processor state block pointer
 * passed to the translated code and the address of the instruction that
 * was just executed.
 *
 * Calling this function has no effect on already-translated code.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     callback: Callback to install, or NULL to clear any installed
 *         callback. Its first argument is the processor state block
 *         pointer passed to the translated code; its second argument is
 *         the address of the instruction that was just executed.
 */
extern void binrec_set_post_insn_callback(binrec_t *handle,
                                          void (*callback)(void *, uint32_t));

/**
 * binrec_enable_verify: Enable or disable verification checks on
 * translated blocks. This has no effect on the generated code, and is
 * intended for catching bugs in libbinrec.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     enable: True (nonzero) to enable verification, false (zero) to
 *         disable.
 */
extern void binrec_enable_verify(binrec_t *handle, int enable);

/*************************************************************************/
/********************** Interface: Code translation **********************/
/*************************************************************************/

/**
 * binrec_translate: Translate a block of guest machine code into native
 * machine code.
 *
 * The "address" and "limit" parameters specify the inclusive address
 * bounds from which instructions will be read for this call. Translation
 * will stop when the translator reaches a source instruction which is not
 * entirely contained in the inclusive range [address,limit], or when all
 * code paths starting from "address" have been translated (such as when
 * the end of a function in the input program is reached). A value of -1
 * for "limit" (which, as the parameter is a uint32_t, is the same as
 * passing 0xFFFFFFFF) allows translation to continue until such a natural
 * endpoint is found.
 *
 * The "state" parameter can be used to provide a processor state block
 * (of the same format as that passed to the generated code) for the
 * translation routines to reference. This is used by certain
 * optimizations (currently only the PowerPC CONSTANT_GQRS optimization)
 * to generate more efficient code under the assumption that certain
 * elements of the processor state will remain constant for every call
 * to the translated code. If NULL is passed, such optimizations will be
 * implicitly disabled for the current binrec_translate() call.
 *
 * On success, the returned block can be executed by calling it as a
 * function with the following signature:
 *     void *code(void *state, void *memory);
 * where the "state" parameter is a pointer to a processor state block
 * whose structure conforms to the structure offsets specified in the
 * setup data passed to binrec_create_handle(), and "memory" is a pointer
 * to the base of the guest memory region. The return value of the code
 * is the processor state block pointer; this may differ from the pointer
 * passed to the block if, for example, a virtual system call caused the
 * code's thread to migrate to a different virtual processor. (libbinrec
 * will never change the PSB pointer on its own, so library clients which
 * do not use multiple PSBs can safely ignore the return value.)
 *
 * The returned code pointer will have been allocated with the code_malloc
 * or code_realloc function passed in the setup structure to
 * binrec_create_handle(), or the relevant fallback function if the code
 * allocation functions were not supplied.
 *
 * Return-value arguments (code_ret and size_ret) are only modified on a
 * successful return.
 *
 * [Parameters]
 *     handle: Handle to use for translation.
 *     state: Guest processor state to reference for translation, or NULL
 *         if none.
 *     address: Address (in guest memory) of first instruction to
 *         translate.
 *     limit: Last address (in guest memory, inclusive) from which
 *         instructions may be read for this call, or -1 to translate
 *         until a natural endpoint is found.
 *     code_ret: Pointer to variable to receive a pointer to the
 *         translated machine code.
 *     size_ret: Pointer to variable to receive the length of the
 *         translated machine code, in bytes.
 * [Return value]
 *     True (nonzero) on success, false (zero) on error.
*/ extern int binrec_translate(binrec_t *handle, void *state, uint32_t address, uint32_t limit, void **code_ret, long *size_ret); /*************************************************************************/ /*************************************************************************/ #ifdef __cplusplus } // extern "C" #endif #endif // BINREC_H