/*
 * libbinrec: a recompiling translator for machine code
 * Copyright (c) 2016 Andrew Church <achurch@achurch.org>
 *
 * This software may be copied and redistributed under certain conditions;
 * see the file "COPYING" in the source code distribution for details.
 * NO WARRANTY is provided with this software.
 */

#ifndef BINREC_H
#define BINREC_H

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * General notes
 * =============
 *
 * In this header (and the library source code in general), "guest" refers
 * to the source CPU or architecture, i.e. the input to the translator,
 * and "host" or "native" refers to the target CPU or architecture, i.e.
 * the output of the translator.  (In a few function names, such as
 * binrec_native_arch(), "native" is used to refer to the runtime
 * environment when considered as a translator output architecture.)
 * A program which calls into code generated by this library is referred
 * to as a "client program".
 *
 * The numeric values of constants and the layout of structures in this
 * header are _not_ part of the public API; always use the symbolic names
 * rather than the numeric values or offsets when writing code which
 * interfaces to the library.  The library ABI (including those values and
 * offsets) will be kept consistent through revisions of a major version,
 * such that a program compiled against library version x.y (x >= 1) will
 * also run correctly using library version x.z (if z < y, features such
 * as optimization flags only defined later than version x.z may be
 * ignored, but the generated code will behave in a compatible manner).
 * However, the ABI may change between major versions, or at any time
 * before version 1.0.
 *
 *
 * Guest-specific notes
 * ====================
 *
 * The abbreviation "PSB" below refers to the processor state block, i.e.
 * the structure containing the current state of the guest processor.
 *
 * libbinrec is designed to allow a certain amount of flexibility in the
 * structure of the PSB.  The binrec_setup_t structure includes
 * "state_offsets_*" sub-structures for specifying the byte offset of
 * various members of the PSB; the client program should fill in the
 * structure appropriate to the selected guest architecture with the
 * relevant offsets within its PSB structure.
 *
 * PowerPC 32-bit architectures (BINREC_ARCH_PPC_7XX)
 * --------------------------------------------------
 *
 * Floating-point registers in the PSB are assumed to be stored as pairs
 * of 64-bit floating point ("double"-type) values, with the first value
 * of each pair holding the value of that register as a scalar or the ps0
 * slot of the register for paired-single mode, and the second value of
 * each pair holding the value of the ps1 slot for paired-single mode.
 * The translated code will take care of converting between single and
 * double precision as needed.  Note that the FPR array must be aligned to
 * a multiple of 16 bytes to avoid crashes due to misaligned accesses.
 *
 * Full floating-point exception handling bloats the translated code
 * significantly; in particular, paired-single arithmetic instructions
 * require several hundred host instructions each to correctly identify
 * all possible combinations of exception conditions, though most of
 * those instructions will be skipped for any particular execution of the
 * operation.  Client programs should enable as many of the floating-point
 * optimization flags as possible for best performance; even those marked
 * UNSAFE are in fact safe for the vast majority of real-world code.  In
 * particular, if the guest code does not read FPSCR or set any of the
 * FPSCR exception enable bits, enabling BINREC_OPT_G_PPC_NO_FPSCR_STATE
 * can improve performance of floating-point code by a factor of 10 or
 * more.  If necessary, specific optimizations can be safely disabled for
 * individual blocks of guest code; changing optimization flags for one
 * translation unit will have no effect on the behavior of any other
 * translated code.
 *
 * The FEX and VX bits in FPSCR are not written to the copy of FPSCR
 * stored in the PSB, but are instead generated when needed by a
 * floating-point instruction with Rc=1 or the mffs or mcrfs instructions.
 * (This mimics the implementation of the bits on PowerPC CPUs: they have
 * no associated physical storage, and instead are hardwired to the
 * appropriate function of other FPSCR bits.)  Client code which needs to
 * check the state of FPSCR[FEX] or FPSCR[VX] should manually compute them
 * based on the relevant exception and mask bits.
 *
 * Translated code assumes that the host's floating-point rounding mode is
 * set based on FPSCR[RN] and all host floating-point exception flags are
 * clear on entry.  The code will maintain these invariants on all
 * outbound control transfers, so a client program which does not perform
 * any floating-point operations or modify FPSCR on its own only needs to
 * set host floating-point state once, before first calling translated code.
 *
 * The FPSCR[FR] bit is not set by any floating-point instructions, though
 * it can be written as usual by instructions which directly manipulate
 * FPSCR.
 *
 * The overflow and underflow exception enable bits (OE and UE) in FPSCR
 * are ignored; floating-point operations are performed as if both
 * exceptions are masked (FPSCR[OE]=0 and FPSCR[UE]=0).  However,
 * FPSCR[FEX] will still reflect the state of the OE and UE bits actually
 * stored in FPSCR, so (for example) an Rc=1 instruction that generates an
 * overflow exception with FPSCR[OE]=1 will set cr1.FEX to 1.
 *
 * The "non-IEEE" (NI) flag in FPSCR is ignored; floating-point operations
 * will always be performed in full precision.
 *
 * libbinrec implements the implementation-specific behavior of 32-bit
 * PowerPC processors (at least the 750CL) that single-precision
 * floating-point instructions accept double-precision operands, including
 * the quirk that a double-precision frC operand to a single-precision
 * multiply or multiply-add instruction is rounded to 24 bits (this latter
 * behavior can be suppressed with the BINREC_OPT_G_PPC_FAST_FMULS
 * optimization flag).
 *
 * The conditional load/store instructions (lwarx and stwcx.) rely on
 * hardware support for their behavior.  Since such hardware support is
 * not necessarily available in the host environment, and since a correct
 * implementation requires knowledge of that environment which is not
 * available to libbinrec, these instructions are implemented using a
 * compare-and-swap heuristic: an stwcx. after a lwarx succeeds if the
 * value at the target address is unchanged from the value it had when the
 * lwarx was executed.  Translated code for lwarx and stwcx. makes use of
 * two PSB fields: reserve_flag (an 8-bit Boolean value), which records
 * whether an lwarx is pending, and reserve_state (a 32-bit integer value),
 * which stores the value loaded by the most recent lwarx.  This can result
 * in incorrect behavior if the guest code expects the store to fail due to
 * writes of an identical value or writes a new value using a regular store
 * between the conditional load and store, but this heuristic should
 * suffice for typical programs.  Note that the address of the lwarx is
 * _not_ saved; a stwcx. to a different address will still succeed if that
 * address contains the same value as was loaded by the lwarx instruction.
 * (This matches the behavior of actual PowerPC CPUs in the sense that
 * address mismatches between lwarx and stwcx. are ignored.)
 *
 * Access to the time base registers via the mftb instruction is
 * implemented by calling a host-side callback function, a pointer to
 * which should be stored in the PSB at the offset indicated by
 * state_offsets_ppc.timebase_handler.  The signature of the function is
 * "uint64_t handler(PSB *)", taking the pointer to the PSB which was
 * passed to the translated code and returning the current 64-bit time
 * base value.  If the function pointer is NULL, reads of the time base
 * registers will always return zero.
 *
 * Exceptions generated by the system call (sc) and trap (tw/twi)
 * instructions are implemented as control transfers to host-side
 * functions, pointers to which should be stored in the PSB at the offsets
 * indicated by state_offsets_ppc.sc_handler and .trap_handler
 * respectively.  The signature of the functions is:
 *      void sc_handler(PSB *, uint32_t insn);
 *      void trap_handler(PSB *);
 * taking the pointer to the PSB which was passed to the translated code
 * and, for the sc handler, the instruction word which caused the call.
 * The value of the NIA field in the PSB is set as the SRR0 register would
 * be set on a true PowerPC processor: to the address of the trap
 * instruction for trap exceptions, and to the address of the instruction
 * _following_ the sc instruction for system call exceptions (but see also
 * the BINREC_OPT_G_PPC_SC_BLR optimization flag).  The translated code
 * will return immediately to its caller when the handler returns, and the
 * call to the handler may in fact be translated as a tail call.  The
 * translated code does not check for NULL function pointers, so it will
 * crash if an exception occurs and the associated function pointer is
 * not set.
 *
 * All instruction words with the primary opcode of the sc instruction
 * (0x11) are decoded as that instruction.  This deviates from the PowerPC
 * specification, in which only the instruction 0x4400_0002 is a valid sc
 * instruction, but is done to allow the use of that instruction as (for
 * example) a callout to native code in a PowerPC system emulator in which
 * the specific function to call is encoded in the instruction word.  If
 * this behavior is not desired, the system call handler can simply treat
 * any instruction word other than 0x4400_0002 as an illegal instruction.
 *
 * libbinrec ignores the icbi instruction, except in that execution will
 * always return to the caller after icbi.  If the guest program generates
 * and executes new (guest) code on the fly, the libbinrec client should
 * check after each block of translated code returns whether the
 * instruction immediately preceding NIA is an icbi instruction, and take
 * appropriate action if so.
 *
 * If a D-form (immediate offset) load or store instruction has an offset
 * which causes the final address to wrap around the 32-bit address space,
 * such as lwz rD,16(rA) where the value of rA is 0xFFFF_FFF0 or greater,
 * the access will improperly "leak" outside the guest memory region.
 * However, accesses to the top 32k of memory using single-register (not
 * lmw/stmw) D-form instructions with rA = 0 are handled correctly.
 *
 * Some obscure hardware quirks are not emulated by the translated code;
 * see the list of expected failures in tests/guest-ppc/exec/750cl-common.i
 * for details.
 *
 *
 * Host-specific notes
 * ===================
 *
 * Intel/AMD x86 64-bit architecture (BINREC_ARCH_X86_64_*)
 * --------------------------------------------------------
 *
 * Translated code assumes support for all instruction set extensions
 * through SSE3.  (More specifically, the following CPUID feature bits are
 * assumed to be set: CMOV, SSE, SSE2, and SSE3.)  Use of later extensions
 * can be enabled by setting appropriate feature bits (BINREC_FEATURE_X86_*)
 * in the host_features field of binrec_setup_t.
 *
 * Translated code must be located at a 16-byte-aligned address for correct
 * behavior.  If the code is not correctly aligned, certain floating-point
 * operations may raise exceptions (specifically the general-protection
 * exception, "#GP" as described in Intel documentation).  libbinrec will
 * always request 16-byte alignment if a code_malloc() callback is supplied.
 *
 * Translated code maintains the host stack at 16-byte alignment, as
 * required by both System V and Windows ABIs.  If the client program calls
 * translated code with a misaligned stack, floating-point code may raise
 * exceptions due to unaligned stack accesses.
 *
 * Loads of 64-bit floating point ("double") 2-element vectors must be
 * 16-byte aligned, or an exception will be raised.  Other data types can
 * be loaded from any alignment, though values not aligned to a multiple of
 * the value size may take additional CPU cycles to load.
 *
 * Full (non-rounding) support for fused multiply-add operations is only
 * implemented for CPUs which support the FMA3 instruction set.  If the
 * BINREC_FEATURE_X86_FMA feature flag is not set in the setup structure
 * passed to binrec_create_handle(), fused multiply-add operations will be
 * translated to separate multiply and add instructions, which will cause
 * the intermediate result to be rounded; this in turn may cause
 * floating-point exceptions to be incorrectly raised in certain edge cases
 * (such as HUGE_VAL * HUGE_VAL - inf, in which the multiplication rounds
 * to infinity and the resulting subtraction of infinities triggers an
 * exception).
 *
 * The prohibition on tail calls in the Windows SEH ABI also prevents the
 * use of dynamic chaining, so calling binrec_enable_chaining() has no
 * effect when the host architecture is BINREC_ARCH_X86_64_WINDOWS_SEH.
 * If a logging callback function is provided, a warning to this effect
 * will be emitted if binrec_enable_chaining() is called with a true value
 * for the enable parameter.
 */

/*************************************************************************/
/*********************** Data types and constants ************************/
/*************************************************************************/

/*----------------------------- Basic types -----------------------------*/

/**
 * binrec_t:  Type of a translation handle.  This handle stores global
 * translation settings, such as optimization flags and functions to use
 * for memory allocation.
 *
 * The structure is only forward-declared in this header, so client code
 * can hold and pass pointers to a handle but cannot access its contents
 * directly.
 */
typedef struct binrec_t binrec_t;

/**
 * binrec_arch_t:  Enumeration of architectures and variants supported by
 * the library.  All currently supported architectures are either
 * guest-only or host-only; see the inline comments at each enumerator.
 *
 * As a general rule, libbinrec assumes that its input is a program
 * designed to run on the selected guest architecture, and therefore all
 * instructions encountered in the program will be valid instruction
 * encodings.  Consequently, this enumeration only includes coarse
 * architecture families which encompass a group of compatible processors;
 * for example, the PPC_7XX architecture covers all PowerPC CPUs through
 * the 750CL, and the input program is assumed to use only instructions
 * which are valid on the architecture it was written for.  See also the
 * note on library limitations in the README file.
 *
 * As stated in the general notes at the top of this file, always refer to
 * these enumerators by name rather than by numeric value.
 */
typedef enum binrec_arch_t {
    /* Constant used by binrec_native_arch() to indicate an unsupported
     * architecture. */
    BINREC_ARCH_INVALID = 0,

    /* PowerPC 32-bit architecture as implemented in PowerPC 7xx
     * processors, including all other instruction set extensions through
     * the PowerPC 750CL.  Also supports programs written for PowerPC 6xx
     * CPUs, with the exception of non-PowerPC instructions (such as ABS)
     * specific to the PowerPC 601. */
    BINREC_ARCH_PPC_7XX,                // Guest only.

    /* Intel/AMD x86 64-bit architecture, using the SysV ABI. */
    BINREC_ARCH_X86_64_SYSV,            // Host only.

    /* Intel/AMD x86 64-bit architecture, using the Windows ABI. */
    BINREC_ARCH_X86_64_WINDOWS,         // Host only.

    /* Variant of BINREC_ARCH_X86_64_WINDOWS which prepends unwind
     * information to the returned function.  The offset to the generated
     * code is stored as a 64-bit value at the returned code address, and
     * the unwind information is found immediately after that value.  See
     * the host-specific notes at the top of this file for caveats when
     * using this architecture variant (in particular, dynamic chaining
     * has no effect with this variant). */
    BINREC_ARCH_X86_64_WINDOWS_SEH,     // Host only.
} binrec_arch_t;

/**
 * binrec_loglevel_t:  Enumeration of log levels which can be passed to
 * the log function specified in binrec_setup_t (the "log" callback, which
 * receives one of these values as its "level" parameter).
 */
typedef enum binrec_loglevel_t {
    BINREC_LOGLEVEL_INFO,    // Informational messages.
    BINREC_LOGLEVEL_WARNING, // Messages indicating a potential problem.
    BINREC_LOGLEVEL_ERROR,   // Messages indicating failure of some operation.
} binrec_loglevel_t;

/*--------------------- Architecture feature flags ----------------------*/

/*
 * These flags indicate the presence of specific features (such as optional
 * instructions) within a particular architecture.  These are used in the
 * "host_features" field of binrec_setup_t.
 */

/**
 * BINREC_FEATURE_X86_*:  Feature flags for the x86 architecture.
 *
 * Each flag is a distinct bit, so any combination can be bitwise-ORed
 * together into binrec_setup_t.host_features.  Extensions through SSE3
 * are always assumed to be available on x86-64 hosts and therefore have
 * no flags here; see the host-specific notes at the top of this file.
 */
#define BINREC_FEATURE_X86_FMA      (1U << 0)  // FMA3 only (FMA4 unsupported).
#define BINREC_FEATURE_X86_MOVBE    (1U << 1)
#define BINREC_FEATURE_X86_LZCNT    (1U << 2)  // Also known as ABM.
#define BINREC_FEATURE_X86_BMI1     (1U << 3)
#define BINREC_FEATURE_X86_BMI2     (1U << 4)

/*-------------------------- Setup structures ---------------------------*/

/**
 * binrec_setup_ppc_t:  Structure which defines processor state block
 * offsets for PowerPC guests.  Contained in binrec_setup_t.
 *
 * Every member is a byte offset from the beginning of the PSB to the
 * field described by the accompanying comment, not the value of that
 * field itself.
 *
 * Each block of registers is assumed to be contiguous; for example, GPR 1
 * is accessed by loading a 32-bit value from gpr + 4.  All multi-byte
 * values are assumed to be stored in host endian order.
 */
typedef struct binrec_setup_ppc_t {
    /* General-purpose registers (32 * uint32_t) */
    int gpr;
    /* Floating-point registers (32 * double[2]; the first value of each
     * pair is the scalar/ps0 value and the second is the ps1 value).
     * The array must be aligned to a multiple of 16 bytes; see the
     * PowerPC notes at the top of this file. */
    int fpr;
    /* Paired-single quantization registers (8 * uint32_t) */
    int gqr;
    /* Miscellaneous registers (each uint32_t) */
    int lr;
    int ctr;
    int cr;
    int xer;
    int fpscr;
    int pvr;  // Processor Version Register (SPR 287)
    int pir;  // Processor Identification Register (SPR 1023)
    /* lwarx/stwcx. reservation flag (uint8_t); records whether an lwarx
     * is pending. */
    int reserve_flag;
    /* lwarx/stwcx. reservation state (uint32_t); holds the value loaded
     * by the most recent lwarx. */
    int reserve_state;
    /* Next instruction address (updated on return from translated code) */
    int nia;
    /* Pointer to function to handle time base reads.  Signature:
     * uint64_t timebase_handler(void *state)
     * If the stored pointer is NULL, reads of the time base registers
     * return zero. */
    int timebase_handler;
    /* Pointer to function to handle system calls (sc instruction).
     * Should return the (possibly changed) state block pointer.
     * Signature: void *sc_handler(void *state)
     * NOTE(review): the PowerPC notes at the top of this file instead
     * give the signature as "void sc_handler(PSB *, uint32_t insn)" --
     * confirm which one the translated code actually uses.  The stored
     * pointer is not checked for NULL before being called. */
    int sc_handler;
    /* Pointer to function to handle trap exceptions.  Should return the
     * (possibly changed) state block pointer.
     * Signature: void *trap_handler(void *state)
     * NOTE(review): the PowerPC notes at the top of this file instead
     * give the signature as "void trap_handler(PSB *)" -- confirm which
     * one the translated code actually uses.  The stored pointer is not
     * checked for NULL before being called. */
    int trap_handler;
    /* Pointers to lookup tables (of type uint16_t[64]) for the fres and
     * frsqrte instructions.  See the BINREC_OPT_G_PPC_NATIVE_RECIPROCAL
     * optimization flag documentation for details. */
    int fres_lut;
    int frsqrte_lut;
} binrec_setup_ppc_t;


/**
 * binrec_setup_t:  Structure which defines various parameters used by the
 * translator.  Used by binrec_create_handle().
 */
typedef struct binrec_setup_t {

    /**
     * guest, host:  BINREC_ARCH_* values indicating the architecture and
     * variant to translate from (guest) and to (host).  binrec_translate()
     * will fail if the library cannot perform the requested translation.
     */
    binrec_arch_t guest;
    binrec_arch_t host;

    /**
     * host_features:  Bitwise-OR of feature flags (BINREC_FEATURE_*) for
     * the selected host architecture, indicating which features should be
     * assumed to be present when generating host code.
     */
    unsigned int host_features;

    /**
     * guest_memory_base:  Pointer to a region of host memory reserved as
     * the address space of the guest code.  binrec_translate() calls will
     * read source machine instructions and constant data from this region.
     * Memory accesses within the translated code itself will use the
     * address passed as a parameter to the code.
     */
    void *guest_memory_base;

    /**
     * state_offsets_*:  Offsets from the beginning of the processor state
     * block (as passed to the generated native code) to the various guest
     * registers and other processor state.  Use the structure appropriate
     * to the selected guest architecture; see the definition of each
     * structure for details.
     *
     * The union is anonymous, so its members are accessed directly as
     * members of binrec_setup_t (for example, setup.state_offsets_ppc).
     */
    union {
        binrec_setup_ppc_t state_offsets_ppc;
    };

    /**
     * state_offset_chain_lookup:  PSB offset to a pointer to a function
     * which looks up translated blocks for chaining (see
     * binrec_enable_chaining()).  Signature:
     * void *chain_lookup(void *state, uint32_t target_address)
     */
    int state_offset_chain_lookup;

    /**
     * state_offset_branch_exit_flag:  PSB offset to a 32-bit value to
     * check at intra-unit branches (see binrec_enable_branch_exit_test()).
     */
    int state_offset_branch_exit_flag;

    /**
     * userdata:  Opaque pointer which is passed to all callback functions
     * below.
     */
    void *userdata;

    /**
     * malloc:  Pointer to a function which allocates memory, like malloc().
     * If NULL, the system's malloc() will be used.
     *
     * Like standard malloc(), this function may return either NULL or a
     * pointer to a zero-size memory block if passed a size of zero.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     size: Size of block to allocate, in bytes.
     * [Return value]
     *     Pointer to allocated memory, or NULL on error.  If size == 0,
     *     either NULL or a pointer to a zero-size block.
     */
    void *(*malloc)(void *userdata, size_t size);

    /**
     * realloc:  Pointer to a function which resizes a block of allocated
     * memory, like realloc().  If NULL, the system's realloc() will be used.
     *
     * Like standard realloc(), this function may return either NULL or a
     * pointer to a zero-size memory block if passed a size of zero.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     ptr: Block to resize, or NULL to allocate a new block.
     *     size: New size of block, in bytes, or 0 to free the block.
     * [Return value]
     *     Pointer to allocated memory, or NULL on error.  If size == 0,
     *     either NULL or a pointer to a zero-size block.
     */
    void *(*realloc)(void *userdata, void *ptr, size_t size);

    /**
     * free:  Pointer to a function which frees a block of allocated
     * memory, like free().  If NULL, the system's free() will be used.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     ptr: Block to free (may be NULL).
     */
    void (*free)(void *userdata, void *ptr);

    /**
     * code_malloc:  Pointer to a function which allocates a block of
     * memory for output machine code.  If NULL, the malloc() callback (or
     * the system's malloc(), if that callback is also NULL) will be used
     * and no alignment will be performed.
     *
     * Note that some hosts place alignment requirements on translated
     * code (see the host-specific notes at the top of this file), so a
     * client relying on the unaligned fallback should make sure its
     * allocator returns suitably aligned blocks.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     size: Size of block to allocate, in bytes (guaranteed to be
     *         nonzero).
     *     alignment: Desired address alignment, in bytes (guaranteed to
     *         be a power of 2).
     * [Return value]
     *     Pointer to allocated memory, or NULL on error.
     */
    void *(*code_malloc)(void *userdata, size_t size, size_t alignment);

    /**
     * code_realloc:  Pointer to a function which resizes a block of memory
     * allocated with the code_malloc() callback.  If NULL, the realloc()
     * callback (or the system's realloc(), if that callback is also NULL)
     * will be used.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     ptr: Block to resize (guaranteed to be non-NULL).
     *     old_size: Current size of block, in bytes.
     *     new_size: New size of block, in bytes (guaranteed to be nonzero).
     *     alignment: Required address alignment, in bytes (guaranteed to
     *         be equal to the value used for initial allocation).
     * [Return value]
     *     Pointer to allocated memory, or NULL on error.
     */
    void *(*code_realloc)(void *userdata, void *ptr, size_t old_size,
                          size_t new_size, size_t alignment);

    /**
     * code_free:  Pointer to a function which frees a block of memory
     * allocated with the code_malloc() callback.  If NULL, the free()
     * callback (or the system's free(), if that callback is also NULL)
     * will be used.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     ptr: Block to free (may be NULL).
     */
    void (*code_free)(void *userdata, void *ptr);

    /**
     * log:  Pointer to a function to log messages from the library.
     * If NULL, no logging will be performed.
     *
     * [Parameters]
     *     userdata: User data pointer from setup structure.
     *     level: Log level (BINREC_LOGLEVEL_*).
     *     message: Log message.
     */
    void (*log)(void *userdata, binrec_loglevel_t level, const char *message);

} binrec_setup_t;

/*--------------------- General optimization flags ----------------------*/

/*
 * Optimizations performed by the library can generally be classified into
 * three types:
 *
 * - Behavior-safe: optimizations which purely affect the size or speed of
 *   the generated code and have no effect on behavior.  Optimizations
 *   such as constant folding and deconditioning fall into this category.
 *   Optimizations can be assumed to fall under this category where not
 *   otherwise documented.
 *
 * - Specification-safe: optimizations which may change the behavior of
 *   the generated code, but only within limits prescribed by the relevant
 *   specification.  For example, the NATIVE_IEEE_UNDERFLOW optimization
 *   may change the results of certain floating-point operations relative
 *   to the results returned by guest code running on its native hardware,
 *   but the IEEE floating-point specification allows either of two
 *   behaviors, so with respect to that specification, the optimized code
 *   is no less correct than the original.  As long as the guest code was
 *   written to follow the specifications rather than the precise behavior
 *   of the guest hardware, it will still behave correctly under these
 *   optimizations.
 *
 * - Unsafe: optimizations which can materially impact the behavior of the
 *   generated code, such as stack frame optimization.  These optimizations
 *   can benefit code which rigorously adheres to the relevant assumptions,
 *   such as code produced by a high-level language compiler, but they can
 *   cause nonconformant code to misbehave or even crash.  Documentation
 *   for an unsafe optimization will clearly indicate that fact.
 */

/**
 * BINREC_OPT_BASIC:  Enable basic optimization of translated code.  This
 * includes the following transformations:
 *
 * - Branches to other (unconditional or same-conditioned) branch
 *   instructions will be threaded through to the final branch destination.
 *
 * - Unreachable basic blocks will be eliminated from the code stream.
 *
 * - Branches to the next instruction will be eliminated.
 */
#define BINREC_OPT_BASIC  (1<<0)

/**
 * BINREC_OPT_DECONDITION:  Convert conditional branches and moves with
 * constant conditions to unconditional instructions or NOPs.  This is
 * most useful in conjunction with constant folding (see
 * BINREC_OPT_FOLD_CONSTANTS).
 */
#define BINREC_OPT_DECONDITION  (1<<1)

/**
 * BINREC_OPT_DEEP_DATA_FLOW:  Perform extended data flow analysis on
 * values associated with guest architecture registers to find dead stores.
 * This optimization by itself only finds dead stores; enable BINREC_OPT_DSE
 * to remove them from the code stream.
 */
#define BINREC_OPT_DEEP_DATA_FLOW  (1<<2)

/**
 * BINREC_OPT_DSE:  Perform dead store elimination (DSE) on the translated
 * code, removing instructions whose outputs are not used.
 *
 * Instructions with side effects, such as floating-point operations (which
 * could raise exceptions) or atomic read-modify-write instructions, are
 * not eliminated.  However, floating-point instructions will be eliminated
 * if the BINREC_OPT_DSE_FP optimization is also enabled.
 */
#define BINREC_OPT_DSE  (1<<3)

/**
 * BINREC_OPT_DSE_FP:  Allow elimination of floating-point operations when
 * performing dead store elimination.
 *
 * This optimization is UNSAFE: if an eliminated operation would have
 * raised a floating-point exception which the guest code checks for, the
 * translated code will not behave correctly.
 */
#define BINREC_OPT_DSE_FP  (1<<4)

/**
 * BINREC_OPT_FOLD_CONSTANTS:  Look for computations whose operands are all
 * constant and load operations which load from a constant address within
 * memory marked read-only (see binrec_add_readonly_region()), and convert
 * them to load-immediate operations.  The computed values are themselves
 * treated as constant, so constantness can be propagated through multiple
 * instructions.  Intermediate values in a computation sequence which end
 * up being unused due to constant folding, as well as any other
 * instructions whose outputs are not used elsewhere, are removed from the
 * code stream if BINREC_OPT_DSE is also enabled.
 *
 * Floating-point operations will not be folded unless the
 * BINREC_OPT_FOLD_FP_CONSTANTS optimization is also enabled.
 */
#define BINREC_OPT_FOLD_CONSTANTS  (1<<5)

/**
 * BINREC_OPT_FOLD_FP_CONSTANTS:  Fold floating-point as well as integer
 * constants, performing the floating-point operations in the runtime
 * environment.  Any floating-point exceptions generated by the operation
 * are discarded.
 *
 * This flag is ignored if BINREC_OPT_FOLD_CONSTANTS is not also enabled.
 *
 * This optimization is UNSAFE: if the floating-point behavior of the
 * runtime environment differs materially from that of the guest
 * architecture (such as by not complying with IEEE 754) or if a folded
 * operation generates a floating-point exception and the guest code
 * expects to detect that exception, the translated code will not behave
 * correctly.
 */
#define BINREC_OPT_FOLD_FP_CONSTANTS  (1<<6)

/**
 * BINREC_OPT_FOLD_VECTORS:  Attempt to eliminate vector registers whose
 * values are only used as scalars.  For example, if two scalar values are
 * merged into a vector but are immediately extracted to scalars again,
 * the vector register is not needed and the initial scalar values can be
 * forwarded to the later computations.
 *
 * This optimization only finds removable vector registers and forwards
 * the associated scalar values; the BINREC_OPT_DSE optimization flag is
 * required to eliminate the vectors from the code stream.
 */
#define BINREC_OPT_FOLD_VECTORS  (1<<7)

/**
 * BINREC_OPT_NATIVE_IEEE_NAN:  Use the host's rules for NaN results of
 * floating-point operations, even when those rules differ from the guest
 * architecture, and allow the host to reorder operands to a floating-point
 * operation even if doing so would change the NaN returned for an invalid
 * operation.
 *
 * The IEEE specification allows differing behavior between implementations
 * in the following cases:
 *
 * - If more than one operand to an operation is a NaN, the returned NaN
 *   may be any of those values.
 *
 * - If a NaN is generated due to an invalid operation, its bit pattern
 *   may be any quiet NaN bit pattern.
 *
 * This optimization allows the translator to translate floating-point
 * operations directly to their host equivalents without manually checking
 * for NaNs, which can require several additional host instructions per
 * guest instruction.
 *
 * This optimization is specification-safe: as long as guest code follows
 * the IEEE 754 specifications, it will behave correctly under this
 * optimization.
 */
#define BINREC_OPT_NATIVE_IEEE_NAN  (1<<8)

/**
 * BINREC_OPT_NATIVE_IEEE_UNDERFLOW:  Use the host's definition of
 * underflow for IEEE floating-point arithmetic, even when that differs
 * from the guest's definition.
 *
 * When translating between architectures which use different definitions
 * of underflow (IEEE allows two different behaviors: tiny before rounding
 * and tiny after rounding), this optimization allows floating-point
 * operations to be translated directly to their equivalent host
 * instructions, at the cost of different exception states for operations
 * with a result which is treated as underflowing on one architecture and
 * not the other.  If this optimization is disabled, floating-point
 * operations must check explicitly for underflow, which can require
 * several additional host instructions per guest instruction.
 *
 * If the host and guest use the same "tiny" rules, floating-point
 * operations can always be translated directly to native instructions
 * (at least with regard to tininess), and this flag has no effect on
 * translation.
 *
 * Enabling the BINREC_OPT_DSE and BINREC_OPT_DSE_FP optimizations will
 * typically have the effect of enabling this optimization as well.
 *
 * This optimization is specification-safe: as long as guest code follows
 * the IEEE 754 specifications, it will behave correctly under this
 * optimization.
 */
#define BINREC_OPT_NATIVE_IEEE_UNDERFLOW  (1<<9)

/*----------- Guest-architecture-specific optimization flags ------------*/

/**
 * BINREC_OPT_G_PPC_ASSUME_NO_SNAN:  Do not attempt to preserve the
 * signaling/quiet state of floating-point NaN (not-a-number) values.
 *
 * The single-precision lfs and stfs instructions preserve the state of the
 * "quiet" bit in a floating-point NaN (the high bit of the mantissa) when
 * converting it to or from double precision, but the host instructions
 * used to implement such a format conversion may treat it as an arithmetic
 * operation which quiets any incoming signaling NaN, and detecting that
 * such a change took place can require many more host instructions than
 * the conversion itself.  If this optimization is enabled, the translator
 * will use the fastest possible method to convert between single and
 * double precision, ignoring the possibility of signaling NaNs.  If a
 * signaling NaN is in fact loaded, its quiet bit will be set; this may
 * cause a later floating-point instruction to fail to raise an expected
 * invalid-operation exception, or have other unpredictable effects if the
 * bitwise contents of the value are used in non-floating-point operations.
 *
 * This optimization is UNSAFE: code which relies on being able to load a
 * signaling NaN will not behave correctly.  But see the
 * BINREC_OPT_G_PPC_FORWARD_LOADS optimization for a way to avoid the
 * impact of this optimization on code which loads a signaling NaN (or
 * non-floating-point data which looks like one) and immediately stores it
 * back to memory.
 */
#define BINREC_OPT_G_PPC_ASSUME_NO_SNAN  (1<<0)

/**
 * BINREC_OPT_G_PPC_CONSTANT_GQRS:  Assume that the values of the GQRs
 * (graphics quantization registers, used with paired-single load and
 * store instructions) are constant with respect to the entry point of a
 * translation unit.
 *
 * Ordinarily, translated code for a psq_l* or psq_st* instruction must
 * read the referenced GQR at runtime and choose the appropriate load or
 * store operation, which both adds several instructions' worth of latency
 * and significantly increases code size.  If this optimization is enabled,
 * the translator will instead read (at translation time) the value of each
 * GQR referenced by guest code and translate paired-single load and store
 * instructions based on those values.
 *
 * If this optimization is enabled, an mtspr instruction which writes to a
 * GQR will cause the translated code to immediately return to its caller.
 * This ensures that any following load or store instructions will be
 * translated using the value written by the mtspr instruction.
 *
 * This optimization is UNSAFE: if the value of a GQR is not constant with
 * respect to any paired-single load or store instruction, the translated
 * code will not behave correctly.
 */
#define BINREC_OPT_G_PPC_CONSTANT_GQRS  (1<<1)

/**
 * BINREC_OPT_G_PPC_DETECT_FCFI_EMUL:  Detect code sequences which convert
 * an integer to a floating-point number using bit manipulation and local
 * stack frame storage.
 *
 * The 32-bit PowerPC architecture does not include the fcfi (floating
 * convert-from-integer) instructions of the 64-bit architecture, so
 * programs must implement this conversion in software.  One idiom used
 * with 32-bit integer input is to create a 64-bit floating-point value on
 * the stack in which the lowest bit of the mantissa has the value 1.0
 * (the bit pattern of this value is 0x43300000_00000000), write the
 * integer to be converted into the low 32 bits of the value, then load
 * the value as double-precision floating point and subtract the base
 * value.  (For signed integer input, the high bit of the integer and the
 * corresponding bit in the base floating-point value are inverted.)
 *
 * If this optimization is enabled, the translator will check for a
 * sequence of instructions matching the above pattern and translate it to
 * a direct int-to-float conversion instruction.  Currently, the input
 * instruction sequence must match the following:
 *     lis rX,0x4330
 *     stw rX,N(r1)        // N >= 8
 *     stw rINPUT,N+4(r1)  // signed: xoris rY,rINPUT,0x8000; stw rY,N+4(r1)
 *     lfd fP,N(r1)
 *     lfd fQ,const_43300000_00000000  // signed: const_43300000_80000000
 *     fsub fOUTPUT,fP,fQ
 *     [optional: frsp fOUTPUT,fOUTPUT]
 * where "const_..." is a memory reference to a constant address registered
 * with libbinrec as read-only memory containing the listed 64-bit constant.
 * The conversion operation will be detected even if the instructions are
 * reordered (preserving dependencies), but detection may fail if other
 * operations (such as a register move) are interspersed in the dependency
 * chain.
 *
 * Intermediate operations in the sequence are still translated, so that
 * other instructions which use their values still behave correctly.
 * (However, the load of fP or fQ will be omitted if fOUTPUT refers to the
 * same register, and the writes to rX and rY may subsequently be deleted
 * by dead store elimination.)
 *
 * This optimization is UNSAFE: if the floating-point value generated on
 * the stack is overwritten by an indexed, multi-word, or non-r1-relative
 * store, or if another agent (such as a separate thread) modifies the
 * function's stack frame during the optimized instruction sequence, the
 * translated code will not behave correctly.  (However, these situations
 * are not expected to arise in real-world code.)
 */
#define BINREC_OPT_G_PPC_DETECT_FCFI_EMUL  (1<<2)

/**
 * BINREC_OPT_G_PPC_FAST_FCTIW:  Leave the high word of the destination
 * FPR of an fctiw or fctiwz instruction unspecified.
 *
 * The PowerPC architecture specification states that the high 32 bits of
 * the destination register of an fctiw or fctiwz instruction are
 * unspecified.  However, by default libbinrec will follow the actual
 * implementation used on the 750CL processor, which sets the high word to
 * 0xFFF8_0000 (thus turning the entire 64-bit register into a NaN when
 * interpreted as a double-precision value).
 *
 * Enabling this optimization will allow the translator to use whatever
 * method is fastest on the host for converting to integer.  This will not
 * affect the value stored in the low 32 bits of the register, but the
 * high 32 bits may be left unchanged or changed to some other value.
 *
 * This optimization is specification-safe: as long as guest code follows
 * the PowerPC architecture specification, it will behave correctly under
 * this optimization.
 */
#define BINREC_OPT_G_PPC_FAST_FCTIW  (1<<3)

/**
 * BINREC_OPT_G_PPC_FAST_FMADDS:  Use simple arithmetic conversion when
 * rounding a double-precision multiply-add result to single precision.
 *
 * When using double-precision inputs to a single-precision fused
 * multiply-add instruction like fmadds or ps_madd, the result must be
 * rounded to single precision before storing it in the output register.
 * Using simple arithmetic conversion for this rounding can change the
 * result in certain cases, specifically under the following conditions:
 * the rounding mode is set to round-to-nearest, the product is exactly
 * halfway between two single-precision values (the low 29 bits of the
 * double-precision mantissa are 0x1000_0000), and the addend is tiny with
 * respect to the product.  In this case, the infinitely precise result is
 * not exactly between two single-precision values, so it should round to
 * the nearer one; but if the output of the double-precision operation is
 * used for rounding, the addend will have been already rounded off, so
 * the rounding input will be treated as a tie and may round in the wrong
 * direction.
 *
 * Enabling this optimization allows the translator to ignore the
 * possibility of the inaccuracy described above and round double-precision
 * results with a simple arithmetic operation, which is significantly
 * faster than checking for and correcting rounding error.
 *
 * This optimization is UNSAFE: if enabled, results of single-precision
 * multiply-add operations with certain operands will differ in the lowest
 * bit from the correct values.
 */
#define BINREC_OPT_G_PPC_FAST_FMADDS  (1<<4)

/**
 * BINREC_OPT_G_PPC_FAST_FMULS:  Do not attempt to round the second
 * multiplicand (frC) to a single-precision multiply or multiply-add
 * instruction.
 *
 * According to the PowerPC architecture specification, the result of
 * using non-single-precision values with single-precision instructions is
 * undefined.  Real 32-bit PowerPC chips (at least the 750CL) just perform
 * the operation in double precision and round the result to single
 * precision -- with one exception: the second operand (frC) to a multiply
 * operation has its mantissa rounded to 24 bits before the multiplication
 * is performed.  libbinrec implements this rounding on an frC operand
 * which is not known to already be in single-precision format; since the
 * library does not perform the deep analysis required to carry knowledge
 * of data format across branches (such as in loops), rounding may have to
 * be performed frequently even in guest code which properly converts all
 * values to single precision before using single-precision instructions.
 * This rounding is fairly expensive because of the various edge cases
 * that need to be handled.
 *
 * If this optimization is enabled, libbinrec will assume that the frC
 * operand to an fmuls, fmadds, fmsubs, fnmadds, or fnmsubs instruction is
 * representable in single precision even if it is not known to be in
 * single-precision format, and will skip the rounding step.
 *
 * In the more general case of double-precision operands used with
 * single-precision instructions, libbinrec always performs the operation
 * in double precision, since it will generally be no slower (and often
 * faster) to round the result after the operation than to round all input
 * operands beforehand, and as long as the input values are representable
 * in single precision, rounding a double-precision result gives the same
 * output as performing the operation in single precision.
 *
 * This optimization is specification-safe: as long as guest code follows
 * the PowerPC architecture specification, it will behave correctly under
 * this optimization.
 */
#define BINREC_OPT_G_PPC_FAST_FMULS  (1<<5)

/**
 * BINREC_OPT_G_PPC_FAST_STFS:  Use mathematical rather than bitwise
 * conversion when storing double-precision values as single precision.
 *
 * The stfs instruction (as well as stfsx, stfsu, and stfsux) is defined
 * to have a specific behavior with respect to double-precision values,
 * which has the effect of converting the value to single precision in
 * round-toward-zero mode if the value is in the range of values
 * representable in single precision but does not treat overflow or
 * underflow conditions specially -- thus, for example, storing an FPR
 * containing the double-precision value 2^256 with stfs stores the bit
 * pattern 0x3F80_0000, equal to 1.0 in single precision.  Properly
 * implementing this behavior is significantly more expensive than simply
 * converting the value to single precision as an arithmetic operation
 * and storing that result, which can make this behavior a bottleneck for
 * programs which process large amounts of single-precision data.
 *
 * Enabling this optimization causes the translator to use ordinary
 * arithmetic conversion when storing double-precision values with the
 * stfs group of instructions.  This deviates from the PowerPC
 * specification, but (particularly if used with the ASSUME_NO_SNAN
 * optimization) allows single-precision stores to be implemented with
 * many fewer host instructions.
 *
 * This optimization is UNSAFE: if the guest code relies on the precise
 * conversion behavior of stfs-group instructions, the translated code
 * will not behave correctly.
 */
#define BINREC_OPT_G_PPC_FAST_STFS  (1<<6)

/**
 * BINREC_OPT_G_PPC_FNMADD_ZERO_SIGN:  Do not attempt to return the correct
 * sign on the result of an fnmadd[s] or fnmsub[s] instruction.
 *
 * The PowerPC fnmadd and fnmsub instructions calculate -(frA*frC+frB) and
 * -(frA*frC-frB), respectively.  These differ from the fused multiply-add
 * instructions of some other architectures, such as x86, in which the
 * operations of the same names only negate the product, such that fnmadd
 * calculates -(frA*frC)+frB.  While this difference can be mostly covered
 * by translating to the opposite operation (PowerPC fnmadd -> x86 fnmsub),
 * this gives the wrong sign on some zero results, so a correct translation
 * requires using a positive fmadd/fmsub and manually negating the result.
 *
 * Enabling this optimization allows the translator to translate these
 * instructions to single fnmsub/fnmadd operations, at the cost of
 * returning zero with an incorrect sign in the cases mentioned above.
 *
 * This optimization is UNSAFE for obvious reasons, though it is believed
 * that most real-life PowerPC code does not differentiate between positive
 * and negative zero.
 */
#define BINREC_OPT_G_PPC_FNMADD_ZERO_SIGN  (1<<7)

/**
 * BINREC_OPT_G_PPC_FORWARD_LOADS:  Save the raw value read from memory for
 * each load instruction, and if the same value is stored back to memory,
 * store the raw value instead of reading back the register.
 *
 * On little-endian hosts, this avoids the need to byte-swap values an
 * extra time when storing them, as well as a copy between integer and
 * floating-point registers for floating-point values.  This optimization
 * can also reduce the impact of the ASSUME_NO_SNAN optimization for guest
 * code which uses lfs/stfs to copy non-floating-point data (cases have
 * been observed which, for example, copy byte-reversed floating-point data
 * using lfs/stfs before fixing the byte order).  On the flip side, this
 * optimization may increase register pressure for loads which are in fact
 * forwarded, and this can in turn negate the benefits of the optimization
 * due to register spills.
 *
 * If the optimizations BINREC_OPT_G_PPC_PS_STORE_DENORMALS and
 * BINREC_OPT_G_PPC_CONSTANT_GQRS are active, this optimization also
 * allows forwarding of floating-point paired-single loads.
 *
 * Note that BINREC_OPT_DSE should always be used with this optimization
 * so that speculative loads for forwarding are eliminated if they are not
 * forwarded.
 */
#define BINREC_OPT_G_PPC_FORWARD_LOADS  (1<<8)

/**
 * BINREC_OPT_G_PPC_IGNORE_FPSCR_VXFOO:  Do not set FPSCR exception bits
 * for specific invalid exception types (the "VXFOO" bits).
 *
 * The PowerPC architecture includes several FPSCR bits which indicate
 * specific types of floating-point invalid operation exceptions, such as
 * subtraction of infinities (VXISI) or use of a signaling NaN (VXSNAN).
 * Detecting these cases on a host architecture which does not expose such
 * information requires additional manual checks on the operands to each
 * floating-point operation and can have a severe impact on performance.
 * Enabling this optimization allows the translator to skip these checks,
 * treating any invalid-operation exception as VXSNAN whether or not any
 * operand was in fact a signaling NaN.  (Other VXFOO exception bits are
 * still set in cases where doing so does not affect performance.)
 *
 * Instructions which directly manipulate FPSCR, such as mtfsf, are not
 * affected by this optimization and continue to behave normally.
 *
 * This optimization is UNSAFE for obvious reasons, though it is believed
 * that most real-life PowerPC code does not make use of the VXFOO bits.
 *
 * This optimization has no effect if BINREC_OPT_G_PPC_NO_FPSCR_STATE is
 * enabled.
 */
#define BINREC_OPT_G_PPC_IGNORE_FPSCR_VXFOO  (1<<9)

/**
 * BINREC_OPT_G_PPC_NATIVE_RECIPROCAL:  Translate guest PowerPC
 * reciprocal-estimate instructions (fres and frsqrte) directly to their
 * host equivalents, maintaining compliance with the PowerPC architecture
 * specification but disregarding the precise behavior of the guest
 * architecture.
 *
 * The PowerPC architecture specifies bounds within which the results of
 * these instructions will fall relative to the true (mathematical) result.
 * Programs written to be compliant with the architecture will work
 * correctly regardless of the exact output of the instruction, though the
 * precise behavior of the program (for example, the low-end bits of the
 * result) may change.  This flag allows the translator to choose faster
 * host instructions which may not give exactly the same result but still
 * satisfy the PowerPC architecture constraints.
 *
 * If this optimization is disabled, the translator will attempt to match
 * the precise behavior of the guest architecture by using lookup tables
 * referenced by pointers in the processor state block (see the fres_lut
 * and frsqrte_lut fields in binrec_setup_ppc_t).  The translated code will
 * crash if it executes an fres or frsqrte instruction, this optimization
 * is not enabled, and the appropriate pointer in the state block is not set.
 *
 * The tables for the 750CL processor are as follows:
 *
 * fres: 0x3FFC,0x3E1, 0x3C1C,0x3A7, 0x3875,0x371, 0x3504,0x340,
 *       0x31C4,0x313, 0x2EB1,0x2EA, 0x2BC8,0x2C4, 0x2904,0x2A0,
 *       0x2664,0x27F, 0x23E5,0x261, 0x2184,0x245, 0x1F40,0x22A,
 *       0x1D16,0x212, 0x1B04,0x1FB, 0x190A,0x1E5, 0x1725,0x1D1,
 *       0x1554,0x1BE, 0x1396,0x1AC, 0x11EB,0x19B, 0x104F,0x18B,
 *       0x0EC4,0x17C, 0x0D48,0x16E, 0x0BD7,0x15B, 0x0A7C,0x15B,
 *       0x0922,0x143, 0x07DF,0x143, 0x069C,0x12D, 0x056F,0x12D,
 *       0x0442,0x11A, 0x0328,0x11A, 0x020E,0x108, 0x0106,0x106
 *
 * frsqrte: 0x7FF4,0x7A4, 0x7852,0x700, 0x7154,0x670, 0x6AE4,0x5F2,
 *          0x64F2,0x584, 0x5F6E,0x524, 0x5A4C,0x4CC, 0x5580,0x47E,
 *          0x5102,0x43A, 0x4CCA,0x3FA, 0x48D0,0x3C2, 0x450E,0x38E,
 *          0x4182,0x35E, 0x3E24,0x332, 0x3AF2,0x30A, 0x37E8,0x2E6,
 *          0x34FD,0x568, 0x2F97,0x4F3, 0x2AA5,0x48D, 0x2618,0x435,
 *          0x21E4,0x3E7, 0x1DFE,0x3A2, 0x1A5C,0x365, 0x16F8,0x32E,
 *          0x13CA,0x2FC, 0x10CE,0x2D0, 0x0DFE,0x2A8, 0x0B57,0x283,
 *          0x08D4,0x261, 0x0673,0x243, 0x0431,0x226, 0x020B,0x20B
 *
 * Depending on the performance details of the host CPU and the types of
 * input values used by the guest code, enabling this optimization may
 * actually result in slower code (particularly for frsqrte), though it
 * will always reduce code size.
 *
 * This optimization is specification-safe: as long as guest code follows
 * the PowerPC architecture specification, it will behave correctly under
 * this optimization.
 */
#define BINREC_OPT_G_PPC_NATIVE_RECIPROCAL  (1<<10)

/**
 * BINREC_OPT_G_PPC_NO_FPSCR_STATE:  Do not write any state bits (exception
 * bits, FR, FI, or FPRF) in FPSCR based on floating-point operation results.
 *
 * Enabling this optimization causes the translated code to ignore all
 * host FPU exception conditions and skip setting FPRF to reflect the
 * value type.  For guest code which does not enable floating-point
 * exceptions or check the FPSCR status bits, this results in significantly
 * faster and smaller translated code with no effect on program behavior.
 *
 * The control mode bits are honored as usual, though only FPSCR[RN] has
 * any effect on program behavior in this case; the exception enable bits
 * are meaningless since exceptions are not detected, and nonzero FPSCR[NI]
 * is not currently supported by the translator.
 *
 * Instructions which directly manipulate FPSCR, such as mtfsf, are not
 * affected by this optimization and continue to behave normally.  If any
 * of the FR/FI/FPRF bits are set by such an instruction, they will remain
 * set even after floating-point instructions which would normally
 * overwrite them.
 *
 * Floating-point instructions with the Rc bit set will copy the high 4
 * bits of FPSCR to the cr1 field of CR as usual, though the bit values
 * will naturally not reflect the result of any floating-point operations.
 * The library will log a warning (once per translation unit) if such an
 * instruction is encountered when this optimization is enabled.
 *
 * This optimization implicitly enables BINREC_OPT_NATIVE_IEEE_UNDERFLOW.
 *
 * This optimization is UNSAFE: code which relies on any of the FPSCR
 * state bits will behave incorrectly if this optimization is enabled.
 */
#define BINREC_OPT_G_PPC_NO_FPSCR_STATE  (1<<11)

/**
 * BINREC_OPT_G_PPC_PAIRED_LWARX_STWCX:  Optimize the sequence of lwarx
 * followed by stwcx. if there are no intervening branches or branch
 * targets.
 *
 * This optimization allows the translator to forward data from a paired
 * lwarx to its associated stwcx., avoiding unnecessary accesses to the
 * processor state block.
 *
 * Additionally, if the data operand to stwcx. is the same as the value
 * loaded with lwarx, the store itself is omitted and the code behaves as
 * if the store succeeded.  (This case appears to arise from compilers
 * which always generate lwarx when reading from an atomic variable; in
 * such a case, a dummy stwcx. is required to clear the reservation state.)
 * Since conditionally storing back the loaded value will never cause a
 * change to memory, and since we translate lwarx/stwcx. using a compare-
 * and-exchange model rather than precisely emulating the reserve-and-snoop
 * behavior of PowerPC hardware, this transformation is safe.
 */
#define BINREC_OPT_G_PPC_PAIRED_LWARX_STWCX  (1<<12)

/**
 * BINREC_OPT_G_PPC_PS_STORE_DENORMALS:  Do not flush denormals to zero
 * when storing floating-point values with the paired-single store
 * instructions (psq_st[u][x]).
 *
 * Normally, the paired-single store instructions flush denormal values
 * to zero before writing them to memory.  Enabling this optimization
 * allows the translator to skip the expensive denormal check and write
 * the values straight to memory.
 *
 * Even when this optimization is enabled, psq_st instructions will flush
 * denormals to zero if the value is read in double precision (this depends
 * on the internal translator state at the particular instruction) and the
 * BINREC_OPT_G_PPC_FAST_STFS optimization is not enabled, since in that
 * case, flushing to zero is faster than producing a correct denormal value.
 *
 * This optimization is UNSAFE: code which relies on denormals being
 * flushed to zero by paired-single store instructions will behave
 * incorrectly if this optimization is enabled.
 */
#define BINREC_OPT_G_PPC_PS_STORE_DENORMALS  (1<<13)

/**
 * BINREC_OPT_G_PPC_SC_BLR:  Optimize an instruction sequence of "sc; blr"
 * by setting NIA to the value of the LR register rather than the address
 * of the instruction following the "sc" when calling the sc handler.
 * This avoids the need to translate and call a block containing a single
 * blr after returning from the sc handler.
 *
 * This optimization is UNSAFE: the sc handler cannot recover the original
 * address of the instruction which triggered the exception when this
 * optimization is triggered.
 */
#define BINREC_OPT_G_PPC_SC_BLR  (1<<14)

/**
 * BINREC_OPT_G_PPC_SINGLE_PREC_INPUTS:  Assume that the inputs to a
 * single-precision floating-point instruction are in single precision.
 *
 * The PowerPC specification requires inputs to a single-precision
 * floating-point instruction to be in single precision (either the result
 * of lfs or another single-precision load instruction, or the output of
 * a previous frsp or single-precision floating-point operation).  However,
 * actual 32-bit PowerPC CPUs ignore the precision of floating-point
 * mathematical instructions and always perform the operation using the
 * full precision of the inputs, and by default libbinrec emulates this
 * behavior to accommodate code which takes advantage of this quirk (in
 * violation of the PowerPC spec).
 *
 * If this optimization is enabled, libbinrec will assume that inputs to a
 * single-precision instruction are proper single-precision values, and
 * will perform the operation in single instead of double precision when
 * beneficial.  This improves performance when one input to such an
 * instruction is the output of a previous single-precision instruction in
 * the same basic block and the other input is an FPR which has not yet
 * been accessed in the block; in that case, this optimization allows the
 * translator to convert the second input to single precision and perform
 * the operation in single precision, instead of converting the first
 * operand to double precision, performing the operation in double
 * precision, then converting the result back to single precision.
 *
 * This optimization also causes the ps_merge{00,01,10,11} instructions to
 * perform a simple 64-to-32 bit arithmetic conversion when loading input
 * values from the processor state block, rather than emulating the 750CL
 * quirk of truncating an excess-precision value loaded into the PS1 slot
 * (which requires a pair of host FPU rounding mode changes).
 *
 * This optimization is specification-safe: as long as guest code follows
 * the PowerPC architecture specification, it will behave correctly under
 * this optimization.
 */
#define BINREC_OPT_G_PPC_SINGLE_PREC_INPUTS  (1<<15)

/**
 * BINREC_OPT_G_PPC_TRIM_CR_STORES:  Analyze the data flow through each
 * CR bit and eliminate stores which are not visible outside the
 * translated code.
 *
 * If the branch exit test is enabled, then when translated code returns
 * to its caller due to the branch test, this optimization may leave stale
 * values in CR bits which would have been overwritten by later code in
 * the same translation unit.  This will not result in a change in behavior
 * as long as execution is eventually restarted with the same processor
 * state at the branch target address, but if the client program relies on
 * (or transfers execution to guest code which relies on) the value of CR,
 * it may not behave as expected.
 *
 * This optimization has no effect unless BINREC_OPT_G_PPC_USE_SPLIT_FIELDS
 * is also enabled.
 */
#define BINREC_OPT_G_PPC_TRIM_CR_STORES  (1<<16)

/**
 * BINREC_OPT_G_PPC_USE_SPLIT_FIELDS:  Treat subfields of certain registers
 * as separate values, rather than directly modifying the associated bits
 * in the register.
 *
 * Enabling this optimization causes individual bits of CR and the FPRF
 * field of FPSCR to be treated as separate "variables" in their own right;
 * the translated code will extract their values on entry and recombine
 * them into the full register on exit.  This allows data flow analysis to
 * find dead stores to specific fields, which otherwise would be obscured
 * by the dependency on the full register's previous state.
 *
 * If this optimization is enabled, pre- and post-instruction callbacks
 * and timebase handlers may see incorrect values of CR and FPSCR[FPRF]
 * in the processor state block.  System call and trap handlers are not
 * affected.
 */
#define BINREC_OPT_G_PPC_USE_SPLIT_FIELDS  (1<<17)

/*------------ Host-architecture-specific optimization flags ------------*/

/**
 * BINREC_OPT_H_X86_ADDRESS_OPERANDS:  Encode certain address calculations
 * directly in a load, store, or atomic instruction when feasible.  If the
 * address operand of such an instruction is not referenced by any other
 * instruction (after the one that sets it), then:
 *
 * - If the address is the sum of a register and a constant, and the sum of
 *   that constant and the offset encoded in the instruction is within the
 *   range of a 32-bit signed integer, eliminate the addition and use the
 *   combined offset as the access offset in the instruction.
 *
 * - If the address is the sum of two registers, eliminate the addition and
 *   encode the access using the base-plus-index format.
 */
#define BINREC_OPT_H_X86_ADDRESS_OPERANDS  (1<<0)

/**
 * BINREC_OPT_H_X86_BRANCH_ALIGNMENT:  Align branch targets to a multiple
 * of 16 bytes by inserting NOP instructions at appropriate points in the
 * generated code stream, when doing so seems likely to improve performance.
 */
#define BINREC_OPT_H_X86_BRANCH_ALIGNMENT  (1<<1)

/**
 * BINREC_OPT_H_X86_CONDITION_CODES:  Track the state of the condition
 * codes in the EFLAGS register, and avoid adding an explicit TEST or CMP
 * instruction for a register if the condition codes already reflect the
 * value of that register.
 */
#define BINREC_OPT_H_X86_CONDITION_CODES  (1<<2)

/**
 * BINREC_OPT_H_X86_FIXED_REGS:  When an instruction requires an operand to
 * be in a specific hardware register (shift counts must be in CL, for
 * example), try harder to allocate that hardware register for the operand.
 * This requires an extra pass over the translated machine code during
 * register allocation.
 */
#define BINREC_OPT_H_X86_FIXED_REGS  (1<<3)

/**
 * BINREC_OPT_H_X86_FORWARD_CONDITIONS:  When a register used as the
 * condition for a conditional branch or move is the result of a comparison
 * instruction and that register is not used elsewhere, eliminate the
 * register and forward the comparison condition to the branch or move
 * instruction.
 *
 * Floating-point comparisons will not be forwarded unless the
 * BINREC_OPT_DSE_FP common optimization flag is enabled.
 */
#define BINREC_OPT_H_X86_FORWARD_CONDITIONS  (1<<4)

/**
 * BINREC_OPT_H_X86_MERGE_REGS:  Try harder to avoid moving values
 * between registers.
 *
 * Enabling this optimization causes the register allocator to attempt to
 * keep values associated with guest architecture registers in the same
 * host register across basic blocks.  Typically this will result in
 * faster and more compact code, but with certain code patterns it may
 * hurt performance by causing more-frequently-accessed values to be
 * spilled to memory.
 */
#define BINREC_OPT_H_X86_MERGE_REGS  (1<<5)

/**
 * BINREC_OPT_H_X86_STORE_IMMEDIATE:  When an immediate value is used only
 * as the data for a store operation, encode the immediate value directly
 * in the instruction instead of using a register.
 *
 * This is a host-specific flag; enable it via the host_opt argument to
 * binrec_set_optimization_flags().
 */
#define BINREC_OPT_H_X86_STORE_IMMEDIATE  (1<<6)

/*************************************************************************/
/******** Interface: Library and runtime environment information *********/
/*************************************************************************/

/**
 * binrec_version:  Return the version number of the library as a string
 * (for example, "1.2.3").
 *
 * The string is returned through a pointer to const and must not be
 * modified by the caller.
 *
 * [Return value]
 *     Library version number.
 */
extern const char *binrec_version(void);

/**
 * binrec_native_arch:  Return a BINREC_ARCH_* constant representing the
 * architecture of the runtime environment, or 0 (BINREC_ARCH_INVALID) if
 * the runtime environment does not correspond to a supported host
 * architecture.  (If a nonzero value is returned, it will always be valid
 * as a host architecture for translation.)
 *
 * [Return value]
 *     Runtime environment architecture (BINREC_ARCH_*), or 0
 *     (BINREC_ARCH_INVALID) if unsupported.
 */
extern binrec_arch_t binrec_native_arch(void);

/**
 * binrec_native_features:  Return a bitmask of architecture features
 * (BINREC_FEATURE_*) supported by the runtime environment, or 0 if the
 * runtime environment does not correspond to a supported architecture.
 *
 * NOTE(review): a return value of 0 presumably can also occur on a
 * supported architecture which has none of the optional features, so
 * binrec_native_arch() is the more reliable test for whether the runtime
 * environment is supported -- confirm.
 *
 * [Return value]
 *     Runtime environment feature bitmask, or 0 if unsupported.
 */
extern unsigned int binrec_native_features(void);

/**
 * binrec_guest_supported:  Return whether the given architecture is
 * supported as a guest architecture for translation.
 *
 * [Parameters]
 *     arch: Architecture to check (BINREC_ARCH_*).
 * [Return value]
 *     True (nonzero) if the given architecture is supported as a guest
 *     architecture, false (zero) if not.
 */
extern int binrec_guest_supported(binrec_arch_t arch);

/**
 * binrec_host_supported:  Return whether the given architecture is
 * supported as a host architecture for translation.
 *
 * [Parameters]
 *     arch: Architecture to check (BINREC_ARCH_*).
 * [Return value]
 *     True (nonzero) if the given architecture is supported as a host
 *     architecture, false (zero) if not.
 */
extern int binrec_host_supported(binrec_arch_t arch);

/*************************************************************************/
/*************** Interface: Translation handle management ****************/
/*************************************************************************/

/**
 * binrec_create_handle:  Create a new translation handle.
 *
 * The returned handle should be passed to binrec_destroy_handle() when it
 * is no longer needed.
 *
 * [Parameters]
 *     setup: Pointer to a binrec_setup_t structure that defines the
 *         translation parameters to use.
 * [Return value]
 *     Newly created handle, or NULL on error.
 */
extern binrec_t *binrec_create_handle(const binrec_setup_t *setup);

/**
 * binrec_destroy_handle:  Destroy a translation handle.
 *
 * This function only destroys the translation handle itself; blocks of
 * translated code generated through the handle remain valid even after
 * the handle is destroyed.  The handle itself must not be used again
 * after this function returns.
 *
 * [Parameters]
 *     handle: Handle to destroy (may be NULL).
 */
extern void binrec_destroy_handle(binrec_t *handle);

/**
 * binrec_set_code_range:  Set the minimum and maximum addresses from which
 * to read source machine instructions.  Branch instructions which attempt
 * to jump outside this range will terminate the translation unit, and if
 * the source machine code runs off the end of the range, the unit will be
 * terminated at the final instruction completely contained within the range.
 * The range must not wrap around the end of the address space (that is,
 * "start" must not be greater than "end").
 *
 * By default, the entire address space is considered valid for reading
 * instructions.
 *
 * Note that binrec_translate() also allows specifying an address range for
 * translation.  This function is intended more for the purpose of avoiding
 * translation of memory areas known to be outside the bounds of the input
 * program code, such as data segments or undefined memory, similar to the
 * "execute" permission bit in hardware memory management units.  (It is
 * not currently possible to specify multiple disjoint code ranges.)
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     start: First address of code range.
 *     end: Last address of code range (inclusive).
 */
extern void binrec_set_code_range(binrec_t *handle, uint32_t start,
                                  uint32_t end);

/**
 * binrec_set_optimization_flags:  Set which optimizations should be
 * performed on translated blocks.  Enabling more optimizations will
 * generally improve the performance of translated code but increase the
 * overhead of translation; some optimizations can reduce performance for
 * certain code patterns.  See the documentation on each optimization flag
 * for details.
 *
 * The set of enabled optimizations may be changed at any time without
 * impacting the behavior of already-translated blocks.
 *
 * By default, no optimizations are enabled.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     common_opt: Bitmask of common optimizations to apply (BINREC_OPT_*).
 *     guest_opt: Bitmask of guest-specific optimizations to apply
 *         (BINREC_OPT_G_*).
 *     host_opt: Bitmask of host-specific optimizations to apply
 *         (BINREC_OPT_H_*).
 */
extern void binrec_set_optimization_flags(
    binrec_t *handle, unsigned int common_opt, unsigned int guest_opt,
    unsigned int host_opt);

/**
 * binrec_set_max_inline_length:  Set the maximum length (number of source
 * instructions, including the final return instruction) of subroutines to
 * inline.  The default is zero, meaning no subroutines will be inlined.
 *
 * If a nonzero length limit is set with this function, then when the
 * translator encounters a subroutine call instruction to a fixed address,
 * it will scan ahead up to this many instructions for a return
 * instruction.  If one is found, and if there are no branch instructions
 * that branch past the return, the subroutine will be inlined into the
 * current translation unit, saving the cost of jumping to a different
 * unit (which can be significant depending on how many guest registers
 * need to be spilled).
 *
 * If an inlined subroutine contains a further call instruction, that
 * subroutine will not be inlined regardless of its length.  (But see
 * binrec_set_max_inline_depth() to enable such recursive inlining.)
 *
 * Note that if a nonzero length limit is set, inlining may be performed
 * regardless of whether any optimization flags are set.
 *
 * Inlining is not currently implemented; calling this function has no
 * effect in this version of the library.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     length: Maximum inline length, in source instructions (must be at
 *         least 0; 0 disables inlining).
 */
extern void binrec_set_max_inline_length(binrec_t *handle, int length);

/**
 * binrec_set_max_inline_depth:  Set the maximum depth of subroutines to
 * inline.  The default is 1.
 *
 * If a depth limit greater than 1 is set with this function, then when a
 * call instruction is encountered during inlining, the translator will
 * perform the same inlining check on the called subroutine, up to the
 * specified depth.  For example, when translating at A in the following
 * pseudocode:
 *     A: call B
 *        ret
 *     B: call C
 *        ret
 *     C: call D
 *        ret
 *     D: call E
 *        ret
 *     E: nop
 *        ret
 * if the maximum inline depth is set to 2 (and assuming the maximum length
 * is set to at least 2), both B and C will be inlined, but D will not, and
 * the A routine will be translated as if it was written:
 *     A: call D
 *        ret
 *
 * Setting a value of zero disables inlining regardless of the maximum
 * inline length set with binrec_set_max_inline_length().
 *
 * Inlining is not currently implemented; calling this function has no
 * effect in this version of the library.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     depth: Maximum inline depth (must be at least 0; 0 disables
 *         inlining, and 1, the default, inlines only subroutines called
 *         directly from the code being translated).
 */
extern void binrec_set_max_inline_depth(binrec_t *handle, int depth);

/**
 * binrec_add_readonly_region:  Mark the given region of memory as
 * read-only.  Instructions which are known to load from read-only memory
 * will be translated into load-constant operations if enabled by the
 * BINREC_OPT_FOLD_CONSTANTS optimization flag.
 *
 * When determining whether a load operation addresses read-only memory,
 * only the first (lowest) address of the referenced value is checked.
 * Thus, a multi-byte load operation which crosses the end of a read-only
 * region will still be translated to a load-constant operation, and any
 * subsequent changes to the bytes outside the read-only region will not
 * be seen by the translated code.
 *
 * If the guest code performs a store operation into a region marked as
 * read-only, subsequent behavior of the program is undefined.
 *
 * This function may fail if too many misaligned regions are added; in
 * that case, rebuild the library with different values of the
 * READONLY_PAGE_BITS and MAX_PARTIAL_READONLY constants in src/common.h.
 *
 * The address range specified by binrec_set_code_range() is treated as
 * read-only with respect to the instruction stream, regardless of whether
 * it is explicitly marked read-only via this function.  However, any data
 * interspersed within the instruction stream will only be treated as
 * constant data if the address of that data has been explicitly marked
 * read-only.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     base: Base address (in guest memory) of read-only region.
 *     size: Size of read-only region, in bytes.
 * [Return value]
 *     True (nonzero) on success; false (zero) if the region could not be
 *     added because the partial-page table is full.
 */
extern int binrec_add_readonly_region(binrec_t *handle,
                                      uint32_t base, uint32_t size);

/**
 * binrec_clear_readonly_regions:  Clear all read-only memory regions
 * added with binrec_add_readonly_region().
 *
 * NOTE(review): presumably this has no effect on already-translated code
 * (in which loads from read-only memory may already have been converted
 * to constants) -- confirm.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 */
extern void binrec_clear_readonly_regions(binrec_t *handle);

/**
 * binrec_enable_chaining:  Set whether the translated code should include
 * logic to chain directly to other blocks of translated code on exit,
 * rather than returning to its caller.
 *
 * If chaining is enabled, then when the translated code would normally
 * return to its caller with a known next instruction address, it will
 * instead look up (via a function pointer stored in the PSB at the offset
 * given by the state_offset_chain_lookup field in binrec_setup_t) the
 * address of translated code corresponding to that guest address.  If the
 * function returns a non-NULL pointer, the translated code will rewrite
 * itself to jump directly to that pointer on subsequent calls.  (The code
 * will then return to its caller whether it successfully resolved the
 * chain or not.)  This naturally requires that the code be stored in a
 * host memory region which allows writing.
 *
 * NOTE(review): the signature expected of the lookup function is not
 * described here; presumably it is documented with the
 * state_offset_chain_lookup field in binrec_setup_t -- confirm.
 *
 * Chain resolution (rewriting of the code to jump to the resolved target
 * address) works correctly in a multithreaded environment.  If two threads
 * try to resolve the same chain at the same time and get two different
 * target addresses, one of the addresses will be used for the target, but
 * it is undefined which one.
 *
 * The code produced by libbinrec when chaining is enabled is still
 * position-independent, but once a chain has been resolved, the code will
 * be dependent on the target address of the chain.
 *
 * Calling this function has no effect on already-translated code.
 *
 * By default, chaining is disabled.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     enable: True (nonzero) to enable chaining, false (zero) to disable.
 */
extern void binrec_enable_chaining(binrec_t *handle, int enable);

/**
 * binrec_enable_branch_exit_test:  Set whether to check a 32-bit value in
 * the processor state block (pointed to by state_offset_branch_exit_flag)
 * immediately before a branch to another instruction within the same
 * translation unit.  If enabled and the value is nonzero when such a
 * branch is about to be taken, the translated code will return to its
 * caller rather than continuing execution at the branch target.  This can
 * be used to safely interrupt execution of the guest code ("safely" in the
 * sense of the PSB being fully updated) at a finer granularity than an
 * entire translation unit.  The flag is not tested for conditional
 * branches which are not taken or for branches which would return from
 * the translated code in any case (such as indirect branches).
 *
 * Note that enabling the branch exit test can significantly reduce the
 * performance of translated code; the requirement for the code to be able
 * to exit at any branch point limits the amount of optimization that can
 * be performed across branches.
 *
 * Calling this function has no effect on already-translated code.
 *
 * By default, the branch exit test is disabled.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     enable: True (nonzero) to enable the branch exit test, false (zero)
 *         to disable it.
 */
extern void binrec_enable_branch_exit_test(binrec_t *handle, int enable);

/**
 * binrec_set_pre_insn_callback:  Set a callback function which will be
 * called immediately before executing each guest instruction.  This can
 * be used, for example, to log an execution trace or to record the state
 * of the guest processor at a particular point.  Pass NULL to disable an
 * existing callback.
 *
 * The callback receives two parameters: the processor state block pointer
 * passed to the translated code and the address of the instruction about
 * to be executed.
 *
 * Calling this function has no effect on already-translated code.
 *
 * The pre- and post-instruction callbacks are generally useful only for
 * debugging or analysis of code at runtime, so they are set directly as
 * pointers in the runtime environment under the assumption that the host
 * architecture is that of the runtime environment.  These callbacks should
 * not be enabled when cross-compiling to a different architecture.  The
 * callbacks also should not attempt to modify the processor state block;
 * doing so results in undefined behavior after the callback returns.
 *
 * Enabling the pre- or post-instruction callback can have a significant
 * negative impact on performance.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     callback: Callback to install, or NULL to clear any installed callback.
 *         The first argument is the processor state block pointer and the
 *         second is the guest address of the instruction.
 */
extern void binrec_set_pre_insn_callback(binrec_t *handle,
                                         void (*callback)(void *, uint32_t));

/**
 * binrec_set_post_insn_callback:  Set a callback function which will be
 * called immediately after executing each guest instruction.  The state
 * of the guest processor will be the same as if translation had ended
 * immediately after the just-executed instruction.  Pass NULL to disable
 * an existing callback.
 *
 * The callback receives two parameters: the processor state block pointer
 * passed to the translated code and the address of the instruction that
 * was just executed.
 *
 * Calling this function has no effect on already-translated code.
 *
 * See binrec_set_pre_insn_callback() for caveats (cross-compilation,
 * modification of the processor state block, and performance impact)
 * which apply equally to this callback.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     callback: Callback to install, or NULL to clear any installed callback.
 *         The first argument is the processor state block pointer and the
 *         second is the guest address of the instruction.
 */
extern void binrec_set_post_insn_callback(binrec_t *handle,
                                          void (*callback)(void *, uint32_t));

/**
 * binrec_enable_verify:  Enable or disable verification checks on
 * translated blocks.  This has no effect on the generated code, and is
 * intended for catching bugs in libbinrec.
 *
 * [Parameters]
 *     handle: Handle to operate on.
 *     enable: True (nonzero) to enable verification, false (zero) to
 *         disable.
 */
extern void binrec_enable_verify(binrec_t *handle, int enable);

/*************************************************************************/
/********************** Interface: Code translation **********************/
/*************************************************************************/

/**
 * binrec_translate:  Translate a block of guest machine code into native
 * machine code.
 *
 * The "address" and "limit" parameters specify the inclusive address
 * bounds from which instructions will be read for this call.  Translation
 * will stop when the translator reaches a source instruction which is not
 * entirely contained in the inclusive range [address,limit], or when all
 * code paths starting from "address" have been translated (such as when
 * the end of a function in the input program is reached).  A value of -1
 * for "limit" (equivalent to UINT32_MAX, since the parameter is unsigned)
 * allows translation to continue until such a natural endpoint is found.
 *
 * The "state" parameter can be used to provide a processor state block
 * (of the same format as that passed to the generated code) for the
 * translation routines to reference.  This is used by certain
 * optimizations (currently only the PowerPC CONSTANT_GQRS optimization)
 * to generate more efficient code under the assumption that certain
 * elements of the processor state will remain constant for every call
 * to the translated code.  If NULL is passed, such optimizations will be
 * implicitly disabled for the current binrec_translate() call.
 *
 * On success, the returned block can be executed by calling it as a
 * function with the following signature:
 *     void *code(void *state, void *memory);
 * where the "state" parameter is a pointer to a processor state block
 * whose structure conforms to the structure offsets specified in the
 * setup data passed to binrec_create_handle(), and "memory" is a pointer
 * to the base of the guest memory region.  The return value of the code
 * is the processor state block pointer; this may differ from the pointer
 * passed to the block if, for example, a virtual system call caused the
 * code's thread to migrate to a different virtual processor.  (libbinrec
 * will never change the PSB pointer on its own, so library clients which
 * do not use multiple PSBs can safely ignore the return value.)
 *
 * The returned code pointer will have been allocated with the code_malloc
 * or code_realloc function passed in the setup structure to
 * binrec_create_handle(), or the relevant fallback function if code_*
 * functions were not supplied.
 *
 * Return-value arguments are only modified on a successful return.
 *
 * [Parameters]
 *     handle: Handle to use for translation.
 *     state: Guest processor state to reference for translation, or NULL
 *         if none.
 *     address: Address (in guest memory) of first instruction to translate.
 *     limit: Last address (in guest memory, inclusive) from which
 *         instructions may be read, or -1 for no limit.
 *     code_ret: Pointer to variable to receive a pointer to the
 *         translated machine code.
 *     size_ret: Pointer to variable to receive the length of the
 *         translated machine code, in bytes.
 * [Return value]
 *     True (nonzero) on success, false (zero) on error.
 */
extern int binrec_translate(binrec_t *handle, void *state,
                            uint32_t address, uint32_t limit,
                            void **code_ret, long *size_ret);

/*************************************************************************/
/*************************************************************************/

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // BINREC_H