michael@0: #ifndef LIBDISASM_H michael@0: #define LIBDISASM_H michael@0: michael@0: #ifdef WIN32 michael@0: #include michael@0: #endif michael@0: michael@0: #include michael@0: michael@0: /* 'NEW" types michael@0: * __________________________________________________________________________*/ michael@0: #ifndef LIBDISASM_QWORD_H /* do not interfere with qword.h */ michael@0: #define LIBDISASM_QWORD_H michael@0: #ifdef _MSC_VER michael@0: typedef __int64 qword_t; michael@0: #else michael@0: typedef int64_t qword_t; michael@0: #endif michael@0: #endif michael@0: michael@0: #include michael@0: michael@0: #ifdef __cplusplus michael@0: extern "C" { michael@0: #endif michael@0: michael@0: /* 'NEW" x86 API michael@0: * __________________________________________________________________________*/ michael@0: michael@0: michael@0: /* ========================================= Error Reporting */ michael@0: /* REPORT CODES michael@0: * These are passed to a reporter function passed at initialization. michael@0: * Each code determines the type of the argument passed to the reporter; michael@0: * this allows the report to recover from errors, or just log them. michael@0: */ michael@0: enum x86_report_codes { michael@0: report_disasm_bounds, /* RVA OUT OF BOUNDS : The disassembler could michael@0: not disassemble the supplied RVA as it is michael@0: out of the range of the buffer. The michael@0: application should store the address and michael@0: attempt to determine what section of the michael@0: binary it is in, then disassemble the michael@0: address from the bytes in that section. michael@0: data: uint32_t rva */ michael@0: report_insn_bounds, /* INSTRUCTION OUT OF BOUNDS: The disassembler michael@0: could not disassemble the instruction as michael@0: the instruction would require bytes beyond michael@0: the end of the current buffer. This usually michael@0: indicated garbage bytes at the end of a michael@0: buffer, or an incorrectly-sized buffer. 
michael@0: data: uint32_t rva */ michael@0: report_invalid_insn, /* INVALID INSTRUCTION: The disassembler could michael@0: not disassemble the instruction as it has an michael@0: invalid combination of opcodes and operands. michael@0: This will stop automated disassembly; the michael@0: application can restart the disassembly michael@0: after the invalid instruction. michael@0: data: uint32_t rva */ michael@0: report_unknown michael@0: }; michael@0: michael@0: /* 'arg' is optional arbitrary data provided by the code passing the michael@0: * callback -- for example, it could be 'this' or 'self' in OOP code. michael@0: * 'code' is provided by libdisasm, it is one of the above michael@0: * 'data' is provided by libdisasm and is context-specific, per the enums */ michael@0: typedef void (*DISASM_REPORTER)( enum x86_report_codes code, michael@0: void *data, void *arg ); michael@0: michael@0: michael@0: /* x86_report_error : Call the register reporter to report an error */ michael@0: void x86_report_error( enum x86_report_codes code, void *data ); michael@0: michael@0: /* ========================================= Libdisasm Management Routines */ michael@0: enum x86_options { /* these can be ORed together */ michael@0: opt_none= 0, michael@0: opt_ignore_nulls=1, /* ignore sequences of > 4 NULL bytes */ michael@0: opt_16_bit=2, /* 16-bit/DOS disassembly */ michael@0: opt_att_mnemonics=4, /* use AT&T syntax names for alternate opcode mnemonics */ michael@0: }; michael@0: michael@0: /* management routines */ michael@0: /* 'arg' is caller-specific data which is passed as the first argument michael@0: * to the reporter callback routine */ michael@0: int x86_init( enum x86_options options, DISASM_REPORTER reporter, void *arg); michael@0: void x86_set_reporter( DISASM_REPORTER reporter, void *arg); michael@0: void x86_set_options( enum x86_options options ); michael@0: enum x86_options x86_get_options( void ); michael@0: int x86_cleanup(void); michael@0: michael@0: michael@0: /* 
========================================= Instruction Representation */ michael@0: /* these defines are only intended for use in the array decl's */ michael@0: #define MAX_REGNAME 8 michael@0: michael@0: #define MAX_PREFIX_STR 32 michael@0: #define MAX_MNEM_STR 16 michael@0: #define MAX_INSN_SIZE 20 /* same as in i386.h */ michael@0: #define MAX_OP_STRING 32 /* max possible operand size in string form */ michael@0: #define MAX_OP_RAW_STRING 64 /* max possible operand size in raw form */ michael@0: #define MAX_OP_XML_STRING 256 /* max possible operand size in xml form */ michael@0: #define MAX_NUM_OPERANDS 8 /* max # implicit and explicit operands */ michael@0: /* in these, the '2 *' is arbitrary: the max # of operands should require michael@0: * more space than the rest of the insn */ michael@0: #define MAX_INSN_STRING 512 /* 2 * 8 * MAX_OP_STRING */ michael@0: #define MAX_INSN_RAW_STRING 1024 /* 2 * 8 * MAX_OP_RAW_STRING */ michael@0: #define MAX_INSN_XML_STRING 4096 /* 2 * 8 * MAX_OP_XML_STRING */ michael@0: michael@0: enum x86_reg_type { /* NOTE: these may be ORed together */ michael@0: reg_gen = 0x00001, /* general purpose */ michael@0: reg_in = 0x00002, /* incoming args, ala RISC */ michael@0: reg_out = 0x00004, /* args to calls, ala RISC */ michael@0: reg_local = 0x00008, /* local vars, ala RISC */ michael@0: reg_fpu = 0x00010, /* FPU data register */ michael@0: reg_seg = 0x00020, /* segment register */ michael@0: reg_simd = 0x00040, /* SIMD/MMX reg */ michael@0: reg_sys = 0x00080, /* restricted/system register */ michael@0: reg_sp = 0x00100, /* stack pointer */ michael@0: reg_fp = 0x00200, /* frame pointer */ michael@0: reg_pc = 0x00400, /* program counter */ michael@0: reg_retaddr = 0x00800, /* return addr for func */ michael@0: reg_cond = 0x01000, /* condition code / flags */ michael@0: reg_zero = 0x02000, /* zero register, ala RISC */ michael@0: reg_ret = 0x04000, /* return value */ michael@0: reg_src = 0x10000, /* array/rep source */ michael@0: reg_dest 
= 0x20000, /* array/rep destination */ michael@0: reg_count = 0x40000 /* array/rep/loop counter */ michael@0: }; michael@0: michael@0: /* x86_reg_t : an X86 CPU register */ michael@0: typedef struct { michael@0: char name[MAX_REGNAME]; michael@0: enum x86_reg_type type; /* what register is used for */ michael@0: unsigned int size; /* size of register in bytes */ michael@0: unsigned int id; /* register ID #, for quick compares */ michael@0: unsigned int alias; /* ID of reg this is an alias for */ michael@0: unsigned int shift; /* amount to shift aliased reg by */ michael@0: } x86_reg_t; michael@0: michael@0: /* x86_ea_t : an X86 effective address (address expression) */ michael@0: typedef struct { michael@0: unsigned int scale; /* scale factor */ michael@0: x86_reg_t index, base; /* index, base registers */ michael@0: int32_t disp; /* displacement */ michael@0: char disp_sign; /* is negative? 1/0 */ michael@0: char disp_size; /* 0, 1, 2, 4 */ michael@0: } x86_ea_t; michael@0: michael@0: /* x86_absolute_t : an X86 segment:offset address (descriptor) */ michael@0: typedef struct { michael@0: unsigned short segment; /* loaded directly into CS */ michael@0: union { michael@0: unsigned short off16; /* loaded directly into IP */ michael@0: uint32_t off32; /* loaded directly into EIP */ michael@0: } offset; michael@0: } x86_absolute_t; michael@0: michael@0: enum x86_op_type { /* mutually exclusive */ michael@0: op_unused = 0, /* empty/unused operand: should never occur */ michael@0: op_register = 1, /* CPU register */ michael@0: op_immediate = 2, /* Immediate Value */ michael@0: op_relative_near = 3, /* Relative offset from IP */ michael@0: op_relative_far = 4, /* Relative offset from IP */ michael@0: op_absolute = 5, /* Absolute address (ptr16:32) */ michael@0: op_expression = 6, /* Address expression (scale/index/base/disp) */ michael@0: op_offset = 7, /* Offset from start of segment (m32) */ michael@0: op_unknown michael@0: }; michael@0: michael@0: #define 
x86_optype_is_address( optype ) \ michael@0: ( optype == op_absolute || optype == op_offset ) michael@0: #define x86_optype_is_relative( optype ) \ michael@0: ( optype == op_relative_near || optype == op_relative_far ) michael@0: #define x86_optype_is_memory( optype ) \ michael@0: ( optype > op_immediate && optype < op_unknown ) michael@0: michael@0: enum x86_op_datatype { /* these use Intel's lame terminology */ michael@0: op_byte = 1, /* 1 byte integer */ michael@0: op_word = 2, /* 2 byte integer */ michael@0: op_dword = 3, /* 4 byte integer */ michael@0: op_qword = 4, /* 8 byte integer */ michael@0: op_dqword = 5, /* 16 byte integer */ michael@0: op_sreal = 6, /* 4 byte real (single real) */ michael@0: op_dreal = 7, /* 8 byte real (double real) */ michael@0: op_extreal = 8, /* 10 byte real (extended real) */ michael@0: op_bcd = 9, /* 10 byte binary-coded decimal */ michael@0: op_ssimd = 10, /* 16 byte : 4 packed single FP (SIMD, MMX) */ michael@0: op_dsimd = 11, /* 16 byte : 2 packed double FP (SIMD, MMX) */ michael@0: op_sssimd = 12, /* 4 byte : scalar single FP (SIMD, MMX) */ michael@0: op_sdsimd = 13, /* 8 byte : scalar double FP (SIMD, MMX) */ michael@0: op_descr32 = 14, /* 6 byte Intel descriptor 2:4 */ michael@0: op_descr16 = 15, /* 4 byte Intel descriptor 2:2 */ michael@0: op_pdescr32 = 16, /* 6 byte Intel pseudo-descriptor 32:16 */ michael@0: op_pdescr16 = 17, /* 6 byte Intel pseudo-descriptor 8:24:16 */ michael@0: op_bounds16 = 18, /* signed 16:16 lower:upper bounds */ michael@0: op_bounds32 = 19, /* signed 32:32 lower:upper bounds */ michael@0: op_fpuenv16 = 20, /* 14 byte FPU control/environment data */ michael@0: op_fpuenv32 = 21, /* 28 byte FPU control/environment data */ michael@0: op_fpustate16 = 22, /* 94 byte FPU state (env & reg stack) */ michael@0: op_fpustate32 = 23, /* 108 byte FPU state (env & reg stack) */ michael@0: op_fpregset = 24, /* 512 bytes: register set */ michael@0: op_fpreg = 25, /* FPU register */ michael@0: op_none = 0xFF, /* 
operand without a datatype (INVLPG) */ michael@0: }; michael@0: michael@0: enum x86_op_access { /* ORed together */ michael@0: op_read = 1, michael@0: op_write = 2, michael@0: op_execute = 4 michael@0: }; michael@0: michael@0: enum x86_op_flags { /* ORed together, but segs are mutually exclusive */ michael@0: op_signed = 1, /* signed integer */ michael@0: op_string = 2, /* possible string or array */ michael@0: op_constant = 4, /* symbolic constant */ michael@0: op_pointer = 8, /* operand points to a memory address */ michael@0: op_sysref = 0x010, /* operand is a syscall number */ michael@0: op_implied = 0x020, /* operand is implicit in the insn */ michael@0: op_hardcode = 0x40, /* operand is hardcoded in insn definition */ michael@0: /* NOTE: an 'implied' operand is one which can be considered a side michael@0: * effect of the insn, e.g. %esp being modified by PUSH or POP. A michael@0: * 'hard-coded' operand is one which is specified in the instruction michael@0: * definition, e.g. %es:%edi in MOVSB or 1 in ROL Eb, 1. The difference michael@0: * is that hard-coded operands are printed by disassemblers and are michael@0: * required to re-assemble, while implicit operands are invisible. */ michael@0: op_es_seg = 0x100, /* ES segment override */ michael@0: op_cs_seg = 0x200, /* CS segment override */ michael@0: op_ss_seg = 0x300, /* SS segment override */ michael@0: op_ds_seg = 0x400, /* DS segment override */ michael@0: op_fs_seg = 0x500, /* FS segment override */ michael@0: op_gs_seg = 0x600 /* GS segment override */ michael@0: }; michael@0: michael@0: /* x86_op_t : an X86 instruction operand */ michael@0: typedef struct { michael@0: enum x86_op_type type; /* operand type */ michael@0: enum x86_op_datatype datatype; /* operand size */ michael@0: enum x86_op_access access; /* operand access [RWX] */ michael@0: enum x86_op_flags flags; /* misc flags */ michael@0: union { michael@0: /* sizeof will have to work on these union members! 
*/ michael@0: /* immediate values */ michael@0: char sbyte; michael@0: short sword; michael@0: int32_t sdword; michael@0: qword_t sqword; michael@0: unsigned char byte; michael@0: unsigned short word; michael@0: uint32_t dword; michael@0: qword_t qword; michael@0: float sreal; michael@0: double dreal; michael@0: /* misc large/non-native types */ michael@0: unsigned char extreal[10]; michael@0: unsigned char bcd[10]; michael@0: qword_t dqword[2]; michael@0: unsigned char simd[16]; michael@0: unsigned char fpuenv[28]; michael@0: /* offset from segment */ michael@0: uint32_t offset; michael@0: /* ID of CPU register */ michael@0: x86_reg_t reg; michael@0: /* offsets from current insn */ michael@0: char relative_near; michael@0: int32_t relative_far; michael@0: /* segment:offset */ michael@0: x86_absolute_t absolute; michael@0: /* effective address [expression] */ michael@0: x86_ea_t expression; michael@0: } data; michael@0: /* this is needed to make formatting operands more sane */ michael@0: void * insn; /* pointer to x86_insn_t owning operand */ michael@0: } x86_op_t; michael@0: michael@0: /* Linked list of x86_op_t; provided for manual traversal of the operand michael@0: * list in an insn. Users wishing to add operands to this list, e.g. 
to add michael@0: * implicit operands, should use x86_operand_new in x86_operand_list.h */ michael@0: typedef struct x86_operand_list { michael@0: x86_op_t op; michael@0: struct x86_operand_list *next; michael@0: } x86_oplist_t; michael@0: michael@0: enum x86_insn_group { michael@0: insn_none = 0, /* invalid instruction */ michael@0: insn_controlflow = 1, michael@0: insn_arithmetic = 2, michael@0: insn_logic = 3, michael@0: insn_stack = 4, michael@0: insn_comparison = 5, michael@0: insn_move = 6, michael@0: insn_string = 7, michael@0: insn_bit_manip = 8, michael@0: insn_flag_manip = 9, michael@0: insn_fpu = 10, michael@0: insn_interrupt = 13, michael@0: insn_system = 14, michael@0: insn_other = 15 michael@0: }; michael@0: michael@0: enum x86_insn_type { michael@0: insn_invalid = 0, /* invalid instruction */ michael@0: /* insn_controlflow */ michael@0: insn_jmp = 0x1001, michael@0: insn_jcc = 0x1002, michael@0: insn_call = 0x1003, michael@0: insn_callcc = 0x1004, michael@0: insn_return = 0x1005, michael@0: /* insn_arithmetic */ michael@0: insn_add = 0x2001, michael@0: insn_sub = 0x2002, michael@0: insn_mul = 0x2003, michael@0: insn_div = 0x2004, michael@0: insn_inc = 0x2005, michael@0: insn_dec = 0x2006, michael@0: insn_shl = 0x2007, michael@0: insn_shr = 0x2008, michael@0: insn_rol = 0x2009, michael@0: insn_ror = 0x200A, michael@0: /* insn_logic */ michael@0: insn_and = 0x3001, michael@0: insn_or = 0x3002, michael@0: insn_xor = 0x3003, michael@0: insn_not = 0x3004, michael@0: insn_neg = 0x3005, michael@0: /* insn_stack */ michael@0: insn_push = 0x4001, michael@0: insn_pop = 0x4002, michael@0: insn_pushregs = 0x4003, michael@0: insn_popregs = 0x4004, michael@0: insn_pushflags = 0x4005, michael@0: insn_popflags = 0x4006, michael@0: insn_enter = 0x4007, michael@0: insn_leave = 0x4008, michael@0: /* insn_comparison */ michael@0: insn_test = 0x5001, michael@0: insn_cmp = 0x5002, michael@0: /* insn_move */ michael@0: insn_mov = 0x6001, /* move */ michael@0: insn_movcc = 
0x6002, /* conditional move */ michael@0: insn_xchg = 0x6003, /* exchange */ michael@0: insn_xchgcc = 0x6004, /* conditional exchange */ michael@0: /* insn_string */ michael@0: insn_strcmp = 0x7001, michael@0: insn_strload = 0x7002, michael@0: insn_strmov = 0x7003, michael@0: insn_strstore = 0x7004, michael@0: insn_translate = 0x7005, /* xlat */ michael@0: /* insn_bit_manip */ michael@0: insn_bittest = 0x8001, michael@0: insn_bitset = 0x8002, michael@0: insn_bitclear = 0x8003, michael@0: /* insn_flag_manip */ michael@0: insn_clear_carry = 0x9001, michael@0: insn_clear_zero = 0x9002, michael@0: insn_clear_oflow = 0x9003, michael@0: insn_clear_dir = 0x9004, michael@0: insn_clear_sign = 0x9005, michael@0: insn_clear_parity = 0x9006, michael@0: insn_set_carry = 0x9007, michael@0: insn_set_zero = 0x9008, michael@0: insn_set_oflow = 0x9009, michael@0: insn_set_dir = 0x900A, michael@0: insn_set_sign = 0x900B, michael@0: insn_set_parity = 0x900C, michael@0: insn_tog_carry = 0x9010, michael@0: insn_tog_zero = 0x9020, michael@0: insn_tog_oflow = 0x9030, michael@0: insn_tog_dir = 0x9040, michael@0: insn_tog_sign = 0x9050, michael@0: insn_tog_parity = 0x9060, michael@0: /* insn_fpu */ michael@0: insn_fmov = 0xA001, michael@0: insn_fmovcc = 0xA002, michael@0: insn_fneg = 0xA003, michael@0: insn_fabs = 0xA004, michael@0: insn_fadd = 0xA005, michael@0: insn_fsub = 0xA006, michael@0: insn_fmul = 0xA007, michael@0: insn_fdiv = 0xA008, michael@0: insn_fsqrt = 0xA009, michael@0: insn_fcmp = 0xA00A, michael@0: insn_fcos = 0xA00C, michael@0: insn_fldpi = 0xA00D, michael@0: insn_fldz = 0xA00E, michael@0: insn_ftan = 0xA00F, michael@0: insn_fsine = 0xA010, michael@0: insn_fsys = 0xA020, michael@0: /* insn_interrupt */ michael@0: insn_int = 0xD001, michael@0: insn_intcc = 0xD002, /* not present in x86 ISA */ michael@0: insn_iret = 0xD003, michael@0: insn_bound = 0xD004, michael@0: insn_debug = 0xD005, michael@0: insn_trace = 0xD006, michael@0: insn_invalid_op = 0xD007, michael@0: 
insn_oflow = 0xD008, michael@0: /* insn_system */ michael@0: insn_halt = 0xE001, michael@0: insn_in = 0xE002, /* input from port/bus */ michael@0: insn_out = 0xE003, /* output to port/bus */ michael@0: insn_cpuid = 0xE004, michael@0: /* insn_other */ michael@0: insn_nop = 0xF001, michael@0: insn_bcdconv = 0xF002, /* convert to or from BCD */ michael@0: insn_szconv = 0xF003 /* change size of operand */ michael@0: }; michael@0: michael@0: /* These flags specify special characteristics of the instruction, such as michael@0: * whether the inatruction is privileged or whether it serializes the michael@0: * pipeline. michael@0: * NOTE : These may not be accurate for all instructions; updates to the michael@0: * opcode tables have not been completed. */ michael@0: enum x86_insn_note { michael@0: insn_note_ring0 = 1, /* Only available in ring 0 */ michael@0: insn_note_smm = 2, /* "" in System Management Mode */ michael@0: insn_note_serial = 4, /* Serializing instruction */ michael@0: insn_note_nonswap = 8, /* Does not swap arguments in att-style formatting */ michael@0: insn_note_nosuffix = 16, /* Does not have size suffix in att-style formatting */ michael@0: }; michael@0: michael@0: /* This specifies what effects the instruction has on the %eflags register */ michael@0: enum x86_flag_status { michael@0: insn_carry_set = 0x1, /* CF */ michael@0: insn_zero_set = 0x2, /* ZF */ michael@0: insn_oflow_set = 0x4, /* OF */ michael@0: insn_dir_set = 0x8, /* DF */ michael@0: insn_sign_set = 0x10, /* SF */ michael@0: insn_parity_set = 0x20, /* PF */ michael@0: insn_carry_or_zero_set = 0x40, michael@0: insn_zero_set_or_sign_ne_oflow = 0x80, michael@0: insn_carry_clear = 0x100, michael@0: insn_zero_clear = 0x200, michael@0: insn_oflow_clear = 0x400, michael@0: insn_dir_clear = 0x800, michael@0: insn_sign_clear = 0x1000, michael@0: insn_parity_clear = 0x2000, michael@0: insn_sign_eq_oflow = 0x4000, michael@0: insn_sign_ne_oflow = 0x8000 michael@0: }; michael@0: michael@0: /* The CPU 
model in which the insturction first appeared; this can be used michael@0: * to mask out instructions appearing in earlier or later models or to michael@0: * check the portability of a binary. michael@0: * NOTE : These may not be accurate for all instructions; updates to the michael@0: * opcode tables have not been completed. */ michael@0: enum x86_insn_cpu { michael@0: cpu_8086 = 1, /* Intel */ michael@0: cpu_80286 = 2, michael@0: cpu_80386 = 3, michael@0: cpu_80387 = 4, michael@0: cpu_80486 = 5, michael@0: cpu_pentium = 6, michael@0: cpu_pentiumpro = 7, michael@0: cpu_pentium2 = 8, michael@0: cpu_pentium3 = 9, michael@0: cpu_pentium4 = 10, michael@0: cpu_k6 = 16, /* AMD */ michael@0: cpu_k7 = 32, michael@0: cpu_athlon = 48 michael@0: }; michael@0: michael@0: /* CPU ISA subsets: These are derived from the Instruction Groups in michael@0: * Intel Vol 1 Chapter 5; they represent subsets of the IA32 ISA but michael@0: * do not reflect the 'type' of the instruction in the same way that michael@0: * x86_insn_group does. In short, these are AMD/Intel's somewhat useless michael@0: * designations. michael@0: * NOTE : These may not be accurate for all instructions; updates to the michael@0: * opcode tables have not been completed. */ michael@0: enum x86_insn_isa { michael@0: isa_gp = 1, /* general purpose */ michael@0: isa_fp = 2, /* floating point */ michael@0: isa_fpumgt = 3, /* FPU/SIMD management */ michael@0: isa_mmx = 4, /* Intel MMX */ michael@0: isa_sse1 = 5, /* Intel SSE SIMD */ michael@0: isa_sse2 = 6, /* Intel SSE2 SIMD */ michael@0: isa_sse3 = 7, /* Intel SSE3 SIMD */ michael@0: isa_3dnow = 8, /* AMD 3DNow! 
SIMD */ michael@0: isa_sys = 9 /* system instructions */ michael@0: }; michael@0: michael@0: enum x86_insn_prefix { michael@0: insn_no_prefix = 0, michael@0: insn_rep_zero = 1, /* REPZ and REPE */ michael@0: insn_rep_notzero = 2, /* REPNZ and REPNZ */ michael@0: insn_lock = 4 /* LOCK: */ michael@0: }; michael@0: michael@0: /* TODO: maybe provide insn_new/free(), and have disasm return new insn_t */ michael@0: /* x86_insn_t : an X86 instruction */ michael@0: typedef struct { michael@0: /* information about the instruction */ michael@0: uint32_t addr; /* load address */ michael@0: uint32_t offset; /* offset into file/buffer */ michael@0: enum x86_insn_group group; /* meta-type, e.g. INS_EXEC */ michael@0: enum x86_insn_type type; /* type, e.g. INS_BRANCH */ michael@0: enum x86_insn_note note; /* note, e.g. RING0 */ michael@0: unsigned char bytes[MAX_INSN_SIZE]; michael@0: unsigned char size; /* size of insn in bytes */ michael@0: /* 16/32-bit mode settings */ michael@0: unsigned char addr_size; /* default address size : 2 or 4 */ michael@0: unsigned char op_size; /* default operand size : 2 or 4 */ michael@0: /* CPU/instruction set */ michael@0: enum x86_insn_cpu cpu; michael@0: enum x86_insn_isa isa; michael@0: /* flags */ michael@0: enum x86_flag_status flags_set; /* flags set or tested by insn */ michael@0: enum x86_flag_status flags_tested; michael@0: /* stack */ michael@0: unsigned char stack_mod; /* 0 or 1 : is the stack modified? 
*/ michael@0: int32_t stack_mod_val; /* val stack is modified by if known */ michael@0: michael@0: /* the instruction proper */ michael@0: enum x86_insn_prefix prefix; /* prefixes ORed together */ michael@0: char prefix_string[MAX_PREFIX_STR]; /* prefixes [might be truncated] */ michael@0: char mnemonic[MAX_MNEM_STR]; michael@0: x86_oplist_t *operands; /* list of explicit/implicit operands */ michael@0: size_t operand_count; /* total number of operands */ michael@0: size_t explicit_count; /* number of explicit operands */ michael@0: /* convenience fields for user */ michael@0: void *block; /* code block containing this insn */ michael@0: void *function; /* function containing this insn */ michael@0: int tag; /* tag the insn as seen/processed */ michael@0: } x86_insn_t; michael@0: michael@0: michael@0: /* returns 0 if an instruction is invalid, 1 if valid */ michael@0: int x86_insn_is_valid( x86_insn_t *insn ); michael@0: michael@0: /* DISASSEMBLY ROUTINES michael@0: * Canonical order of arguments is michael@0: * (buf, buf_len, buf_rva, offset, len, insn, func, arg, resolve_func) michael@0: * ...but of course all of these are not used at the same time. michael@0: */ michael@0: michael@0: michael@0: /* Function prototype for caller-supplied callback routine michael@0: * These callbacks are intended to process 'insn' further, e.g. by michael@0: * adding it to a linked list, database, etc */ michael@0: typedef void (*DISASM_CALLBACK)( x86_insn_t *insn, void * arg ); michael@0: michael@0: /* Function prototype for caller-supplied address resolver. michael@0: * This routine is used to determine the rva to disassemble next, given michael@0: * the 'dest' operand of a jump/call. This allows the caller to resolve michael@0: * jump/call targets stored in a register or on the stack, and also allows michael@0: * the caller to prevent endless loops by checking if an address has michael@0: * already been disassembled. 
If an address cannot be resolved from the michael@0: * operand, or if the address has already been disassembled, this routine michael@0: * should return -1; in all other cases the RVA to be disassembled next michael@0: * should be returned. */ michael@0: typedef int32_t (*DISASM_RESOLVER)( x86_op_t *op, x86_insn_t * current_insn, michael@0: void *arg ); michael@0: michael@0: michael@0: /* x86_disasm: Disassemble a single instruction from a buffer of bytes. michael@0: * Returns size of instruction in bytes. michael@0: * Caller is responsible for calling x86_oplist_free() on michael@0: * a reused "insn" to avoid leaking memory when calling this michael@0: * function repeatedly. michael@0: * buf : Buffer of bytes to disassemble michael@0: * buf_len : Length of the buffer michael@0: * buf_rva : Load address of the start of the buffer michael@0: * offset : Offset in buffer to disassemble michael@0: * insn : Structure to fill with disassembled instruction michael@0: */ michael@0: unsigned int x86_disasm( unsigned char *buf, unsigned int buf_len, michael@0: uint32_t buf_rva, unsigned int offset, michael@0: x86_insn_t * insn ); michael@0: michael@0: /* x86_disasm_range: Sequential disassembly of a range of bytes in a buffer, michael@0: * invoking a callback function each time an instruction michael@0: * is successfully disassembled. The 'range' refers to the michael@0: * bytes between 'offset' and 'offset + len' in the buffer; michael@0: * 'len' is assumed to be less than the length of the buffer. michael@0: * Returns number of instructions processed. michael@0: * buf : Buffer of bytes to disassemble (e.g. .text section) michael@0: * buf_rva : Load address of buffer (e.g. 
ELF Virtual Address) michael@0: * offset : Offset in buffer to start disassembly at michael@0: * len : Number of bytes to disassemble michael@0: * func : Callback function to invoke (may be NULL) michael@0: * arg : Arbitrary data to pass to callback (may be NULL) michael@0: */ michael@0: unsigned int x86_disasm_range( unsigned char *buf, uint32_t buf_rva, michael@0: unsigned int offset, unsigned int len, michael@0: DISASM_CALLBACK func, void *arg ); michael@0: michael@0: /* x86_disasm_forward: Flow-of-execution disassembly of the bytes in a buffer, michael@0: * invoking a callback function each time an instruction michael@0: * is successfully disassembled. michael@0: * buf : Buffer to disassemble (e.g. .text section) michael@0: * buf_len : Number of bytes in buffer michael@0: * buf_rva : Load address of buffer (e.g. ELF Virtual Address) michael@0: * offset : Offset in buffer to start disassembly at (e.g. entry point) michael@0: * func : Callback function to invoke (may be NULL) michael@0: * arg : Arbitrary data to pass to callback (may be NULL) michael@0: * resolver: Caller-supplied address resolver. If no resolver is michael@0: * supplied, a default internal one is used -- however the michael@0: * internal resolver does NOT catch loops and could end up michael@0: * disassembling forever.. michael@0: * r_arg : Arbitrary data to pass to resolver (may be NULL) michael@0: */ michael@0: unsigned int x86_disasm_forward( unsigned char *buf, unsigned int buf_len, michael@0: uint32_t buf_rva, unsigned int offset, michael@0: DISASM_CALLBACK func, void *arg, michael@0: DISASM_RESOLVER resolver, void *r_arg ); michael@0: michael@0: /* Instruction operands: these are stored as a list of explicit and michael@0: * implicit operands. 
It is recommended that the 'foreach' routines michael@0: * be used to when examining operands for purposes of data flow analysis */ michael@0: michael@0: /* Operand FOREACH callback: 'arg' is an abritrary parameter passed to the michael@0: * foreach routine, 'insn' is the x86_insn_t whose operands are being michael@0: * iterated over, and 'op' is the current x86_op_t */ michael@0: typedef void (*x86_operand_fn)(x86_op_t *op, x86_insn_t *insn, void *arg); michael@0: michael@0: /* FOREACH types: these are used to limit the foreach results to michael@0: * operands which match a certain "type" (implicit or explicit) michael@0: * or which are accessed in certain ways (e.g. read or write). Note michael@0: * that this operates on the operand list of single instruction, so michael@0: * specifying the 'real' operand type (register, memory, etc) is not michael@0: * useful. Note also that by definition Execute Access implies Read michael@0: * Access and implies Not Write Access. michael@0: * The "type" (implicit or explicit) and the access method can michael@0: * be ORed together, e.g. 
op_wo | op_explicit */ michael@0: enum x86_op_foreach_type { michael@0: op_any = 0, /* ALL operands (explicit, implicit, rwx) */ michael@0: op_dest = 1, /* operands with Write access */ michael@0: op_src = 2, /* operands with Read access */ michael@0: op_ro = 3, /* operands with Read but not Write access */ michael@0: op_wo = 4, /* operands with Write but not Read access */ michael@0: op_xo = 5, /* operands with Execute access */ michael@0: op_rw = 6, /* operands with Read AND Write access */ michael@0: op_implicit = 0x10, /* operands that are implied by the opcode */ michael@0: op_explicit = 0x20 /* operands that are not side-effects */ michael@0: }; michael@0: michael@0: michael@0: /* free the operand list associated with an instruction -- useful for michael@0: * preventing memory leaks when free()ing an x86_insn_t */ michael@0: void x86_oplist_free( x86_insn_t *insn ); michael@0: michael@0: /* Operand foreach: invokes 'func' with 'insn' and 'arg' as arguments. The michael@0: * 'type' parameter is used to select only operands matching specific michael@0: * criteria. 
*/ michael@0: int x86_operand_foreach( x86_insn_t *insn, x86_operand_fn func, void *arg, michael@0: enum x86_op_foreach_type type); michael@0: michael@0: /* convenience routine: returns count of operands matching 'type' */ michael@0: size_t x86_operand_count( x86_insn_t *insn, enum x86_op_foreach_type type ); michael@0: michael@0: /* accessor functions for the operands */ michael@0: x86_op_t * x86_operand_1st( x86_insn_t *insn ); michael@0: x86_op_t * x86_operand_2nd( x86_insn_t *insn ); michael@0: x86_op_t * x86_operand_3rd( x86_insn_t *insn ); michael@0: michael@0: /* these allow libdisasm 2.0 accessor functions to still be used */ michael@0: #define x86_get_dest_operand( insn ) x86_operand_1st( insn ) michael@0: #define x86_get_src_operand( insn ) x86_operand_2nd( insn ) michael@0: #define x86_get_imm_operand( insn ) x86_operand_3rd( insn ) michael@0: michael@0: /* get size of operand data in bytes */ michael@0: unsigned int x86_operand_size( x86_op_t *op ); michael@0: michael@0: /* Operand Convenience Routines: the following three routines are common michael@0: * operations on operands, intended to ease the burden of the programmer. */ michael@0: michael@0: /* Get Address: return the value of an offset operand, or the offset of michael@0: * a segment:offset absolute address */ michael@0: uint32_t x86_get_address( x86_insn_t *insn ); michael@0: michael@0: /* Get Relative Offset: return as a sign-extended int32_t the near or far michael@0: * relative offset operand, or 0 if there is none. There can be only one michael@0: * relaive offset operand in an instruction. */ michael@0: int32_t x86_get_rel_offset( x86_insn_t *insn ); michael@0: michael@0: /* Get Branch Target: return the x86_op_t containing the target of michael@0: * a jump or call operand, or NULL if there is no branch target. michael@0: * Internally, a 'branch target' is defined as any operand with michael@0: * Execute Access set. There can be only one branch target per instruction. 
*/ michael@0: x86_op_t * x86_get_branch_target( x86_insn_t *insn ); michael@0: michael@0: /* Get Immediate: return the x86_op_t containing the immediate operand michael@0: * for this instruction, or NULL if there is no immediate operand. There michael@0: * can be only one immediate operand per instruction */ michael@0: x86_op_t * x86_get_imm( x86_insn_t *insn ); michael@0: michael@0: /* Get Raw Immediate Data: returns a pointer to the immediate data encoded michael@0: * in the instruction. This is useful for large data types [>32 bits] currently michael@0: * not supported by libdisasm, or for determining if the disassembler michael@0: * screwed up the conversion of the immediate data. Note that 'imm' in this michael@0: * context refers to immediate data encoded at the end of an instruction as michael@0: * detailed in the Intel Manual Vol II Chapter 2; it does not refer to the michael@0: * 'op_imm' operand (the third operand in instructions like 'mul' */ michael@0: unsigned char * x86_get_raw_imm( x86_insn_t *insn ); michael@0: michael@0: michael@0: /* More accessor fuctions, this time for user-defined info... */ michael@0: /* set the address (usually RVA) of the insn */ michael@0: void x86_set_insn_addr( x86_insn_t *insn, uint32_t addr ); michael@0: michael@0: /* set the offset (usually offset into file) of the insn */ michael@0: void x86_set_insn_offset( x86_insn_t *insn, unsigned int offset ); michael@0: michael@0: /* set a pointer to the function owning the instruction. The michael@0: * type of 'func' is user-defined; libdisasm does not use the func field. */ michael@0: void x86_set_insn_function( x86_insn_t *insn, void * func ); michael@0: michael@0: /* set a pointer to the block of code owning the instruction. The michael@0: * type of 'block' is user-defined; libdisasm does not use the block field. 
*/ michael@0: void x86_set_insn_block( x86_insn_t *insn, void * block ); michael@0: michael@0: /* instruction tagging: these routines allow the programmer to mark michael@0: * instructions as "seen" in a depth-first search (DFS), for example. michael@0: * libdisasm does not use the tag field. */ michael@0: /* set insn->tag to 1 */ michael@0: void x86_tag_insn( x86_insn_t *insn ); michael@0: /* set insn->tag to 0 */ michael@0: void x86_untag_insn( x86_insn_t *insn ); michael@0: /* return insn->tag (1 after x86_tag_insn, 0 after x86_untag_insn) */ michael@0: int x86_insn_is_tagged( x86_insn_t *insn ); michael@0: michael@0: michael@0: /* Disassembly formats: michael@0: * AT&T is standard AS/GAS-style: "mnemonic\tsrc, dest, imm" michael@0: * Intel is standard MASM/NASM/TASM: "mnemonic\tdest,src, imm" michael@0: * Native is tab-delimited: "RVA\tbytes\tmnemonic\tdest\tsrc\timm" michael@0: * XML is your typical ... michael@0: * Raw is addr|offset|size|bytes|prefix... see libdisasm_formats.7 michael@0: */ michael@0: enum x86_asm_format { michael@0: unknown_syntax = 0, /* never use!
*/ michael@0: native_syntax, /* header: 35 bytes */ michael@0: intel_syntax, /* header: 23 bytes */ michael@0: att_syntax, /* header: 23 bytes */ michael@0: xml_syntax, /* header: 679 bytes */ michael@0: raw_syntax /* header: 172 bytes */ michael@0: }; michael@0: michael@0: /* format (sprintf) an operand into 'buf' using specified syntax */ michael@0: int x86_format_operand(x86_op_t *op, char *buf, int len, michael@0: enum x86_asm_format format); michael@0: michael@0: /* format (sprintf) an instruction mnemonic into 'buf' using specified syntax */ michael@0: int x86_format_mnemonic(x86_insn_t *insn, char *buf, int len, michael@0: enum x86_asm_format format); michael@0: michael@0: /* format (sprintf) an instruction into 'buf' using specified syntax; michael@0: * this includes formatting all operands. 'format' selects the syntax, michael@0: * as in the other x86_format_* routines */ michael@0: int x86_format_insn(x86_insn_t *insn, char *buf, int len, michael@0: enum x86_asm_format format); michael@0: michael@0: /* fill 'buf' with a description of the format's syntax */ michael@0: int x86_format_header( char *buf, int len, enum x86_asm_format format); michael@0: michael@0: /* Endianness of an x86 CPU : 0 is big, 1 is little; always returns 1 */ michael@0: unsigned int x86_endian(void); michael@0: michael@0: /* Default address and operand size in bytes */ michael@0: unsigned int x86_addr_size(void); michael@0: unsigned int x86_op_size(void); michael@0: michael@0: /* Size of a machine word in bytes */ michael@0: unsigned int x86_word_size(void); michael@0: michael@0: /* maximum size of a code instruction */ michael@0: #define x86_max_inst_size(x) x86_max_insn_size(x) michael@0: unsigned int x86_max_insn_size(void); michael@0: michael@0: /* register IDs of Stack, Frame, Instruction pointer and Flags register */ michael@0: unsigned int x86_sp_reg(void); michael@0: unsigned int x86_fp_reg(void); michael@0: unsigned int x86_ip_reg(void); michael@0: unsigned int x86_flag_reg(void); michael@0: michael@0: /* fill 'reg' struct with details of register 'id' */ michael@0: void
x86_reg_from_id( unsigned int id, x86_reg_t * reg ); michael@0: michael@0: /* convenience macro demonstrating how to get an aliased register; proto is michael@0: * void x86_get_aliased_reg( x86_reg_t *alias_reg, x86_reg_t *output_reg ) michael@0: * where 'alias_reg' is a reg operand and 'output_reg' is filled with the michael@0: * register that the operand is an alias for. michael@0: * Both macro parameters are parenthesized in the expansion so that an michael@0: * expression argument (e.g. '&reg') binds as a whole before '->alias' michael@0: * is applied. */ michael@0: #define x86_get_aliased_reg( alias_reg, output_reg ) \ michael@0: x86_reg_from_id( (alias_reg)->alias, (output_reg) ) michael@0: michael@0: michael@0: /* ================================== Invariant Instruction Representation */ michael@0: /* Invariant instructions are used for generating binary signatures; michael@0: * the instruction is modified so that all variant bytes in an instruction michael@0: * are replaced with a wildcard byte. michael@0: * michael@0: * A 'variant byte' is one that is expected to be modified by either the michael@0: * static or the dynamic linker: for example, an address encoded in an michael@0: * instruction. michael@0: * michael@0: * By comparing the invariant representation of one instruction [or of a michael@0: * sequence of instructions] with the invariant representation of another, michael@0: * one can determine whether the two invariant representations are from the same michael@0: * relocatable object [.o] file. Thus one can use binary signatures [which michael@0: * are just sequences of invariant instruction representations] to look for michael@0: * library routines which have been statically-linked into a binary. michael@0: * michael@0: * The invariant routines are faster and smaller than the disassembly michael@0: * routines; they can be used to determine the size of an instruction michael@0: * without all of the overhead of a full instruction disassembly.
michael@0: */ michael@0: michael@0: /* This byte is used to replace variant bytes */ michael@0: #define X86_WILDCARD_BYTE 0xF4 michael@0: michael@0: typedef struct { michael@0: enum x86_op_type type; /* operand type */ michael@0: enum x86_op_datatype datatype; /* operand size */ michael@0: enum x86_op_access access; /* operand access [RWX] */ michael@0: enum x86_op_flags flags; /* misc flags */ michael@0: } x86_invariant_op_t; michael@0: michael@0: typedef struct { michael@0: unsigned char bytes[64]; /* invariant representation */ michael@0: unsigned int size; /* number of bytes in insn */ michael@0: enum x86_insn_group group; /* meta-type, e.g. INS_EXEC */ michael@0: enum x86_insn_type type; /* type, e.g. INS_BRANCH */ michael@0: x86_invariant_op_t operands[3]; /* operands: dest, src, imm */ michael@0: } x86_invariant_t; michael@0: michael@0: michael@0: /* return a version of the instruction with the variant bytes masked out */ michael@0: size_t x86_invariant_disasm( unsigned char *buf, int buf_len, michael@0: x86_invariant_t *inv ); michael@0: /* return the size in bytes of the instruction pointed to by 'buf'; michael@0: * this uses x86_invariant_disasm since it is faster than x86_disasm */ michael@0: size_t x86_size_disasm( unsigned char *buf, unsigned int buf_len ); michael@0: michael@0: #ifdef __cplusplus michael@0: } michael@0: #endif michael@0: michael@0: michael@0: #endif