michael@0: // Copyright (c) 2012 The Chromium Authors. All rights reserved.
michael@0: // Use of this source code is governed by a BSD-style license that can be
michael@0: // found in the LICENSE file.
michael@0: 
michael@0: // Implementation of MiniDisassembler.
michael@0: 
michael@0: #ifdef _WIN64
michael@0: #error The code in this file should not be used on 64-bit Windows.
michael@0: #endif
michael@0: 
michael@0: #include "sandbox/win/src/sidestep/mini_disassembler.h"
michael@0: 
michael@0: namespace sidestep {
michael@0: 
michael@0: // Builds a disassembler whose default operand size and address size are
michael@0: // chosen by the caller (true means 32-bit), then resets the per-instruction
michael@0: // decoding state.
michael@0: MiniDisassembler::MiniDisassembler(bool operand_default_is_32_bits,
michael@0:                                    bool address_default_is_32_bits)
michael@0:     : operand_default_is_32_bits_(operand_default_is_32_bits),
michael@0:       address_default_is_32_bits_(address_default_is_32_bits) {
michael@0:   // Initialize() copies the defaults stored above into the working flags,
michael@0:   // so it must run after the member initializers.
michael@0:   this->Initialize();
michael@0: }
michael@0: 
michael@0: // Builds a disassembler that defaults to 32-bit operands and 32-bit
michael@0: // addresses, then resets the per-instruction decoding state.
michael@0: MiniDisassembler::MiniDisassembler()
michael@0:     : operand_default_is_32_bits_(true),
michael@0:       address_default_is_32_bits_(true) {
michael@0:   // Initialize() copies the defaults stored above into the working flags,
michael@0:   // so it must run after the member initializers.
michael@0:   this->Initialize();
michael@0: }
michael@0: 
michael@0: // Decodes the single instruction that begins at start_byte.
michael@0: //
michael@0: // start_byte:        first byte (prefix or opcode) of the instruction.
michael@0: // instruction_bytes: the decoded instruction's length in bytes is ADDED to
michael@0: //                    the value already stored here; it is not overwritten.
michael@0: //
michael@0: // Returns the instruction type recorded in instruction_type_ by
michael@0: // ProcessOpcode().  Returns IT_UNKNOWN, leaving *instruction_bytes
michael@0: // untouched, when prefix processing reports IT_UNKNOWN or when opcode
michael@0: // processing leaves instruction_type_ as IT_UNKNOWN or IT_UNUSED.
michael@0: InstructionType MiniDisassembler::Disassemble(
michael@0:     unsigned char* start_byte,
michael@0:     unsigned int* instruction_bytes) {
michael@0:   // Forget everything learned while decoding a previous instruction.
michael@0:   Initialize();
michael@0: 
michael@0:   // Step 1: consume any prefix bytes.
michael@0:   unsigned int prefix_length = 0;
michael@0:   const InstructionType prefix_result =
michael@0:       ProcessPrefixes(start_byte, &prefix_length);
michael@0:   if (IT_UNKNOWN == prefix_result)
michael@0:     return prefix_result;
michael@0: 
michael@0:   // Invariant: all prefixes have been stripped, and the operand_is_32_bits_
michael@0:   // and address_is_32_bits_ flags are correctly set.
michael@0:   unsigned char* const opcode_start = start_byte + prefix_length;
michael@0: 
michael@0:   // Step 2: decode the opcode, starting from opcode table 0.  The outcome is
michael@0:   // read from instruction_type_ rather than from the return value.
michael@0:   unsigned int opcode_length = 0;
michael@0:   ProcessOpcode(opcode_start, 0, &opcode_length);
michael@0:   if ((IT_UNKNOWN == instruction_type_) || (IT_UNUSED == instruction_type_))
michael@0:     return IT_UNKNOWN;
michael@0: 
michael@0:   // Invariant: operand_bytes_ holds the total size of the operands specified
michael@0:   // by the opcode and/or ModR/M byte and/or SIB byte.  operands_start points
michael@0:   // just past the ModR/M byte (or past the SIB byte if there is one), i.e.
michael@0:   // at the first operand byte encoded in the instruction, or at the next
michael@0:   // instruction if there are no encoded operands.
michael@0:   unsigned char* const operands_start = opcode_start + opcode_length;
michael@0: 
michael@0:   // Step 3: total length = (prefixes + opcode + ModR/M + SIB), obtained as
michael@0:   // the distance from the original start, plus the encoded operand bytes.
michael@0:   *instruction_bytes += operand_bytes_ + (operands_start - start_byte);
michael@0: 
michael@0:   // The instruction type was set by ProcessOpcode().
michael@0:   return instruction_type_;
michael@0: }
michael@0: 
michael@0: // Resets all per-instruction decoding state so that the next instruction
michael@0: // starts from the configured defaults.
michael@0: void MiniDisassembler::Initialize() {
michael@0:   // No prefix bytes have been seen yet.
michael@0:   got_66_prefix_ = false;
michael@0:   got_f2_prefix_ = false;
michael@0:   got_f3_prefix_ = false;
michael@0: 
michael@0:   // Operand and address sizes revert to their defaults until a size
michael@0:   // override prefix says otherwise.
michael@0:   address_is_32_bits_ = address_default_is_32_bits_;
michael@0:   operand_is_32_bits_ = operand_default_is_32_bits_;
michael@0: 
michael@0:   // Nothing is known about the opcode, its ModR/M byte or its operands.
michael@0:   instruction_type_ = IT_UNKNOWN;
michael@0:   should_decode_modrm_ = false;
michael@0:   have_modrm_ = false;
michael@0:   operand_bytes_ = 0;
michael@0: }
michael@0: 
michael@0: // Consumes the run of prefix bytes that starts at start_byte.
michael@0: //
michael@0: // For every prefix byte found, *size is incremented by one, the
michael@0: // F2/F3/66 "seen" flags are updated, and an address-size or operand-size
michael@0: // prefix flips the corresponding working flag away from its default.
michael@0: //
michael@0: // Returns the table type of the first byte if it is a prefix, or
michael@0: // IT_GENERIC if the first byte is not a prefix at all.
michael@0: InstructionType MiniDisassembler::ProcessPrefixes(unsigned char* start_byte,
michael@0:                                                   unsigned int* size) {
michael@0:   InstructionType first_prefix_type = IT_GENERIC;
michael@0: 
michael@0:   for (unsigned char* cursor = start_byte; ; ++cursor) {
michael@0:     // Prefix bytes are classified by the first opcode table.
michael@0:     const Opcode& opcode = s_ia32_opcode_map_[0].table_[*cursor];
michael@0: 
michael@0:     const bool is_address_prefix = (IT_PREFIX_ADDRESS == opcode.type_);
michael@0:     const bool is_operand_prefix = (IT_PREFIX_OPERAND == opcode.type_);
michael@0:     const bool is_other_prefix = (IT_PREFIX == opcode.type_);
michael@0:     if (!is_address_prefix && !is_operand_prefix && !is_other_prefix)
michael@0:       break;  // not a prefix byte
michael@0: 
michael@0:     // Size-override prefixes select the non-default size.
michael@0:     if (is_address_prefix)
michael@0:       address_is_32_bits_ = !address_default_is_32_bits_;
michael@0:     else if (is_operand_prefix)
michael@0:       operand_is_32_bits_ = !operand_default_is_32_bits_;
michael@0: 
michael@0:     // Remember the prefixes that can select a different opcode variant.
michael@0:     switch (*cursor) {
michael@0:       case 0xF2:
michael@0:         got_f2_prefix_ = true;
michael@0:         break;
michael@0:       case 0xF3:
michael@0:         got_f3_prefix_ = true;
michael@0:         break;
michael@0:       case 0x66:
michael@0:         got_66_prefix_ = true;
michael@0:         break;
michael@0:       default:
michael@0:         break;
michael@0:     }
michael@0: 
michael@0:     // Only the type of the very first prefix is reported to the caller.
michael@0:     if (cursor == start_byte)
michael@0:       first_prefix_type = opcode.type_;
michael@0: 
michael@0:     // We got a prefix, so count it and go on to check the next byte.
michael@0:     (*size)++;
michael@0:   }
michael@0: 
michael@0:   return first_prefix_type;
michael@0: }
michael@0: 
michael@0: // Looks up the byte at start_byte in opcode table table_index and records
michael@0: // the outcome in instruction_type_.
michael@0: //
michael@0: // start_byte:  byte to decode with the selected table.
michael@0: // table_index: index into s_ia32_opcode_map_ of the table to consult.
michael@0: // size:        incremented by the number of opcode, ModR/M and SIB bytes
michael@0: //              consumed.
michael@0: //
michael@0: // Returns IT_GENERIC when an opcode entry was decoded successfully.
michael@0: // Otherwise returns instruction_type_: IT_UNKNOWN or IT_UNUSED on failure,
michael@0: // or whatever a nested table lookup recorded.  Callers should read
michael@0: // instruction_type_ to learn the actual type of the instruction.
michael@0: InstructionType MiniDisassembler::ProcessOpcode(unsigned char* start_byte,
michael@0:                                                 unsigned int table_index,
michael@0:                                                 unsigned int* size) {
michael@0:   const OpcodeTable& table = s_ia32_opcode_map_[table_index];   // Get our table
michael@0:   unsigned char current_byte = (*start_byte) >> table.shift_;
michael@0:   current_byte = current_byte & table.mask_;  // Mask out the bits we will use
michael@0: 
michael@0:   // Check whether the byte we have is inside the table we have.
michael@0:   if (current_byte < table.min_lim_ || current_byte > table.max_lim_) {
michael@0:     instruction_type_ = IT_UNKNOWN;
michael@0:     return instruction_type_;
michael@0:   }
michael@0: 
michael@0:   const Opcode& opcode = table.table_[current_byte];
michael@0:   if (IT_UNUSED == opcode.type_) {
michael@0:     // This instruction is not used by the IA-32 ISA, so we indicate
michael@0:     // this to the user.  Probably means that we were pointed to
michael@0:     // a byte in memory that was not the start of an instruction.
michael@0:     instruction_type_ = IT_UNUSED;
michael@0:     return instruction_type_;
michael@0:   }
michael@0:   if (IT_REFERENCE == opcode.type_) {
michael@0:     // We are looking at an opcode that has more bytes (or is continued
michael@0:     // in the ModR/M byte).  Recursively find the opcode definition in
michael@0:     // the table for the opcode's next byte.
michael@0:     (*size)++;
michael@0:     ProcessOpcode(start_byte + 1, opcode.table_index_, size);
michael@0:     return instruction_type_;
michael@0:   }
michael@0: 
michael@0:   // By default the entry itself describes the instruction; a prefix seen
michael@0:   // earlier may select an alternative description instead.
michael@0:   const SpecificOpcode* specific_opcode = reinterpret_cast<
michael@0:                                               const SpecificOpcode*>(&opcode);
michael@0:   if (opcode.is_prefix_dependent_) {
michael@0:     if (got_f2_prefix_ && opcode.opcode_if_f2_prefix_.mnemonic_ != 0) {
michael@0:       specific_opcode = &opcode.opcode_if_f2_prefix_;
michael@0:     } else if (got_f3_prefix_ && opcode.opcode_if_f3_prefix_.mnemonic_ != 0) {
michael@0:       specific_opcode = &opcode.opcode_if_f3_prefix_;
michael@0:     } else if (got_66_prefix_ && opcode.opcode_if_66_prefix_.mnemonic_ != 0) {
michael@0:       specific_opcode = &opcode.opcode_if_66_prefix_;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // Inv: The opcode type is known.
michael@0:   instruction_type_ = specific_opcode->type_;
michael@0: 
michael@0:   // Let's process the operand types to see if we have any immediate
michael@0:   // operands, and/or a ModR/M byte.  All three operands are always
michael@0:   // processed; a failure in any of them is remembered.
michael@0:   bool operands_ok = ProcessOperand(specific_opcode->flag_dest_);
michael@0:   operands_ok = ProcessOperand(specific_opcode->flag_source_) && operands_ok;
michael@0:   operands_ok = ProcessOperand(specific_opcode->flag_aux_) && operands_ok;
michael@0: 
michael@0:   if (!operands_ok) {
michael@0:     // ProcessOperand() could not size one of the operands, so any length
michael@0:     // we computed would be wrong.  Report the instruction as unknown
michael@0:     // instead of silently ignoring the failure.
michael@0:     instruction_type_ = IT_UNKNOWN;
michael@0:     return instruction_type_;
michael@0:   }
michael@0: 
michael@0:   // Inv: We have processed the opcode and incremented operand_bytes_
michael@0:   // by the number of bytes of any operands specified by the opcode
michael@0:   // that are stored in the instruction (not registers etc.).  Now
michael@0:   // we need to account for the opcode byte and for the ModR/M or SIB
michael@0:   // bytes if they are present.
michael@0:   if (table.mask_ != 0xff) {
michael@0:     if (have_modrm_) {
michael@0:       // We're looking at a ModR/M byte, so it is not counted as part of
michael@0:       // the opcode; ProcessModrm() counts it instead.
michael@0:       ProcessModrm(start_byte, size);
michael@0:     } else {
michael@0:       // Need to count the ModR/M byte even if it's just being
michael@0:       // used for opcode extension.
michael@0:       (*size)++;
michael@0:     }
michael@0:   } else {
michael@0:     // Count the opcode byte itself.
michael@0:     (*size)++;
michael@0:     if (have_modrm_) {
michael@0:       // The ModR/M byte is the next byte.
michael@0:       ProcessModrm(start_byte + 1, size);
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   return IT_GENERIC;
michael@0: }
michael@0: 
michael@0: // Updates the decoding state for one operand description.
michael@0: //
michael@0: // flag_operand combines an addressing mode (selected with AM_MASK) and an
michael@0: // operand type (selected with OT_MASK).  Depending on the addressing mode
michael@0: // this either does nothing, notes that a ModR/M byte is present (and
michael@0: // possibly that it must be decoded), or adds the size of an immediate or
michael@0: // offset operand to operand_bytes_.
michael@0: //
michael@0: // Returns false only for an immediate/offset operand of type OT_SS, which
michael@0: // is not supported; returns true in every other case.
michael@0: bool MiniDisassembler::ProcessOperand(int flag_operand) {
michael@0:   if (AM_NOT_USED == flag_operand)
michael@0:     return true;
michael@0: 
michael@0:   // First decide what to do based on the addressing mode.
michael@0:   switch (flag_operand & AM_MASK) {
michael@0:     // These addressing modes specify an immediate or an offset value
michael@0:     // directly; the operand type examined below tells us how many bytes.
michael@0:     case AM_I:  // Immediate data.
michael@0:     case AM_J:  // Jump to offset.
michael@0:     case AM_O:  // Operand is at offset.
michael@0:       break;
michael@0: 
michael@0:     // There is a ModR/M byte and it needs to be decoded.  No other
michael@0:     // (e.g. immediate) params than indicated in ModR/M.
michael@0:     case AM_E:  // General-purpose register or memory, specified by ModR/M
michael@0:     case AM_M:  // ModR/M byte will refer only to memory
michael@0:     case AM_Q:  // MMX register or memory (complex evaluation), specified
michael@0:                 // by ModR/M byte
michael@0:     case AM_W:  // 128-bit XMM register or memory (complex evaluation),
michael@0:                 // specified by ModR/M byte
michael@0:       should_decode_modrm_ = true;
michael@0:       // Fall through: these modes also imply the presence of a ModR/M byte.
michael@0: 
michael@0:     // There is a ModR/M byte but it does not necessarily need
michael@0:     // to be decoded.
michael@0:     case AM_C:  // reg field of ModR/M selects a control register
michael@0:     case AM_D:  // reg field of ModR/M selects a debug register
michael@0:     case AM_G:  // reg field of ModR/M selects a general register
michael@0:     case AM_P:  // reg field of ModR/M selects an MMX register
michael@0:     case AM_R:  // mod field of ModR/M may refer only to a general register
michael@0:     case AM_S:  // reg field of ModR/M selects a segment register
michael@0:     case AM_T:  // reg field of ModR/M selects a test register
michael@0:     case AM_V:  // reg field of ModR/M selects a 128-bit XMM register
michael@0:       have_modrm_ = true;
michael@0:       return true;
michael@0: 
michael@0:     // No ModR/M byte indicated by these addressing modes, and no
michael@0:     // additional (e.g. immediate) parameters.
michael@0:     case AM_A:         // Direct address
michael@0:     case AM_F:         // EFLAGS register
michael@0:     case AM_X:         // Memory addressed by the DS:SI register pair
michael@0:     case AM_Y:         // Memory addressed by the ES:DI register pair
michael@0:     case AM_IMPLICIT:  // Implicit, occupies no space in the instruction
michael@0:     default:
michael@0:       return true;
michael@0:   }
michael@0: 
michael@0:   // Immediate or offset operand: its encoded size follows from its type.
michael@0:   switch (flag_operand & OT_MASK) {
michael@0:     case OT_B:  // Byte regardless of operand-size attribute.
michael@0:       operand_bytes_ += OS_BYTE;
michael@0:       return true;
michael@0: 
michael@0:     case OT_C:  // Byte or word, depending on operand-size attribute.
michael@0:       operand_bytes_ += operand_is_32_bits_ ? OS_WORD : OS_BYTE;
michael@0:       return true;
michael@0: 
michael@0:     case OT_D:  // Doubleword, regardless of operand-size attribute.
michael@0:       operand_bytes_ += OS_DOUBLE_WORD;
michael@0:       return true;
michael@0: 
michael@0:     case OT_DQ:  // Double-quadword, regardless of operand-size attribute.
michael@0:       operand_bytes_ += OS_DOUBLE_QUAD_WORD;
michael@0:       return true;
michael@0: 
michael@0:     case OT_P:  // 32-bit or 48-bit pointer, depending on operand-size
michael@0:                 // attribute.
michael@0:       operand_bytes_ +=
michael@0:           operand_is_32_bits_ ? OS_48_BIT_POINTER : OS_32_BIT_POINTER;
michael@0:       return true;
michael@0: 
michael@0:     case OT_PS:  // 128-bit packed single-precision floating-point data.
michael@0:       operand_bytes_ += OS_128_BIT_PACKED_SINGLE_PRECISION_FLOATING;
michael@0:       return true;
michael@0: 
michael@0:     case OT_Q:  // Quadword, regardless of operand-size attribute.
michael@0:       operand_bytes_ += OS_QUAD_WORD;
michael@0:       return true;
michael@0: 
michael@0:     case OT_S:  // 6-byte pseudo-descriptor.
michael@0:       operand_bytes_ += OS_PSEUDO_DESCRIPTOR;
michael@0:       return true;
michael@0: 
michael@0:     case OT_SD:  // Scalar double-precision floating-point value
michael@0:     case OT_PD:  // Unaligned packed double-precision floating-point value
michael@0:       operand_bytes_ += OS_DOUBLE_PRECISION_FLOATING;
michael@0:       return true;
michael@0: 
michael@0:     case OT_SS:
michael@0:       // Scalar element of a 128-bit packed single-precision floating
michael@0:       // data.  Floating point does not have to be supported here, so
michael@0:       // this is reported as a failure.
michael@0:       return false;
michael@0: 
michael@0:     case OT_V:  // Word or doubleword, depending on operand-size attribute.
michael@0:       operand_bytes_ += operand_is_32_bits_ ? OS_DOUBLE_WORD : OS_WORD;
michael@0:       return true;
michael@0: 
michael@0:     case OT_W:  // Word, regardless of operand-size attribute.
michael@0:       operand_bytes_ += OS_WORD;
michael@0:       return true;
michael@0: 
michael@0:     // Can safely ignore these.
michael@0:     case OT_A:   // Two one-word operands in memory or two double-word
michael@0:                  // operands in memory
michael@0:     case OT_PI:  // Quadword MMX technology register (e.g. mm0)
michael@0:     case OT_SI:  // Doubleword integer register (e.g., eax)
michael@0:     default:
michael@0:       return true;
michael@0:   }
michael@0: }
michael@0: 
michael@0: // Accounts for the ModR/M byte at start_byte and, when it calls for one,
michael@0: // the SIB byte that follows it.
michael@0: //
michael@0: // *size is incremented once for the ModR/M byte and once more (inside
michael@0: // ProcessSib()) for a SIB byte.  Any operand bytes that the ModR/M table
michael@0: // entry says are encoded in the instruction are added to operand_bytes_.
michael@0: // Returns true, or the result of ProcessSib() when a SIB byte is used.
michael@0: bool MiniDisassembler::ProcessModrm(unsigned char* start_byte,
michael@0:                                     unsigned int* size) {
michael@0:   // If we don't need to decode, only the ModR/M byte itself is counted
michael@0:   // (there is never a SIB byte in this case).
michael@0:   if (!should_decode_modrm_) {
michael@0:     (*size)++;
michael@0:     return true;
michael@0:   }
michael@0: 
michael@0:   // The reg field (bits 5..3) is never needed, only mod (bits 7..6) and
michael@0:   // r/m (bits 2..0).  Pack those two into a 5-bit table index with mod in
michael@0:   // bits 4..3 and r/m in bits 2..0, and keep mod on its own in bits 1..0.
michael@0:   const unsigned char raw_modrm = *start_byte;
michael@0:   const unsigned char mod_bits = raw_modrm & 0xC0;
michael@0:   const unsigned char rm_bits = raw_modrm & 0x07;
michael@0:   const unsigned char table_index = (mod_bits >> 3) | rm_bits;
michael@0:   const unsigned char mod = mod_bits >> 6;
michael@0: 
michael@0:   // Pick the entry from the 32-bit or 16-bit addressing table.
michael@0:   const ModrmEntry* modrm_entry = address_is_32_bits_
michael@0:                                       ? &s_ia32_modrm_map_[table_index]
michael@0:                                       : &s_ia16_modrm_map_[table_index];
michael@0: 
michael@0:   // Invariant: modrm_entry points to the information needed to decode
michael@0:   // the ModR/M byte.
michael@0: 
michael@0:   // Add to the count of operand bytes if the ModR/M byte indicates that
michael@0:   // some operands are encoded in the instruction.
michael@0:   if (modrm_entry->is_encoded_in_instruction_)
michael@0:     operand_bytes_ += modrm_entry->operand_size_;
michael@0: 
michael@0:   // Count the ModR/M byte itself.
michael@0:   (*size)++;
michael@0: 
michael@0:   if (!modrm_entry->use_sib_byte_)
michael@0:     return true;
michael@0: 
michael@0:   // The SIB byte immediately follows the ModR/M byte.
michael@0:   return ProcessSib(start_byte + 1, mod, size);
michael@0: }
michael@0: 
michael@0: // Accounts for the SIB byte at start_byte.
michael@0: //
michael@0: // mod is the mod field of the preceding ModR/M byte, already shifted down
michael@0: // to the low bits.  *size is incremented by one for the SIB byte, and
michael@0: // operand_bytes_ grows when the SIB base field is 101b and mod is 00, 01
michael@0: // or 10.  Always returns true.
michael@0: bool MiniDisassembler::ProcessSib(unsigned char* start_byte,
michael@0:                                   unsigned char mod,
michael@0:                                   unsigned int* size) {
michael@0:   // The base field lives in bits 2..0 of the SIB byte.
michael@0:   const unsigned char sib_base = (*start_byte) & 0x07;
michael@0: 
michael@0:   // NOTE(review): for mod == 01 and mod == 10 the displacement may already
michael@0:   // be counted through the ModR/M table entry -- confirm against the
michael@0:   // ModR/M tables that this does not count it twice.
michael@0:   if (0x05 == sib_base) {
michael@0:     if (0x00 == mod || 0x02 == mod) {
michael@0:       // mod == 00 or mod == 10
michael@0:       operand_bytes_ += OS_DOUBLE_WORD;
michael@0:     } else if (0x01 == mod) {
michael@0:       // mod == 01
michael@0:       operand_bytes_ += OS_BYTE;
michael@0:     }
michael@0:     // mod == 11: according to the IA-32 docs, there does not seem to be a
michael@0:     // disp value for this value of mod, so nothing is added.
michael@0:   }
michael@0: 
michael@0:   // Count the SIB byte itself.
michael@0:   (*size)++;
michael@0:   return true;
michael@0: }
michael@0: 
michael@0: };  // namespace sidestep