diff --git a/asm_differ/__init__.py b/asm_differ/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/asm_differ/diff.py b/asm_differ/diff.py new file mode 100755 index 0000000..ca34574 --- /dev/null +++ b/asm_differ/diff.py @@ -0,0 +1,4014 @@ +#!/usr/bin/env python3 +# PYTHON_ARGCOMPLETE_OK +import argparse +import enum +import sys +from typing import ( + Any, + Callable, + Dict, + Iterator, + List, + Sequence, + Match, + NoReturn, + Optional, + Pattern, + Set, + Tuple, + Type, + Union, +) + + +def fail(msg: str) -> NoReturn: + print(msg, file=sys.stderr) + sys.exit(1) + + +def static_assert_unreachable(x: NoReturn) -> NoReturn: + raise Exception("Unreachable! " + repr(x)) + + +# The encoding to use for reading the map file to text. +# It is expected the map file will be ASCII-only or mostly, +# which UTF-8 is compatible with. +MAPFILE_ENCODING = "UTF-8" +# In case the map file isn't UTF-8 and some bytes can't be decoded as UTF-8, +# don't error and subtitute with U+FFFD. +# (cf the Python documentation on `codecs.replace_errors`) +MAPFILE_ENCODING_ERROR_HANDLER = "replace" + + +class DiffMode(enum.Enum): + SINGLE = "single" + SINGLE_BASE = "single_base" + NORMAL = "normal" + THREEWAY_PREV = "3prev" + THREEWAY_BASE = "3base" + + +# ==== COMMAND-LINE ==== + +parser = argparse.ArgumentParser( + description="Diff MIPS, PPC, AArch64, ARM32, SH2, SH4, or m68k assembly." +) +start_argument = parser.add_argument( + "start", + help="Function name or address to start diffing from.", +) +parser.add_argument( + "end", + nargs="?", + help="Address to end diff at.", +) +parser.add_argument( + "-o", + dest="diff_obj", + action="store_true", + help="""Diff .o files rather than a whole binary. This makes it possible to + see symbol names. (Recommended)""", +) +parser.add_argument( + "-f", + "--file", + dest="file", + type=str, + help="""File path for a file being diffed. When used the map + file isn't searched for the function given. Useful for dynamically + linked libraries.""", +) +parser.add_argument( + "-e", + "--elf", + dest="diff_elf_symbol", + metavar="SYMBOL", + help="""Diff a given function in two ELFs, one being stripped and the other + one non-stripped. Requires objdump from binutils 2.33+.""", +) +parser.add_argument( + "-c", + "--source", + dest="show_source", + action="store_true", + help="Show source code (if possible). Only works with -o or -e.", +) +parser.add_argument( + "-C", + "--source-old-binutils", + dest="source_old_binutils", + action="store_true", + help="""Tweak --source handling to make it work with binutils < 2.33. + Implies --source.""", +) +parser.add_argument( + "-j", + "--section", + dest="diff_section", + default=".text", + metavar="SECTION", + help="Diff restricted to a given output section.", +) +parser.add_argument( + "-L", + "--line-numbers", + dest="show_line_numbers", + action="store_const", + const=True, + help="""Show source line numbers in output, when available. May be enabled by + default depending on diff_settings.py.""", +) +parser.add_argument( + "--no-line-numbers", + dest="show_line_numbers", + action="store_const", + const=False, + help="Hide source line numbers in output.", +) +parser.add_argument( + "--inlines", + dest="inlines", + action="store_true", + help="Show inline function calls (if possible). 
Only works with -o or -e.", +) +parser.add_argument( + "--base-asm", + dest="base_asm", + metavar="FILE", + help="Read assembly from given file instead of configured base img.", +) +parser.add_argument( + "--write-asm", + dest="write_asm", + metavar="FILE", + help="Write the current assembly output to file, e.g. for use with --base-asm.", +) +parser.add_argument( + "-m", + "--make", + dest="make", + action="store_true", + help="Automatically run 'make' on the .o file or binary before diffing.", +) +parser.add_argument( + "-l", + "--skip-lines", + dest="skip_lines", + metavar="LINES", + type=int, + default=0, + help="Skip the first LINES lines of output.", +) +parser.add_argument( + "-s", + "--stop-at-ret", + dest="stop_at_ret", + action="count", + help="""Stop disassembling at the first return instruction. + You can also pass -ss to stop at the second return instruction, and so on.""", +) +parser.add_argument( + "-i", + "--ignore-large-imms", + dest="ignore_large_imms", + action="store_true", + help="Pretend all large enough immediates are the same.", +) +parser.add_argument( + "-I", + "--ignore-addr-diffs", + dest="ignore_addr_diffs", + action="store_true", + help="Ignore address differences. Currently only affects AArch64 and ARM32.", +) +parser.add_argument( + "-B", + "--no-show-branches", + dest="show_branches", + action="store_false", + help="Don't visualize branches/branch targets.", +) +parser.add_argument( + "-R", + "--no-show-rodata-refs", + dest="show_rodata_refs", + action="store_false", + help="Don't show .rodata -> .text references (typically from jump tables).", +) +parser.add_argument( + "-S", + "--base-shift", + dest="base_shift", + metavar="N", + type=str, + default="0", + help="""Diff position N in our img against position N + shift in the base img. + Arithmetic is allowed, so e.g. |-S "0x1234 - 0x4321"| is a reasonable + flag to pass if it is known that position 0x1234 in the base img syncs + up with position 0x4321 in our img. Not supported together with -o.""", +) +parser.add_argument( + "-w", + "--watch", + dest="watch", + action="store_true", + help="""Automatically update when source/object files change. + Recommended in combination with -m.""", +) +parser.add_argument( + "-y", + "--yes", + dest="agree", + action="store_true", + help="""Automatically agree to any yes/no questions asked. + Useful if you really want to use the -w option without -m.""", +) +parser.add_argument( + "-0", + "--diff_mode=single_base", + dest="diff_mode", + action="store_const", + const=DiffMode.SINGLE_BASE, + help="""View the base asm only (not a diff).""", +) +parser.add_argument( + "-1", + "--diff_mode=single", + dest="diff_mode", + action="store_const", + const=DiffMode.SINGLE, + help="""View the current asm only (not a diff).""", +) +parser.add_argument( + "-3", + "--threeway=prev", + dest="diff_mode", + action="store_const", + const=DiffMode.THREEWAY_PREV, + help="""Show a three-way diff between target asm, current asm, and asm + prior to -w rebuild. Requires -w.""", +) +parser.add_argument( + "-b", + "--threeway=base", + dest="diff_mode", + action="store_const", + const=DiffMode.THREEWAY_BASE, + help="""Show a three-way diff between target asm, current asm, and asm + when diff.py was started. 
Requires -w.""", +) +parser.add_argument( + "--width", + dest="column_width", + metavar="COLS", + type=int, + default=50, + help="Sets the width of the left and right view column.", +) +parser.add_argument( + "--algorithm", + dest="algorithm", + default="levenshtein", + choices=["levenshtein", "difflib"], + help="""Diff algorithm to use. Levenshtein gives the minimum diff, while difflib + aims for long sections of equal opcodes. Defaults to %(default)s.""", +) +parser.add_argument( + "--max-size", + "--max-lines", + metavar="LINES", + dest="max_lines", + type=int, + default=1024, + help="The maximum length of the diff, in lines.", +) +parser.add_argument( + "--no-pager", + dest="no_pager", + action="store_true", + help="""Disable the pager; write output directly to stdout, then exit. + Incompatible with --watch.""", +) +parser.add_argument( + "--format", + choices=("color", "plain", "html", "json"), + default="color", + help="Output format, default is color. --format=html or json implies --no-pager.", +) +parser.add_argument( + "-U", + "--compress-matching", + metavar="N", + dest="compress_matching", + type=int, + help="""Compress streaks of matching lines, leaving N lines of context + around non-matching parts.""", +) +parser.add_argument( + "-V", + "--compress-sameinstr", + metavar="N", + dest="compress_sameinstr", + type=int, + help="""Compress streaks of lines with same instructions (but possibly + different regalloc), leaving N lines of context around other parts.""", +) +parser.add_argument( + "-d", + "--diff-function-symbols", + dest="diff_function_symbols", + action="store_true", + help="Include and diff function symbols.", +) + +# ==== IMPORTS ==== + +# (We do imports late to optimize auto-complete performance.) + +import abc +from collections import Counter, defaultdict +from dataclasses import asdict, dataclass, field, replace +import difflib +import html +import itertools +import json +import os +import queue +import re +import string +import struct +import subprocess +import threading +import time +import traceback + + +MISSING_PREREQUISITES = ( + "Missing prerequisite python module {}. " + "Run `python3 -m pip install --user colorama watchdog levenshtein cxxfilt` to install prerequisites (cxxfilt only needed with --source)." 
+) + +try: + from colorama import Back, Fore, Style + import watchdog +except ModuleNotFoundError as e: + fail(MISSING_PREREQUISITES.format(e.name)) + +# ==== CONFIG ==== + + +@dataclass +class ProjectSettings: + arch_str: str + objdump_executable: str + objdump_flags: List[str] + build_command: List[str] + map_format: str + build_dir: str + map_address_offset: int + baseimg: Optional[str] + myimg: Optional[str] + mapfile: Optional[str] + source_directories: Optional[List[str]] + source_extensions: List[str] + show_line_numbers_default: bool + disassemble_all: bool + reg_categories: Dict[str, int] + expected_dir: str + + +@dataclass +class Compress: + context: int + same_instr: bool + + +@dataclass +class Config: + arch: "ArchSettings" + + # Build/objdump options + diff_obj: bool + file: Optional[str] + make: bool + source_old_binutils: bool + diff_section: str + inlines: bool + max_function_size_lines: int + max_function_size_bytes: int + + # Display options + formatter: "Formatter" + diff_mode: DiffMode + base_shift: int + skip_lines: int + compress: Optional[Compress] + show_rodata_refs: bool + show_branches: bool + show_line_numbers: bool + show_source: bool + stop_at_ret: Optional[int] + ignore_large_imms: bool + ignore_addr_diffs: bool + algorithm: str + reg_categories: Dict[str, int] + diff_function_symbols: bool + + # Score options + score_stack_differences = True + penalty_stackdiff = 1 + penalty_regalloc = 5 + penalty_reordering = 60 + penalty_insertion = 100 + penalty_deletion = 100 + + +def create_project_settings(settings: Dict[str, Any]) -> ProjectSettings: + return ProjectSettings( + arch_str=settings.get("arch", "mips"), + baseimg=settings.get("baseimg"), + myimg=settings.get("myimg"), + mapfile=settings.get("mapfile"), + build_command=settings.get( + "make_command", ["make", *settings.get("makeflags", [])] + ), + source_directories=settings.get("source_directories"), + source_extensions=settings.get( + "source_extensions", [".c", ".h", ".cpp", ".hpp", ".s"] + ), + objdump_executable=get_objdump_executable(settings.get("objdump_executable")), + objdump_flags=settings.get("objdump_flags", []), + expected_dir=settings.get("expected_dir", "expected/"), + map_format=settings.get("map_format", "gnu"), + map_address_offset=settings.get( + "map_address_offset", settings.get("ms_map_address_offset", 0) + ), + build_dir=settings.get("build_dir", settings.get("mw_build_dir", "build/")), + show_line_numbers_default=settings.get("show_line_numbers_default", True), + disassemble_all=settings.get("disassemble_all", False), + reg_categories=settings.get("reg_categories", {}), + ) + + +def create_config(args: argparse.Namespace, project: ProjectSettings) -> Config: + arch = get_arch(project.arch_str) + + formatter: Formatter + if args.format == "plain": + formatter = PlainFormatter(column_width=args.column_width) + elif args.format == "color": + formatter = AnsiFormatter(column_width=args.column_width) + elif args.format == "html": + formatter = HtmlFormatter() + elif args.format == "json": + formatter = JsonFormatter(arch_str=arch.name) + else: + raise ValueError(f"Unsupported --format: {args.format}") + + compress = None + if args.compress_matching is not None: + compress = Compress(args.compress_matching, False) + if args.compress_sameinstr is not None: + if compress is not None: + raise ValueError( + "Cannot pass both --compress-matching and --compress-sameinstr" + ) + compress = Compress(args.compress_sameinstr, True) + + show_line_numbers = args.show_line_numbers + if 
show_line_numbers is None: + show_line_numbers = project.show_line_numbers_default + + return Config( + arch=arch, + # Build/objdump options + diff_obj=args.diff_obj, + file=args.file, + make=args.make, + source_old_binutils=args.source_old_binutils + or "llvm-" in project.objdump_executable, + diff_section=args.diff_section, + inlines=args.inlines, + max_function_size_lines=args.max_lines, + max_function_size_bytes=args.max_lines * 4, + # Display options + formatter=formatter, + diff_mode=args.diff_mode or DiffMode.NORMAL, + base_shift=eval_int( + args.base_shift, "Failed to parse --base-shift (-S) argument as an integer." + ), + skip_lines=args.skip_lines, + compress=compress, + show_rodata_refs=args.show_rodata_refs, + show_branches=args.show_branches, + show_line_numbers=show_line_numbers, + show_source=args.show_source or args.source_old_binutils, + stop_at_ret=args.stop_at_ret, + ignore_large_imms=args.ignore_large_imms, + ignore_addr_diffs=args.ignore_addr_diffs, + algorithm=args.algorithm, + reg_categories=project.reg_categories, + diff_function_symbols=args.diff_function_symbols, + ) + + +def get_objdump_executable(objdump_executable: Optional[str]) -> str: + if objdump_executable is not None: + return objdump_executable + + objdump_candidates = [ + "mips-linux-gnu-objdump", + "mips64-elf-objdump", + "mips-elf-objdump", + "sh-elf-objdump", + "sh4-linux-gnu-objdump", + "m68k-elf-objdump", + ] + for objdump_cand in objdump_candidates: + try: + subprocess.check_call( + [objdump_cand, "--version"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + return objdump_cand + except subprocess.CalledProcessError: + pass + except FileNotFoundError: + pass + + return fail( + f"Missing binutils; please ensure {' or '.join(objdump_candidates)} exists, or configure objdump_executable." + ) + + +def get_arch(arch_str: str) -> "ArchSettings": + for settings in ARCH_SETTINGS: + if arch_str == settings.name: + return settings + raise ValueError(f"Unknown architecture: {arch_str}") + + +BUFFER_CMD: List[str] = ["tail", "-c", str(10**9)] + +# -S truncates long lines instead of wrapping them +# -R interprets color escape sequences +# -i ignores case when searching +# -c something about how the screen gets redrawn; I don't remember the purpose +# -#6 makes left/right arrow keys scroll by 6 characters +LESS_CMD: List[str] = ["less", "-SRic", "-+F", "-+X", "-#6"] + +DEBOUNCE_DELAY: float = 0.1 + +# ==== FORMATTING ==== + + +@enum.unique +class BasicFormat(enum.Enum): + NONE = enum.auto() + IMMEDIATE = enum.auto() + STACK = enum.auto() + REGISTER = enum.auto() + REGISTER_CATEGORY = enum.auto() + DELAY_SLOT = enum.auto() + DIFF_CHANGE = enum.auto() + DIFF_ADD = enum.auto() + DIFF_REMOVE = enum.auto() + SOURCE_FILENAME = enum.auto() + SOURCE_FUNCTION = enum.auto() + SOURCE_LINE_NUM = enum.auto() + SOURCE_OTHER = enum.auto() + + +@dataclass(frozen=True) +class RotationFormat: + group: str + index: int + key: str + + +Format = Union[BasicFormat, RotationFormat] +FormatFunction = Callable[[str], Format] + + +class Text: + segments: List[Tuple[str, Format]] + + def __init__(self, line: str = "", f: Format = BasicFormat.NONE) -> None: + self.segments = [(line, f)] if line else [] + + def reformat(self, f: Format) -> "Text": + return Text(self.plain(), f) + + def plain(self) -> str: + return "".join(s for s, f in self.segments) + + def __repr__(self) -> str: + return f"" + + def __bool__(self) -> bool: + return any(s for s, f in self.segments) + + def __str__(self) -> str: + # Use Formatter.apply(...) 
instead + return NotImplemented + + def __eq__(self, other: object) -> bool: + return NotImplemented + + def __add__(self, other: Union["Text", str]) -> "Text": + if isinstance(other, str): + other = Text(other) + result = Text() + # If two adjacent segments have the same format, merge their lines + if ( + self.segments + and other.segments + and self.segments[-1][1] == other.segments[0][1] + ): + result.segments = ( + self.segments[:-1] + + [(self.segments[-1][0] + other.segments[0][0], self.segments[-1][1])] + + other.segments[1:] + ) + else: + result.segments = self.segments + other.segments + return result + + def __radd__(self, other: Union["Text", str]) -> "Text": + if isinstance(other, str): + other = Text(other) + return other + self + + def finditer(self, pat: Pattern[str]) -> Iterator[Match[str]]: + """Replacement for `pat.finditer(text)` that operates on the inner text, + and returns the exact same matches as `Text.sub(pat, ...)`.""" + for chunk, f in self.segments: + for match in pat.finditer(chunk): + yield match + + def sub(self, pat: Pattern[str], sub_fn: Callable[[Match[str]], "Text"]) -> "Text": + result = Text() + for chunk, f in self.segments: + i = 0 + for match in pat.finditer(chunk): + start, end = match.start(), match.end() + assert i <= start <= end <= len(chunk) + sub = sub_fn(match) + if i != start: + result.segments.append((chunk[i:start], f)) + result.segments.extend(sub.segments) + i = end + if chunk[i:]: + result.segments.append((chunk[i:], f)) + return result + + def ljust(self, column_width: int) -> "Text": + length = sum(len(x) for x, _ in self.segments) + return self + " " * max(column_width - length, 0) + + +@dataclass +class TableLine: + key: Optional[str] + is_data_ref: bool + cells: Tuple[Tuple[Text, Optional["Line"]], ...] + + +@dataclass +class TableData: + headers: Tuple[Text, ...] + current_score: int + max_score: int + previous_score: Optional[int] + lines: List[TableLine] + + +class Formatter(abc.ABC): + @abc.abstractmethod + def apply_format(self, chunk: str, f: Format) -> str: + """Apply the formatting `f` to `chunk` and escape the contents.""" + ... + + @abc.abstractmethod + def table(self, data: TableData) -> str: + """Format a multi-column table with metadata""" + ... + + def apply(self, text: Text) -> str: + return "".join(self.apply_format(chunk, f) for chunk, f in text.segments) + + @staticmethod + def outputline_texts(line: TableLine) -> Tuple[Text, ...]: + return tuple(cell[0] for cell in line.cells) + + +@dataclass +class PlainFormatter(Formatter): + column_width: int + + def apply_format(self, chunk: str, f: Format) -> str: + return chunk + + def table(self, data: TableData) -> str: + rows = [data.headers] + [self.outputline_texts(line) for line in data.lines] + return "\n".join( + "".join(self.apply(x.ljust(self.column_width)) for x in row) for row in rows + ) + + +@dataclass +class AnsiFormatter(Formatter): + # Additional ansi escape codes not in colorama. 
See: + # https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_(Select_Graphic_Rendition)_parameters + STYLE_UNDERLINE = "\x1b[4m" + STYLE_NO_UNDERLINE = "\x1b[24m" + STYLE_INVERT = "\x1b[7m" + STYLE_RESET = "\x1b[0m" + + BASIC_ANSI_CODES = { + BasicFormat.NONE: "", + BasicFormat.IMMEDIATE: Fore.LIGHTBLUE_EX, + BasicFormat.STACK: Fore.YELLOW, + BasicFormat.REGISTER: Fore.YELLOW, + BasicFormat.REGISTER_CATEGORY: Fore.LIGHTYELLOW_EX, + BasicFormat.DIFF_CHANGE: Fore.LIGHTBLUE_EX, + BasicFormat.DIFF_ADD: Fore.GREEN, + BasicFormat.DIFF_REMOVE: Fore.RED, + BasicFormat.SOURCE_FILENAME: Style.DIM + Style.BRIGHT, + BasicFormat.SOURCE_FUNCTION: Style.DIM + Style.BRIGHT + STYLE_UNDERLINE, + BasicFormat.SOURCE_LINE_NUM: Fore.LIGHTBLACK_EX, + BasicFormat.SOURCE_OTHER: Style.DIM, + } + + BASIC_ANSI_CODES_UNDO = { + BasicFormat.NONE: "", + BasicFormat.SOURCE_FILENAME: Style.NORMAL, + BasicFormat.SOURCE_FUNCTION: Style.NORMAL + STYLE_NO_UNDERLINE, + BasicFormat.SOURCE_OTHER: Style.NORMAL, + } + + ROTATION_ANSI_COLORS = [ + Fore.MAGENTA, + Fore.CYAN, + Fore.GREEN, + Fore.RED, + Fore.LIGHTYELLOW_EX, + Fore.LIGHTMAGENTA_EX, + Fore.LIGHTCYAN_EX, + Fore.LIGHTGREEN_EX, + Fore.LIGHTBLACK_EX, + ] + + column_width: int + + def apply_format(self, chunk: str, f: Format) -> str: + if f == BasicFormat.NONE: + return chunk + undo_ansi_code = Fore.RESET + if isinstance(f, BasicFormat): + ansi_code = self.BASIC_ANSI_CODES[f] + undo_ansi_code = self.BASIC_ANSI_CODES_UNDO.get(f, undo_ansi_code) + elif isinstance(f, RotationFormat): + ansi_code = self.ROTATION_ANSI_COLORS[ + f.index % len(self.ROTATION_ANSI_COLORS) + ] + else: + static_assert_unreachable(f) + return f"{ansi_code}{chunk}{undo_ansi_code}" + + def table(self, data: TableData) -> str: + rows = [(data.headers, False)] + [ + ( + self.outputline_texts(line), + line.is_data_ref, + ) + for line in data.lines + ] + return "\n".join( + "".join( + (self.STYLE_INVERT if is_data_ref else "") + + self.apply(x.ljust(self.column_width)) + + (self.STYLE_RESET if is_data_ref else "") + for x in row + ) + for (row, is_data_ref) in rows + ) + + +@dataclass +class HtmlFormatter(Formatter): + rotation_formats: int = 9 + + def apply_format(self, chunk: str, f: Format) -> str: + chunk = html.escape(chunk) + if f == BasicFormat.NONE: + return chunk + if isinstance(f, BasicFormat): + class_name = f.name.lower().replace("_", "-") + data_attr = "" + elif isinstance(f, RotationFormat): + class_name = f"rotation-{f.index % self.rotation_formats}" + rotation_key = html.escape(f"{f.group};{f.key}", quote=True) + data_attr = f'data-rotation="{rotation_key}"' + else: + static_assert_unreachable(f) + return f"{chunk}" + + def table(self, data: TableData) -> str: + def table_row(line: Tuple[Text, ...], is_data_ref: bool, cell_el: str) -> str: + tr_attrs = " class='data-ref'" if is_data_ref else "" + output_row = f" " + for cell in line: + cell_html = self.apply(cell) + output_row += f"<{cell_el}>{cell_html}" + output_row += "\n" + return output_row + + output = "\n" + output += " \n" + output += table_row(data.headers, False, "th") + output += " \n" + output += " \n" + output += "".join( + table_row(self.outputline_texts(line), line.is_data_ref, "td") + for line in data.lines + ) + output += " \n" + output += "
\n" + return output + + +@dataclass +class PythonFormatter(Formatter): + arch_str: str + + def apply_format(self, chunk: str, f: Format) -> str: + # This method is unused by this formatter + return NotImplemented + + def table(self, data: TableData) -> str: + # This method is unused by this formatter + return NotImplemented + + def raw(self, data: TableData) -> Dict[str, Any]: + def serialize_format(s: str, f: Format) -> Dict[str, Any]: + if f == BasicFormat.NONE: + return {"text": s} + elif isinstance(f, BasicFormat): + return {"text": s, "format": f.name.lower()} + elif isinstance(f, RotationFormat): + attrs = asdict(f) + attrs.update({"text": s, "format": "rotation"}) + return attrs + else: + static_assert_unreachable(f) + + def serialize(text: Optional[Text]) -> List[Dict[str, Any]]: + if text is None: + return [] + return [serialize_format(s, f) for s, f in text.segments] + + output: Dict[str, Any] = {} + output["arch_str"] = self.arch_str + output["header"] = { + name: serialize(h) + for h, name in zip(data.headers, ("base", "current", "previous")) + } + output["current_score"] = data.current_score + output["max_score"] = data.max_score + if data.previous_score is not None: + output["previous_score"] = data.previous_score + output_rows: List[Dict[str, Any]] = [] + for row in data.lines: + output_row: Dict[str, Any] = {} + output_row["key"] = row.key + output_row["is_data_ref"] = row.is_data_ref + iters: List[Tuple[str, Text, Optional[Line]]] = [ + (label, *cell) + for label, cell in zip(("base", "current", "previous"), row.cells) + ] + if all(line is None for _, _, line in iters): + # Skip rows that were only for displaying source code + continue + for column_name, text, line in iters: + column: Dict[str, Any] = {} + column["text"] = serialize(text) + if line: + if line.mnemonic is not None: + column["mnemonic"] = line.mnemonic + if line.symbol is not None: + column["symbol"] = line.symbol + if line.line_num is not None: + column["line"] = line.line_num + if line.branch_target is not None: + column["branch"] = line.branch_target + if line.source_lines: + column["src"] = line.source_lines + if line.comment is not None: + column["src_comment"] = line.comment + if line.source_line_num is not None: + column["src_line"] = line.source_line_num + if line or column["text"]: + output_row[column_name] = column + output_rows.append(output_row) + output["rows"] = output_rows + return output + + +@dataclass +class JsonFormatter(PythonFormatter): + def table(self, data: TableData) -> str: + output = super().raw(data) + return json.dumps(output) + + +def format_fields( + pat: Pattern[str], + out1: Text, + out2: Text, + color1: FormatFunction, + color2: Optional[FormatFunction] = None, +) -> Tuple[Text, Text]: + diffs = [ + of.group() != nf.group() + for (of, nf) in zip(out1.finditer(pat), out2.finditer(pat)) + ] + + it = iter(diffs) + + def maybe_color(color: FormatFunction, s: str) -> Text: + return Text(s, color(s)) if next(it, False) else Text(s) + + out1 = out1.sub(pat, lambda m: maybe_color(color1, m.group())) + it = iter(diffs) + out2 = out2.sub(pat, lambda m: maybe_color(color2 or color1, m.group())) + + return out1, out2 + + +def symbol_formatter(group: str, base_index: int) -> FormatFunction: + symbol_formats: Dict[str, Format] = {} + + def symbol_format(s: str) -> Format: + # TODO: it would be nice to use a unique Format for each symbol, so we could + # add extra UI elements in the HTML version + f = symbol_formats.get(s) + if f is None: + index = len(symbol_formats) + base_index + f = 
RotationFormat(key=s, index=index, group=group) + symbol_formats[s] = f + return f + + return symbol_format + + +# ==== LOGIC ==== + +ObjdumpCommand = Tuple[List[str], str, Optional[str]] + +# eval_expr adapted from https://stackoverflow.com/a/9558001 + +import ast +import operator as op + +operators: Dict[Type[Union[ast.operator, ast.unaryop]], Any] = { + ast.Add: op.add, + ast.Sub: op.sub, + ast.Mult: op.mul, + ast.Div: op.floordiv, + ast.USub: op.neg, + ast.Pow: op.pow, + ast.BitXor: op.xor, + ast.BitOr: op.or_, + ast.BitAnd: op.and_, + ast.Invert: op.inv, +} + + +def eval_expr(expr: str) -> Any: + return eval_(ast.parse(expr, mode="eval").body) + + +def eval_(node: ast.AST) -> Any: + if ( + hasattr(ast, "Constant") + and isinstance(node, ast.Constant) + and isinstance(node.value, int) + ): # Python 3.8+ + return node.value + elif isinstance(node, ast.BinOp): + return operators[type(node.op)](eval_(node.left), eval_(node.right)) + elif isinstance(node, ast.UnaryOp): + return operators[type(node.op)](eval_(node.operand)) + elif sys.version_info < (3, 8) and isinstance(node, ast.Num): + return node.n + else: + raise TypeError(node) + + +def maybe_eval_int(expr: str) -> Optional[int]: + try: + ret = eval_expr(expr) + if not isinstance(ret, int): + raise Exception("not an integer") + return ret + except Exception: + return None + + +def eval_int(expr: str, emsg: str) -> int: + ret = maybe_eval_int(expr) + if ret is None: + fail(emsg) + return ret + + +def eval_line_num(expr: str) -> Optional[int]: + expr = expr.strip().replace(":", "") + if expr == "": + return None + return int(expr, 16) + + +def run_make(target: str, project: ProjectSettings) -> None: + subprocess.check_call(project.build_command + [target]) + + +def run_make_capture_output( + target: str, project: ProjectSettings +) -> "subprocess.CompletedProcess[bytes]": + return subprocess.run( + project.build_command + [target], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + + +def restrict_to_function(dump: str, fn_name: str) -> str: + try: + # Find the start of the line that contains ":" + ind = dump.rfind("\n", 0, dump.index(f"<{fn_name}>:")) + 1 + return dump[ind:] + except ValueError: + return "" + + +def serialize_rodata_references(references: List[Tuple[int, int, str]]) -> str: + return "".join( + f"DATAREF {text_offset} {from_offset} {from_section}\n" + for (text_offset, from_offset, from_section) in references + ) + + +def maybe_get_objdump_source_flags(config: Config) -> List[str]: + flags = [] + + if config.show_line_numbers or config.show_source: + flags.append("--line-numbers") + + if config.show_source: + flags.append("--source") + + if not config.source_old_binutils: + flags.append("--source-comment=│ ") + + if config.inlines: + flags.append("--inlines") + + return flags + + +def run_objdump(cmd: ObjdumpCommand, config: Config, project: ProjectSettings) -> str: + flags, target, restrict = cmd + try: + out = subprocess.run( + [project.objdump_executable] + + config.arch.arch_flags + + project.objdump_flags + + flags + + [target], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ).stdout + except subprocess.CalledProcessError as e: + print(e.stdout) + print(e.stderr) + if "unrecognized option '--source-comment" in e.stderr: + fail("** Try using --source-old-binutils instead of --source **") + raise e + + obj_data: Optional[bytes] = None + if config.diff_obj: + with open(target, "rb") as f: + obj_data = f.read() + + return preprocess_objdump_out(restrict, obj_data, 
out, config) + + +def preprocess_objdump_out( + restrict: Optional[str], obj_data: Optional[bytes], objdump_out: str, config: Config +) -> str: + """ + Preprocess the output of objdump into a format that `process()` expects. + This format is suitable for saving to disk with `--write-asm`. + + - Optionally filter the output to a single function (`restrict`) + - Otherwise, strip objdump header (6 lines) + - Prepend .data references ("DATAREF" lines) when working with object files + """ + out = objdump_out + + if restrict is not None: + out = restrict_to_function(out, restrict) + else: + for i in range(6): + out = out[out.find("\n") + 1 :] + out = out.rstrip("\n") + + if obj_data and config.show_rodata_refs: + out = ( + serialize_rodata_references(parse_elf_rodata_references(obj_data, config)) + + out + ) + + processor = config.arch.proc(config) + return processor.preprocess_objdump(out) + + +def search_build_objects(objname: str, project: ProjectSettings) -> Optional[str]: + objfiles = [ + os.path.join(dirpath, f) + for dirpath, _, filenames in os.walk(project.build_dir) + for f in filenames + if f == objname + ] + if len(objfiles) > 1: + all_objects = "\n".join(objfiles) + fail( + f"Found multiple objects of the same name {objname} in {project.build_dir}, " + f"cannot determine which to diff against: \n{all_objects}" + ) + if len(objfiles) == 1: + return objfiles[0] + + return None + + +def search_map_file( + fn_name: str, project: ProjectSettings, config: Config, *, for_binary: bool +) -> Tuple[Optional[str], Optional[int]]: + if not project.mapfile: + fail(f"No map file configured; cannot find function {fn_name}.") + + try: + with open( + project.mapfile, + encoding=MAPFILE_ENCODING, + errors=MAPFILE_ENCODING_ERROR_HANDLER, + ) as f: + contents = f.read() + except Exception: + fail(f"Failed to open map file {project.mapfile} for reading.") + + if project.map_format == "gnu": + if for_binary and "load address" not in contents: + fail( + 'Failed to find "load address" in map file. Maybe you need to add\n' + '"export LANG := C" to your Makefile to avoid localized output?' + ) + + lines = contents.split("\n") + + try: + cur_objfile = None + ram_to_rom = None + cands = [] + last_line = "" + for line in lines: + if line.startswith(" " + config.diff_section): + cur_objfile = line.split()[3] + if "load address" in line: + tokens = last_line.split() + line.split() + ram = int(tokens[1], 0) + rom = int(tokens[5], 0) + ram_to_rom = rom - ram + if line.endswith(" " + fn_name) or f" {fn_name} = 0x" in line: + ram = int(line.split()[0], 0) + if (for_binary and ram_to_rom is not None) or ( + not for_binary and cur_objfile is not None + ): + cands.append((cur_objfile, ram + (ram_to_rom or 0))) + last_line = line + except Exception as e: + traceback.print_exc() + fail(f"Internal error while parsing map file") + + if len(cands) > 1: + fail(f"Found multiple occurrences of function {fn_name} in map file.") + if len(cands) == 1: + return cands[0] + elif project.map_format == "mw": + find = re.findall( + # start address, size, virtual address, file offset, alignment + r" [0-9a-f]{8} [0-9a-f]{6} ([0-9a-f]{8})(?: ([0-9a-f]{8}))?(?: +\S{1,2})? +" + + re.escape(fn_name) + + r"(?: \(entry of " + + re.escape(config.diff_section) + + r"\))? 
\t" + # object name + + r"(\S+)", + contents, + ) + if len(find) > 1: + fail(f"Found multiple occurrences of function {fn_name} in map file.") + if len(find) == 1: + if find[0][1]: + rom = int(find[0][1], 16) + else: + rom = None + objname = find[0][2] + objfile = search_build_objects(objname, project) + + # TODO Currently the ram-rom conversion only works for diffing ELF + # executables, but it would likely be more convenient to diff DOLs. + # At this time it is recommended to always use -o when running the diff + # script as this mode does not make use of the ram-rom conversion. + if objfile is not None: + return objfile, rom + elif project.map_format == "ms": + load_address_find = re.search( + r"Preferred load address is ([0-9a-f]+)", + contents, + ) + if not load_address_find: + fail(f"Couldn't find module load address in map file.") + load_address = int(load_address_find.group(1), 16) + + diff_segment_find = re.search( + r"([0-9a-f]+):[0-9a-f]+ [0-9a-f]+H " + re.escape(config.diff_section), + contents, + ) + if not diff_segment_find: + fail(f"Couldn't find segment for section in map file.") + diff_segment = diff_segment_find.group(1) + + find = re.findall( + r" (?:" + + re.escape(diff_segment) + + r")\S+\s+(?:" + + re.escape(fn_name) + + r")\s+\S+ ... \S+", + contents, + ) + if len(find) > 1: + fail(f"Found multiple occurrences of function {fn_name} in map file.") + if len(find) == 1: + names_find = re.search(r"(\S+) ... (\S+)", find[0]) + assert names_find is not None + fileofs = int(names_find.group(1), 16) - load_address + if for_binary: + return None, fileofs + + objname = names_find.group(2) + objfile = search_build_objects(objname, project) + if objfile is not None: + return objfile, fileofs + else: + fail(f"Linker map format {project.map_format} unrecognised.") + return None, None + + +def parse_elf_rodata_references( + data: bytes, config: Config +) -> List[Tuple[int, int, str]]: + e_ident = data[:16] + if e_ident[:4] != b"\x7FELF": + return [] + + SHT_SYMTAB = 2 + SHT_REL = 9 + SHT_RELA = 4 + R_MIPS_32 = 2 + R_MIPS_GPREL32 = 12 + + is_32bit = e_ident[4] == 1 + is_little_endian = e_ident[5] == 1 + str_end = "<" if is_little_endian else ">" + str_off = "I" if is_32bit else "Q" + + def read(spec: str, offset: int) -> Tuple[int, ...]: + spec = spec.replace("P", str_off) + size = struct.calcsize(spec) + return struct.unpack(str_end + spec, data[offset : offset + size]) + + ( + e_type, + e_machine, + e_version, + e_entry, + e_phoff, + e_shoff, + e_flags, + e_ehsize, + e_phentsize, + e_phnum, + e_shentsize, + e_shnum, + e_shstrndx, + ) = read("HHIPPPIHHHHHH", 16) + if e_type != 1: # relocatable + return [] + assert e_shoff != 0 + assert e_shnum != 0 # don't support > 0xFF00 sections + assert e_shstrndx != 0 + + @dataclass + class Section: + sh_name: int + sh_type: int + sh_flags: int + sh_addr: int + sh_offset: int + sh_size: int + sh_link: int + sh_info: int + sh_addralign: int + sh_entsize: int + + sections = [ + Section(*read("IIPPPPIIPP", e_shoff + i * e_shentsize)) for i in range(e_shnum) + ] + shstr = sections[e_shstrndx] + sec_name_offs = [shstr.sh_offset + s.sh_name for s in sections] + sec_names = [data[offset : data.index(b"\0", offset)] for offset in sec_name_offs] + + symtab_sections = [i for i in range(e_shnum) if sections[i].sh_type == SHT_SYMTAB] + assert len(symtab_sections) == 1 + symtab = sections[symtab_sections[0]] + + section_name = config.diff_section.encode("utf-8") + text_sections = [ + i + for i in range(e_shnum) + if sec_names[i] == section_name and 
sections[i].sh_size != 0 + ] + if len(text_sections) != 1: + return [] + text_section = text_sections[0] + + ret: List[Tuple[int, int, str]] = [] + for s in sections: + if s.sh_type == SHT_REL or s.sh_type == SHT_RELA: + if s.sh_info == text_section: + # Skip section_name -> section_name references + continue + sec_name = sec_names[s.sh_info].decode("latin1") + if sec_name not in (".rodata", ".late_rodata"): + continue + sec_base = sections[s.sh_info].sh_offset + for i in range(0, s.sh_size, s.sh_entsize): + if s.sh_type == SHT_REL: + r_offset, r_info = read("PP", s.sh_offset + i) + else: + r_offset, r_info, r_addend = read("PPP", s.sh_offset + i) + + if is_32bit: + r_sym = r_info >> 8 + r_type = r_info & 0xFF + sym_offset = symtab.sh_offset + symtab.sh_entsize * r_sym + st_name, st_value, st_size, st_info, st_other, st_shndx = read( + "IIIBBH", sym_offset + ) + else: + r_sym = r_info >> 32 + r_type = r_info & 0xFFFFFFFF + sym_offset = symtab.sh_offset + symtab.sh_entsize * r_sym + st_name, st_info, st_other, st_shndx, st_value, st_size = read( + "IBBHQQ", sym_offset + ) + if st_shndx == text_section: + if s.sh_type == SHT_REL: + if e_machine == 8 and r_type in (R_MIPS_32, R_MIPS_GPREL32): + (r_addend,) = read("I", sec_base + r_offset) + else: + continue + text_offset = (st_value + r_addend) & 0xFFFFFFFF + ret.append((text_offset, r_offset, sec_name)) + return ret + + +def dump_elf( + start: str, + end: Optional[str], + diff_elf_symbol: str, + config: Config, + project: ProjectSettings, +) -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: + if not project.baseimg or not project.myimg: + fail("Missing myimg/baseimg in config.") + if config.base_shift: + fail("--base-shift not compatible with -e") + + start_addr = eval_int(start, "Start address must be an integer expression.") + + if end is not None: + end_addr = eval_int(end, "End address must be an integer expression.") + else: + end_addr = start_addr + config.max_function_size_bytes + + flags1 = [ + f"--start-address={start_addr}", + f"--stop-address={end_addr}", + ] + + if project.disassemble_all: + disassemble_flag = "-D" + else: + disassemble_flag = "-d" + + if "llvm-" in project.objdump_executable: + flags2 = ["--disassemble", f"--disassemble-symbols={diff_elf_symbol}"] + else: + flags2 = [ + f"--disassemble={diff_elf_symbol}", + ] + + objdump_flags = [disassemble_flag, "-rz", "-j", config.diff_section] + return ( + project.myimg, + (objdump_flags + flags1, project.baseimg, None), + ( + objdump_flags + flags2 + maybe_get_objdump_source_flags(config), + project.myimg, + None, + ), + ) + + +def dump_objfile( + start: str, end: Optional[str], config: Config, project: ProjectSettings +) -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: + if config.base_shift: + fail("--base-shift not compatible with -o") + if end is not None: + fail("end address not supported together with -o") + if start.startswith("0"): + fail("numerical start address not supported with -o; pass a function name") + + objfile = config.file + if not objfile: + objfile, _ = search_map_file(start, project, config, for_binary=False) + + if not objfile: + fail("Not able to find .o file for function.") + + if config.make: + run_make(objfile, project) + + if not os.path.isfile(objfile): + fail(f"Not able to find .o file for function: {objfile} is not a file.") + + refobjfile = os.path.join(project.expected_dir, objfile) + if config.diff_mode != DiffMode.SINGLE and not os.path.isfile(refobjfile): + fail(f'Please ensure an OK .o file exists at "{refobjfile}".') + + if 
project.disassemble_all: + disassemble_flag = "-D" + else: + disassemble_flag = "-d" + + objdump_flags = [disassemble_flag, "-rz", "-j", config.diff_section] + return ( + objfile, + (objdump_flags, refobjfile, start), + (objdump_flags + maybe_get_objdump_source_flags(config), objfile, start), + ) + + +def dump_binary( + start: str, end: Optional[str], config: Config, project: ProjectSettings +) -> Tuple[str, ObjdumpCommand, ObjdumpCommand]: + binfile = config.file or project.myimg + if not project.baseimg or not binfile: + fail("Missing myimg/baseimg in config.") + if config.make: + run_make(binfile, project) + if not os.path.isfile(binfile): + fail(f"Not able to find binary file: {binfile}") + start_addr = maybe_eval_int(start) + if start_addr is None and config.file is None: + _, start_addr = search_map_file(start, project, config, for_binary=True) + if start_addr is None: + fail("Not able to find function in map file.") + start_addr += project.map_address_offset + elif start_addr is None: + fail("Start address must be an integer expression when using binary -f") + if end is not None: + end_addr = eval_int(end, "End address must be an integer expression.") + else: + end_addr = start_addr + config.max_function_size_bytes + objdump_flags = ["-Dz", "-bbinary"] + ["-EB" if config.arch.big_endian else "-EL"] + flags1 = [ + f"--start-address={start_addr + config.base_shift}", + f"--stop-address={end_addr + config.base_shift}", + ] + flags2 = [f"--start-address={start_addr}", f"--stop-address={end_addr}"] + return ( + binfile, + (objdump_flags + flags1, project.baseimg, None), + (objdump_flags + flags2, binfile, None), + ) + + +# The base class is a no-op. +class AsmProcessor: + def __init__(self, config: Config) -> None: + self.config = config + + # Called during run_objdump() for arch-specific normalization. Runs before + # diff-processing, i.e. process(). + def preprocess_objdump(self, objdump: str) -> str: + return objdump + + def pre_process( + self, mnemonic: str, args: str, next_row: Optional[str], comment: Optional[str] + ) -> Tuple[str, str]: + return mnemonic, args + + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + return prev, None + + def normalize(self, mnemonic: str, row: str) -> str: + """This should be called exactly once for each line.""" + arch = self.config.arch + row = self._normalize_arch_specific(mnemonic, row) + if self.config.ignore_large_imms and mnemonic not in arch.branch_instructions: + row = re.sub(self.config.arch.re_large_imm, "", row) + return row + + def _normalize_arch_specific(self, mnemonic: str, row: str) -> str: + return row + + def post_process(self, lines: List["Line"]) -> None: + return + + def is_end_of_function(self, mnemonic: str, args: str) -> bool: + return False + + +class AsmProcessorMIPS(AsmProcessor): + def __init__(self, config: Config) -> None: + super().__init__(config) + self.seen_jr_ra = False + + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + arch = self.config.arch + if "R_MIPS_NONE" in row or "R_MIPS_JALR" in row: + # GNU as emits no-op relocations immediately after real ones when + # assembling with -mabi=64. Return without trying to parse 'imm' as an + # integer. 
+ return prev, None + before, imm, after = parse_relocated_line(prev) + addend = reloc_addend_from_imm(imm, before, self.config.arch) + repl = row.split()[-1] + addend + if "R_MIPS_LO16" in row: + repl = f"%lo({repl})" + elif "R_MIPS_HI16" in row: + # Ideally we'd pair up R_MIPS_LO16 and R_MIPS_HI16 to generate a + # correct addend for each, but objdump doesn't give us the order of + # the relocations, so we can't find the right LO16. :( + repl = f"%hi({repl})" + elif "R_MIPS_26" in row: + # Function calls + pass + elif "R_MIPS_PC16" in row: + # Branch to glabel. This gives confusing output, but there's not much + # we can do here. + pass + elif "R_MIPS_GPREL16" in row: + repl = f"%gp_rel({repl})" + elif "R_MIPS_GOT16" in row: + repl = f"%got({repl})" + elif "R_MIPS_CALL16" in row: + repl = f"%call16({repl})" + elif "R_MIPS_LITERAL" in row: + # MWCC emits R_MIPS_LITERAL for float literals which don't have the same structure as the + # relocations emitted by GCC + # Only remove the addend if it is found in the relocation + if addend != "": + # .lit4+0x4000-0x4000 -> .lit4+0x4000 + repl = repl[: -len(addend)] + else: + assert False, f"unknown relocation type '{row}' for line '{prev}'" + return before + repl + after, repl + + def is_end_of_function(self, mnemonic: str, args: str) -> bool: + if self.seen_jr_ra: + return True + if mnemonic == "jr" and args == "ra": + self.seen_jr_ra = True + return False + + +class AsmProcessorPPC(AsmProcessor): + def pre_process( + self, mnemonic: str, args: str, next_row: Optional[str], comment: Optional[str] + ) -> Tuple[str, str]: + if next_row and "R_PPC_EMB_SDA21" in next_row: + # With sda21 relocs, the linker transforms `r0` into `r2`/`r13`, and + # we may encounter this in either pre-transformed or post-transformed + # versions depending on if the .o file comes from compiler output or + # from disassembly. Normalize, to make sure both forms are treated as + # equivalent. 
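+            # (Illustrative: an operand base of `(r13)` or `(r2)` in linked
+            # output is rewritten below to `(0)`, the form produced for the
+            # compiler's untransformed `r0` operand.)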
+ + args = args.replace("(r2)", "(0)") + args = args.replace("(r13)", "(0)") + args = args.replace(",r2,", ",0,") + args = args.replace(",r13,", ",0,") + + # We want to convert li and lis with an sda21 reloc, + # because the r0 to r2/r13 transformation results in + # turning an li/lis into an addi/addis with r2/r13 arg + # our preprocessing normalizes all versions to addi with a 0 arg + if mnemonic in {"li", "lis"}: + mnemonic = mnemonic.replace("li", "addi") + args_parts = args.split(",") + args = args_parts[0] + ",0," + args_parts[1] + if ( + next_row + and ("R_PPC_REL24" in next_row or "R_PPC_REL14" in next_row) + and ".text+0x" in next_row + and mnemonic in PPC_BRANCH_INSTRUCTIONS + ): + # GCC emits a relocation of "R_PPC_REL14" or "R_PPC_REL24" with a .text offset + # fixup the args to use the offset from the relocation + + # Split args by ',' which will result in either [cr, offset] or [offset] + # Replace the current offset with the next line's ".text+0x" offset + splitArgs = args.split(",") + splitArgs[-1] = next_row.split(".text+0x")[-1] + args = ",".join(splitArgs) + + if ( + comment is not None + and ( + next_row is None + or re.search(self.config.arch.re_reloc, next_row) is None + ) + and mnemonic == "bl" + ): + # if the mnemonic is bl and the comment doesn't match + # <.text+0x...> replace the args with the contents of the comment + if re.search(r"<.+\+0x[0-9a-fA-F]+>", comment) is None: + args = comment[1:-1] + + return mnemonic, args + + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + # row is the line with the relocations + # prev is the line to apply relocations to + + arch = self.config.arch + assert any( + r in row + for r in ["R_PPC_REL24", "R_PPC_ADDR16", "R_PPC_EMB_SDA21", "R_PPC_REL14"] + ), f"unknown relocation type '{row}' for line '{prev}'" + before, imm, after = parse_relocated_line(prev) + repl = row.split()[-1] + mnemonic, args = prev.split(maxsplit=1) + + if "R_PPC_REL24" in row: + # function calls + # or unconditional branches generated by GCC "b offset" + if mnemonic in PPC_BRANCH_INSTRUCTIONS and ".text+0x" in row: + # this has been handled in pre_process + return prev, None + elif "R_PPC_REL14" in row: + if mnemonic in PPC_BRANCH_INSTRUCTIONS and ".text+0x" in row: + # this has been handled in pre_process + return prev, None + elif "R_PPC_ADDR16_HI" in row: + # absolute hi of addr + repl = f"{repl}@h" + elif "R_PPC_ADDR16_HA" in row: + # adjusted hi of addr + repl = f"{repl}@ha" + elif "R_PPC_ADDR16_LO" in row: + # lo of addr + repl = f"{repl}@l" + elif "R_PPC_ADDR16" in row: + # 16-bit absolute addr + if "+0x7" in repl: + # remove the very large addends as they are an artifact of (label-_SDA(2)_BASE_) + # computations and are unimportant in a diff setting. 
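+                # e.g. a hypothetical reloc target printed as `sym+0x7fff8000`
+                # is reduced to just `sym` by the check below.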
+ if int(repl.split("+")[1], 16) > 0x70000000: + repl = repl.split("+")[0] + elif "R_PPC_EMB_SDA21" in row: + # sda21 relocations; r2/r13 --> 0 swaps are performed in pre_process + repl = f"{repl}@sda21" + + return before + repl + after, repl + + def is_end_of_function(self, mnemonic: str, args: str) -> bool: + return mnemonic == "blr" + + +# Example: "cmp r0, #0x10" +ARM32_COMPARE_IMM_PATTERN = r"cmp\s+(r[0-9]|1[0-3]),\s+#(\w+)" + +# Example: "add pc, r1" +ARM32_JUMP_TABLE_START = r"add\s+pc,\s*r" + +# Examples: +# - "44: 00060032 .word 0x00060032" +# - "48: 0032 .short 0x0032" +# - "4a: 0032 movs r2, r6" +# - "9e: 00be lsls r6, r7, #2" +# - ".short 0x0032 ; 0x64" +ARM32_JUMP_TABLE_ENTRY_PATTERN = r"(?:(\w+):\s+([0-9a-f]+)\s+)?([\w\.]+)\s+([\w,\ ]+)" + +# Example: "ldr r4, [pc, #56] ; (4c )" +ARM32_LOAD_POOL_PATTERN = r"(ldr\s+r([0-9]|1[0-3]),\s+\[pc,.*;\s*)(\([a-fA-F0-9]+.*\))" + + +class AsmProcessorARM32(AsmProcessor): + @dataclass + class JumpTableEntry: + cur_addr: int + table_start_addr: int + value: int + is_word: bool + + def preprocess_objdump(self, objdump: str) -> str: + def short_table_entry( + cur_addr: int, jump_table_start_addr: int, value: int + ) -> str: + branch_target = jump_table_start_addr + value + 4 + return f" {cur_addr:x}: {value:04x} .short 0x{value:04x} ; 0x{branch_target:x}" + + new_lines = [] + lines = objdump.splitlines() + for i, jump_table_entry in self._lines_iterator(lines): + if jump_table_entry is None: + new_lines.append(lines[i]) + continue + + entry = jump_table_entry + if entry.is_word: + # Split into two ".short" entries. + hi, lo = entry.value >> 16, entry.value & 0xFFFF + new_lines.append( + short_table_entry(entry.cur_addr, entry.table_start_addr, lo) + ) + new_lines.append( + short_table_entry(entry.cur_addr + 2, entry.table_start_addr, hi) + ) + else: + new_lines.append( + short_table_entry( + entry.cur_addr, entry.table_start_addr, entry.value + ) + ) + return "\n".join(new_lines) + + # An iterator for each line of assembly, returning the line index and optional + # metadata if the line is a jump table entry. + def _lines_iterator( + self, lines: List[str] + ) -> Iterator[Tuple[int, Optional[JumpTableEntry]]]: + jump_table_entries = 0 + table_start_addr = 0 + for i, line in enumerate(lines): + addr_match = re.match(r"^\s*([0-9a-f]+):", line) + addr = int(addr_match.group(1), 16) if addr_match else -1 + entry_match = re.search(ARM32_JUMP_TABLE_ENTRY_PATTERN, line) + if jump_table_entries > 0 and entry_match: + value = ( + entry_match.group(4) + if is_hexstring(entry_match.group(4)) + else entry_match.group(2) + ) + table_entry = self.JumpTableEntry( + cur_addr=addr, + table_start_addr=table_start_addr, + value=int(value, 16), + is_word=entry_match.group(3) == ".word", + ) + jump_table_entries -= 2 if table_entry.is_word else 1 + + yield i, table_entry + continue + + # Check for jump tables. + if re.search(ARM32_JUMP_TABLE_START, line): + jump_table_entries = self._jump_table_entries_count(lines, i) + table_start_addr = addr + yield i, None + + # Returns the number of entries in the jump table starting at `line_no`, or + # 0 if it's not a jump table. + def _jump_table_entries_count(self, raw_lines: List[str], line_no: int) -> int: + # The number of entries should be in the most recent `cmp` before the + # jump table. 
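+        # e.g. a guard of `cmp r2, #5` bounds the table index to 0..5, so the
+        # table is assumed to hold 5 + 1 = 6 entries (hence `value + 1` below).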
+ for i in reversed(range(line_no)): + cmp_match = re.search(ARM32_COMPARE_IMM_PATTERN, raw_lines[i]) + if cmp_match: + value = immediate_to_int(cmp_match.group(2)) + if value > 0: + return value + 1 + return 0 + + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + arch = self.config.arch + if "R_ARM_V4BX" in row: + # R_ARM_V4BX converts "bx " to "mov pc," for some targets. + # Ignore for now. + return prev, None + if "R_ARM_ABS32" in row and not prev.startswith(".word"): + # Don't crash on R_ARM_ABS32 relocations incorrectly applied to code. + # (We may want to do something more fancy here that actually shows the + # related symbol, but this serves as a stop-gap.) + return prev, None + before, imm, after = parse_relocated_line(prev) + repl = row.split()[-1] + reloc_addend_from_imm(imm, before, self.config.arch) + return before + repl + after, repl + + def _normalize_arch_specific(self, mnemonic: str, row: str) -> str: + if self.config.ignore_addr_diffs: + row = self._normalize_bl(mnemonic, row) + row = self._normalize_data_pool(row) + return row + + def _normalize_bl(self, mnemonic: str, row: str) -> str: + if mnemonic != "bl": + return row + + row, _ = split_off_address(row) + return row + "" + + def _normalize_data_pool(self, row: str) -> str: + pool_match = re.search(ARM32_LOAD_POOL_PATTERN, row) + return pool_match.group(1) if pool_match else row + + def _post_process_jump_tables(self, lines: List["Line"]) -> None: + raw_lines = [ + ( + f"{line.line_num:x}: {line.original}" + if line.line_num is not None + else line.original + ) + for line in lines + ] + for i, jump_table_entry in self._lines_iterator(raw_lines): + if jump_table_entry is None: + continue + + entry = jump_table_entry + lines[i].branch_target = entry.table_start_addr + entry.value + 4 + + def _post_process_data_pools(self, lines: List["Line"]) -> None: + lines_by_line_number = {} + for line in lines: + if line.line_num is not None: + lines_by_line_number[line.line_num] = line + for line in lines: + if line.data_pool_addr is None: + continue + + # Add data symbol and its address to the line. + line_original = lines_by_line_number[line.data_pool_addr].original + value = line_original.split()[1] + addr = "{:x}".format(line.data_pool_addr) + line.original = line.normalized_original + f"={value} ({addr})" + + def post_process(self, lines: List["Line"]) -> None: + self._post_process_jump_tables(lines) + self._post_process_data_pools(lines) + + +class AsmProcessorAArch64(AsmProcessor): + def __init__(self, config: Config) -> None: + super().__init__(config) + self._adrp_pair_registers: Set[str] = set() + + def _normalize_arch_specific(self, mnemonic: str, row: str) -> str: + if self.config.ignore_addr_diffs: + row = self._normalize_adrp_differences(mnemonic, row) + row = self._normalize_bl(mnemonic, row) + return row + + def _normalize_bl(self, mnemonic: str, row: str) -> str: + if mnemonic != "bl": + return row + + row, _ = split_off_address(row) + return row + "" + + def _normalize_adrp_differences(self, mnemonic: str, row: str) -> str: + """Identifies ADRP + LDR/ADD pairs that are used to access the GOT and + suppresses any immediate differences. + + Whenever an ADRP is seen, the destination register is added to the set of registers + that are part of an ADRP + LDR/ADD pair. Registers are removed from the set as soon + as they are used for an LDR or ADD instruction which completes the pair. + + This method is somewhat crude but should manage to detect most such pairs. 
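+        For example (hypothetical pair), `adrp x8, <page>` followed by
+        `ldr x8, [x8, #offset]` keeps matching even when the concrete page and
+        offset values differ between the two builds.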
+ """ + row_parts = row.split("\t", 1) + if mnemonic == "adrp": + self._adrp_pair_registers.add(row_parts[1].strip().split(",")[0]) + row, _ = split_off_address(row) + return row + "" + elif mnemonic == "ldr": + for reg in self._adrp_pair_registers: + # ldr xxx, [reg] + # ldr xxx, [reg, ] + if f", [{reg}" in row_parts[1]: + self._adrp_pair_registers.remove(reg) + return normalize_imms(row, AARCH64_SETTINGS) + elif mnemonic == "add": + for reg in self._adrp_pair_registers: + # add reg, reg, + if row_parts[1].startswith(f"{reg}, {reg}, "): + self._adrp_pair_registers.remove(reg) + return normalize_imms(row, AARCH64_SETTINGS) + + return row + + +class AsmProcessorI686(AsmProcessor): + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + if "WRTSEG" in row: # ignore WRTSEG (watcom) + return prev, None + repl = row.split()[-1] + mnemonic, args = prev.split(maxsplit=1) + offset = False + + # Calls + # Example call a2f + # Example call *0 + # Example jmp 64 + if mnemonic in I686_BRANCH_INSTRUCTIONS: + addr_imm = re.search(r"(^|(?<=\*)|(?<=\*\%cs\:))[0-9a-f]+", args) + + # Direct use of reloc + # Example 0x0,0x8(%edi) + # Example 0x0,%edi + # Example *0x0(,%edx,4) + # Example %edi,0 + # Example movb $0x0,0x0 + # Example $0x0,0x4(%edi) + # Match 0x0 part to replace + else: + addr_imm = re.search(r"(?:0x)?0+$", args) + + if not addr_imm: + addr_imm = re.search(r"(^\$?|(?<=\*))0x0", args) + + # Offset value + # Example 0x4,%eax + # Example $0x4,%eax + if not addr_imm: + addr_imm = re.search(r"(^|(?<=\*)|(?<=\$))0x[0-9a-f]+", args) + offset = True + + if not addr_imm: + addr_imm = re.search(r"(^|(?<=\*)|(?<=\%[fgdecs]s\:))0x[0-9a-f]+", args) + offset = True + + if not addr_imm: + assert False, f"failed to find address immediate for line '{prev}'" + + start, end = addr_imm.span() + + if "R_386_NONE" in row: + pass + elif "R_386_32" in row: + pass + elif "R_386_PC32" in row: + pass + elif "R_386_16" in row: + pass + elif "R_386_PC16" in row: + pass + elif "R_386_8" in row: + pass + elif "R_386_PC8" in row: + pass + elif "dir32" in row: + if "+" in repl: + repl = repl.split("+")[0] + elif "DISP32" in row: + pass + elif "OFF32" in row: + pass + elif "OFFPC32" in row: + if "+" in repl: + repl = repl.split("+")[0] + elif "R_386_GOT32" in row: + repl = f"%got({repl})" + elif "R_386_PLT32" in row: + repl = f"%plt({repl})" + elif "R_386_RELATIVE" in row: + repl = f"%rel({repl})" + elif "R_386_GOTOFF" in row: + repl = f"%got({repl})" + elif "R_386_GOTPC" in row: + repl = f"%got({repl})" + elif "R_386_32PLT" in row: + repl = f"%plt({repl})" + else: + assert False, f"unknown relocation type '{row}' for line '{prev}'" + + if offset: + repl = f"{repl}+{addr_imm.group()}" + + return f"{mnemonic}\t{args[:start]+repl+args[end:]}", repl + + def is_end_of_function(self, mnemonic: str, args: str) -> bool: + return mnemonic == "ret" + + +class AsmProcessorSH2(AsmProcessor): + def __init__(self, config: Config) -> None: + super().__init__(config) + + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + return prev, None + + def is_end_of_function(self, mnemonic: str, args: str) -> bool: + return mnemonic == "rts" + + +class AsmProcessorM68k(AsmProcessor): + def pre_process( + self, mnemonic: str, args: str, next_row: Optional[str], comment: Optional[str] + ) -> Tuple[str, str]: + # replace objdump's syntax of pointer accesses with the equivilant in AT&T syntax for readability + return mnemonic, re.sub( + r"%(sp|a[0-7]|fp|pc)@(?:(?:\((-?(?:0x[0-9a-f]+|[0-9]+)) 
*(,%d[0-7]:[wl])?\))|(\+)|(-))?", + r"\5\2(%\1\3)\4", + args, + ) + + def process_reloc(self, row: str, prev: str) -> Tuple[str, Optional[str]]: + repl = row.split()[-1] + mnemonic, args = prev.split(maxsplit=1) + + addr_imm = re.search(r"(? bool: + return mnemonic == "rts" or mnemonic == "rte" or mnemonic == "rtr" + + +@dataclass +class ArchSettings: + name: str + re_int: Pattern[str] + re_comment: Pattern[str] + re_reg: Pattern[str] + re_sprel: Pattern[str] + re_large_imm: Pattern[str] + re_imm: Pattern[str] + re_reloc: Pattern[str] + branch_instructions: Set[str] + instructions_with_address_immediates: Set[str] + forbidden: Set[str] = field(default_factory=lambda: set(string.ascii_letters + "_")) + arch_flags: List[str] = field(default_factory=list) + branch_likely_instructions: Set[str] = field(default_factory=set) + proc: Type[AsmProcessor] = AsmProcessor + big_endian: Optional[bool] = True + delay_slot_instructions: Set[str] = field(default_factory=set) + + +MIPS_BRANCH_LIKELY_INSTRUCTIONS = { + "beql", + "bnel", + "beqzl", + "bnezl", + "bgezl", + "bgtzl", + "blezl", + "bltzl", + "bc1tl", + "bc1fl", +} +MIPS_BRANCH_INSTRUCTIONS = MIPS_BRANCH_LIKELY_INSTRUCTIONS.union( + { + "b", + "beq", + "bne", + "beqz", + "bnez", + "bgez", + "bgtz", + "blez", + "bltz", + "bc1t", + "bc1f", + } +) + +ARM32_PREFIXES = {"b", "bl"} +ARM32_CONDS = { + "", + "eq", + "ne", + "cs", + "cc", + "mi", + "pl", + "vs", + "vc", + "hi", + "ls", + "ge", + "lt", + "gt", + "le", + "al", +} +ARM32_SUFFIXES = {"", ".n", ".w"} +ARM32_BRANCH_INSTRUCTIONS = { + f"{prefix}{cond}{suffix}" + for prefix in ARM32_PREFIXES + for cond in ARM32_CONDS + for suffix in ARM32_SUFFIXES +} + +AARCH64_BRANCH_INSTRUCTIONS = { + "b", + "b.eq", + "b.ne", + "b.cs", + "b.hs", + "b.cc", + "b.lo", + "b.mi", + "b.pl", + "b.vs", + "b.vc", + "b.hi", + "b.ls", + "b.ge", + "b.lt", + "b.gt", + "b.le", + "cbz", + "cbnz", + "tbz", + "tbnz", +} + +PPC_BRANCH_INSTRUCTIONS = { + "b", + "beq", + "beq+", + "beq-", + "bne", + "bne+", + "bne-", + "blt", + "blt+", + "blt-", + "ble", + "ble+", + "ble-", + "bdz", + "bdz+", + "bdz-", + "bdnz", + "bdnz+", + "bdnz-", + "bge", + "bge+", + "bge-", + "bgt", + "bgt+", + "bgt-", + "bso", + "bso+", + "bso-", + "bns", + "bns+", + "bns-", +} + +I686_BRANCH_INSTRUCTIONS = { + "call", + "jmp", + "ljmp", + "ja", + "jae", + "jb", + "jbe", + "jc", + "jcxz", + "jecxz", + "jrcxz", + "je", + "jg", + "jge", + "jl", + "jle", + "jna", + "jnae", + "jnb", + "jnbe", + "jnc", + "jne", + "jng", + "jnge", + "jnl", + "jnle", + "jno", + "jnp", + "jns", + "jnz", + "jo", + "jp", + "jpe", + "jpo", + "js", + "jz", + "ja", + "jae", + "jb", + "jbe", + "jc", + "je", + "jz", + "jg", + "jge", + "jl", + "jle", + "jna", + "jnae", + "jnb", + "jnbe", + "jnc", + "jne", + "jng", + "jnge", + "jnl", + "jnle", + "jno", + "jnp", + "jns", + "jnz", + "jo", + "jp", + "jpe", + "jpo", + "js", + "jz", +} + +SH2_BRANCH_INSTRUCTIONS = { + "bf", + "bf.s", + "bt", + "bt.s", + "bra", + "bsr", +} + +M68K_CONDS = { + "ra", + "cc", + "cs", + "eq", + "ge", + "gt", + "hi", + "le", + "ls", + "lt", + "mi", + "ne", + "pl", + "vc", + "vs", +} + +M68K_BRANCH_INSTRUCTIONS = { + f"{prefix}{cond}{suffix}" + for prefix in {"b", "db"} + for cond in M68K_CONDS + for suffix in {"s", "w"} +}.union( + { + "dbt", + "dbf", + "bsrw", + "bsrs", + } +) + + +MIPS_SETTINGS = ArchSettings( + name="mips", + re_int=re.compile(r"[0-9]+"), + re_comment=re.compile(r"<.*>"), + # Includes: + # - General purpose registers v0..1, a0..7, t0..9, s0..8, zero, at, fp, k0..1/kt0..1 + # - Float registers 
f0..31, or fv0..1, fa0..7, ft0..15, fs0..8 plus odd complements + # (actually used number depends on ABI) + # sp, gp should not be in this list + re_reg=re.compile(r"\$?\b([astv][0-9]|at|f[astv]?[0-9]+f?|kt?[01]|fp|ra|zero)\b"), + re_sprel=re.compile(r"(?<=,)([0-9]+|0x[0-9a-f]+)\(sp\)"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile( + r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(sp)|%(lo|hi|got|gp_rel|call16)\([^)]*\)" + ), + re_reloc=re.compile(r"R_MIPS_"), + arch_flags=["-m", "mips:4300"], + branch_likely_instructions=MIPS_BRANCH_LIKELY_INSTRUCTIONS, + branch_instructions=MIPS_BRANCH_INSTRUCTIONS, + instructions_with_address_immediates=MIPS_BRANCH_INSTRUCTIONS.union( + {"j", "jal", "bal"} + ), + delay_slot_instructions=MIPS_BRANCH_INSTRUCTIONS.union( + {"j", "jal", "jr", "jalr", "bal"} + ), + proc=AsmProcessorMIPS, +) + +MIPSEL_SETTINGS = replace( + MIPS_SETTINGS, name="mipsel", big_endian=False, arch_flags=["-m", "mips:3000"] +) + +MIPSEE_SETTINGS = replace( + MIPSEL_SETTINGS, name="mipsee", arch_flags=["-m", "mips:5900"] +) + +MIPSEL_4000_SETTINGS = replace( + MIPSEL_SETTINGS, name="mipsel:4000", arch_flags=["-m", "mips:gs464"] +) + +MIPS_ARCH_NAMES = {"mips", "mipsel", "mipsee", "mipsel:4000"} + +ARM32_SETTINGS = ArchSettings( + name="arm32", + re_int=re.compile(r"[0-9]+"), + re_comment=re.compile(r"(<.*>|//.*$)"), + # Includes: + # - General purpose registers: r0..13 + # - Frame pointer registers: lr (r14), pc (r15) + # - VFP/NEON registers: s0..31, d0..31, q0..15, fpscr, fpexc, fpsid + # SP should not be in this list. + re_reg=re.compile( + r"\$?\b([rq][0-9]|[rq]1[0-5]|pc|lr|[ds][12]?[0-9]|[ds]3[01]|fp(scr|exc|sid))\b" + ), + re_sprel=re.compile(r"sp, #-?(0x[0-9a-fA-F]+|[0-9]+)\b"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile(r"(?|//.*$)"), + # GPRs and FP registers: X0-X30, W0-W30, [BHSDVQ]0..31 + # (FP registers may be followed by data width and number of elements, e.g. V0.4S) + # The zero registers and SP should not be in this list. + re_reg=re.compile( + r"\$?\b([bhsdvq]([12]?[0-9]|3[01])(\.\d\d?[bhsdvq])?|[xw][12]?[0-9]|[xw]30)\b" + ), + re_sprel=re.compile(r"sp, #-?(0x[0-9a-fA-F]+|[0-9]+)\b"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile(r"(?|//.*$)"), + # r1 not included + re_reg=re.compile(r"\$?\b([rf](?:[02-9]|[1-9][0-9]+)|f1)\b"), + re_sprel=re.compile(r"(?<=,)(-?[0-9]+|-?0x[0-9a-f]+)\(r1\)"), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile( + r"(\b|-)([0-9]+|0x[0-9a-fA-F]+)\b(?!\(r1\))|[^ \t,]+@(l|ha|h|sda21)" + ), + re_reloc=re.compile(r"R_PPC_"), + arch_flags=["-m", "powerpc", "-M", "broadway"], + branch_instructions=PPC_BRANCH_INSTRUCTIONS, + instructions_with_address_immediates=PPC_BRANCH_INSTRUCTIONS.union({"bl"}), + proc=AsmProcessorPPC, +) + +I686_SETTINGS = ArchSettings( + name="i686", + re_int=re.compile(r"[0-9]+"), + re_comment=re.compile(r"<.*>"), + # Includes: + # - (e)a-d(x,l,h) + # - (e)s,d,b(i,p)(l) + # - cr0-7 + # - x87 st + # - MMX, SSE vector registers + # - cursed registers: eal ebl ebh edl edh... 
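+ # (Note on the "cursed" entries above: because of the e? prefix and the
+ # [abcd][xhl] group, this pattern would also accept register names that do
+ # not exist, such as %eal or %ebh. That is presumably harmless, since
+ # objdump never emits them, and it keeps the expression compact. Ordinary
+ # operands like %eax, %esi, %bp, cr3 and %xmm5 all match as intended.)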
+ re_reg=re.compile( + r"\%?\b(e?(([sd]i|[sb]p)l?|[abcd][xhl])|[cdesfg]s|cr[0-7]|x?mm[0-7]|st)\b" + ), + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_sprel=re.compile(r"-?(0x[0-9a-f]+|[0-9]+)(?=\((%ebp|%esi)\))"), + re_imm=re.compile(r"-?(0x[0-9a-f]+|[0-9]+)|([\?$_][^ \t,]+)"), + re_reloc=re.compile(r"R_386_|dir32|DISP32|WRTSEG|OFF32|OFFPC32"), + # The x86 architecture has a variable instruction length. The raw bytes of + # an instruction as displayed by objdump can line wrap if it's long enough. + # This destroys the objdump output processor logic, so we avoid this. + arch_flags=["-m", "i386", "--no-show-raw-insn"], + branch_instructions=I686_BRANCH_INSTRUCTIONS, + instructions_with_address_immediates=I686_BRANCH_INSTRUCTIONS.union({"mov"}), + proc=AsmProcessorI686, +) + +SH2_SETTINGS = ArchSettings( + name="sh2", + # match -128-127 preceded by a '#' with a ',' after (8 bit immediates) + re_int=re.compile(r"(?<=#)(-?(?:1[01][0-9]|12[0-8]|[1-9][0-9]?|0))(?=,)"), + # match , match ! and after + re_comment=re.compile(r"<.*?>|!.*"), + # - r0-r15 general purpose registers, r15 is stack pointer during exceptions + # - sr, gbr, vbr - control registers + # - mach, macl, pr, pc - system registers + re_reg=re.compile(r"r1[0-5]|r[0-9]"), + # sh2 has pc-relative and gbr-relative but not stack-pointer-relative + re_sprel=re.compile(r"(?<=,)([0-9]+|0x[0-9a-f]+)\(sp\)"), + # max immediate size is 8-bit + re_large_imm=re.compile(r"-?[1-9][0-9]{2,}|-?0x[0-9a-f]{3,}"), + re_imm=re.compile(r"\b0[xX][0-9a-fA-F]+\b"), + # https://github.com/bminor/binutils-gdb/blob/master/bfd/elf32-sh-relocs.h#L21 + re_reloc=re.compile(r"R_SH_"), + arch_flags=["-m", "sh2"], + branch_instructions=SH2_BRANCH_INSTRUCTIONS, + instructions_with_address_immediates=SH2_BRANCH_INSTRUCTIONS.union( + {"bf", "bf.s", "bt", "bt.s", "bra", "bsr"} + ), + delay_slot_instructions=SH2_BRANCH_INSTRUCTIONS.union( + {"bf.s", "bt.s", "bra", "braf", "bsr", "bsrf", "jmp", "jsr", "rts"} + ), + proc=AsmProcessorSH2, +) + +SH4_SETTINGS = replace( + SH2_SETTINGS, + name="sh4", + # - fr0-fr15, dr0-dr14, xd0-xd14, fv0-fv12 FP registers + # dr/xd registers can only be even-numbered, and fv registers can only be a multiple of 4 + re_reg=re.compile( + r"r1[0-5]|r[0-9]|fr1[0-5]|fr[0-9]|dr[02468]|dr1[024]|xd[02468]|xd1[024]|fv[048]|fv12" + ), + arch_flags=["-m", "sh4"], +) + +SH4EL_SETTINGS = replace(SH4_SETTINGS, name="sh4el", big_endian=False) + +M68K_SETTINGS = ArchSettings( + name="m68k", + re_int=re.compile(r"[0-9]+"), + # '|' is used by assemblers, but is not used by objdump + re_comment=re.compile(r"<.*>"), + # Includes: + # - d0-d7 data registers + # - a0-a6 address registers + # - fp0-fp7 floating-point registers + # - usp (user sp) + # - fp, sr, ccr + # - fpcr, fpsr, fpiar + re_reg=re.compile(r"%\b(d[0-7]|a[0-6]|usp|fp([0-7]|cr|sr|iar)?|sr|ccr)(:[wl])?\b"), + # This matches all stack accesses that do not use an index register + re_sprel=re.compile(r"-?(0x[0-9a-f]+|[0-9]+)(?=\((%sp|%a7)\))"), + re_imm=re.compile(r"#?-?\b(0x[0-9a-f]+|[0-9]+)(?!\()"), + re_large_imm=re.compile(r"#?-?([1-9][0-9]{2,}|0x[0-9a-f]{3,})"), + re_reloc=re.compile(r"R_68K_"), + arch_flags=["-m", "m68k"], + branch_instructions=M68K_BRANCH_INSTRUCTIONS, + # Pretty much every instruction can take an address immediate + instructions_with_address_immediates=M68K_BRANCH_INSTRUCTIONS.union("jmp", "jsr"), + proc=AsmProcessorM68k, +) + +ARCH_SETTINGS = [ + MIPS_SETTINGS, + MIPSEL_SETTINGS, + MIPSEE_SETTINGS, + MIPSEL_4000_SETTINGS, + ARM32_SETTINGS, + 
ARMEL_SETTINGS, + AARCH64_SETTINGS, + PPC_SETTINGS, + I686_SETTINGS, + SH2_SETTINGS, + SH4_SETTINGS, + SH4EL_SETTINGS, + M68K_SETTINGS, +] + + +def immediate_to_int(immediate: str) -> int: + imm_match = re.match(r"#?(0x)?([0-9a-f]+)", immediate) + assert imm_match + base = 16 if imm_match.group(1) else 10 + return int(imm_match.group(2), base) + + +def is_hexstring(value: str) -> bool: + try: + int(value, 16) + return True + except ValueError: + return False + + +def hexify_int(row: str, pat: Match[str], arch: ArchSettings) -> str: + full = pat.group(0) + + # sh2/sh4 only has 8-bit immediates, just convert them uniformly without + # any -hex stuff + if arch.name == "sh2" or arch.name == "sh4" or arch.name == "sh4el": + return hex(int(full) & 0xFF) + + if len(full) <= 1: + # leave one-digit ints alone + return full + start, end = pat.span() + if start and row[start - 1] in arch.forbidden: + return full + if end < len(row) and row[end] in arch.forbidden: + return full + return hex(int(full)) + + +def parse_relocated_line(line: str) -> Tuple[str, str, str]: + # Pick out the last argument + for c in ",\t ": + if c in line: + ind2 = line.rindex(c) + break + else: + raise Exception(f"failed to parse relocated line: {line}") + before = line[: ind2 + 1] + after = line[ind2 + 1 :] + # Move an optional ($reg) part of it to 'after' + ind2 = after.find("(") + if ind2 == -1: + imm, after = after, "" + else: + imm, after = after[:ind2], after[ind2:] + return before, imm, after + + +def reloc_addend_from_imm(imm: str, before: str, arch: ArchSettings) -> str: + """For architectures like MIPS where relocations have addends embedded in + the code as immediates, convert such an immediate into an addition/ + subtraction that can occur just after the symbol.""" + # TODO this is incorrect for MIPS %lo/%hi which need to be paired up + # and combined. In practice, this means we only get symbol offsets within + # %lo, while %hi just shows the symbol. Unfortunately, objdump's output + # loses relocation order, so we cannot do this without parsing ELF relocs + # ourselves... 
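+ # Illustrative (hypothetical) inputs: for mnemonics in
+ # instructions_with_address_immediates the immediate is parsed as hex, so
+ # "10" yields "+0x10"; otherwise base auto-detection is used, so "-4"
+ # yields "-0x4" and "0x20" yields "+0x20". A zero addend returns "" so
+ # that only the bare relocation symbol is shown.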
+ mnemonic = before.split()[0]
+ if mnemonic in arch.instructions_with_address_immediates:
+ addend = int(imm, 16)
+ else:
+ addend = int(imm, 0)
+ if addend == 0:
+ return ""
+ elif addend < 0:
+ return hex(addend)
+ else:
+ return "+" + hex(addend)
+
+
+def pad_mnemonic(line: str) -> str:
+ if "\t" not in line:
+ return line
+ mn, args = line.split("\t", 1)
+ return f"{mn:<7s} {args}"
+
+
+@dataclass
+class Line:
+ mnemonic: str
+ diff_row: str
+ original: str
+ normalized_original: str
+ scorable_line: str
+ symbol: Optional[str] = None
+ line_num: Optional[int] = None
+ branch_target: Optional[int] = None
+ data_pool_addr: Optional[int] = None
+ source_filename: Optional[str] = None
+ source_line_num: Optional[int] = None
+ source_lines: List[str] = field(default_factory=list)
+ comment: Optional[str] = None
+
+
+def process(dump: str, config: Config) -> List[Line]:
+ arch = config.arch
+ processor = arch.proc(config)
+ source_lines = []
+ source_filename = None
+ source_line_num = None
+ rets_remaining = config.stop_at_ret
+
+ i = 0
+ num_instr = 0
+ data_refs: Dict[int, Dict[str, List[int]]] = defaultdict(lambda: defaultdict(list))
+ output: List[Line] = []
+ lines = dump.split("\n")
+ while i < len(lines):
+ row = lines[i]
+ i += 1
+
+ if not row:
+ continue
+
+ # Check if the current line has "OFFSET <function>:"
+ function_label_match = re.match(r"^[0-9a-f]+ <(.*)>:$", row)
+
+ if function_label_match:
+ function_name = function_label_match.groups()[0] + ":"
+
+ if config.diff_function_symbols:
+ # If diffing function symbols is enabled
+ # Add the symbol to the diff output
+
+ output.append(
+ Line(
+ mnemonic="