From 3d58db8765c95bc7ee0d80f974515bf91f01c633 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer" <matthias@kannwischer.eu>
Date: Mon, 13 Jan 2025 11:32:20 +0800
Subject: [PATCH] Armv7-M: Default to .w for better alignment

Currently, we keep the instruction-width modifiers as they
appear in the input code (with a few exceptions).
However, this can negatively impact performance, as SLOTHY
may break code alignment.
This commit changes the Armv7-M instruction writer to output
.w for all instructions, which results in the best performance
(modulo the size of the instruction cache).
Unfortunately, LLVM (in the selftest) stumbles over some of these
.w modifiers in places where they do not have any effect.
To work around that, we remove the modifiers for the selftest.
---
 slothy/helper.py                   | 20 ++++++++++++++++++--
 slothy/targets/arm_v7m/arch_v7m.py | 12 ++++++++----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/slothy/helper.py b/slothy/helper.py
index 3d386d09..a481e2fc 100644
--- a/slothy/helper.py
+++ b/slothy/helper.py
@@ -1212,12 +1212,28 @@ def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_pa
                                               include=include_paths)
             except subprocess.CalledProcessError as exc:
                 log.error("CPreprocessor failed on the following input")
-                log.error(SouceLine.write_multiline(source))
+                log.error(SourceLine.write_multiline(source))
                 raise LLVM_Mc_Error from exc
 
         if platform.system() == "Darwin":
             source = list(filter(lambda s: s.text.strip().startswith(".type") is False, source))
 
+
+        # Remove all width information - LLVM cannot handle .w for
+        # some instructions that only have a 32-bit encoding,
+        # e.g., uadd16.w works in gcc, but not LLVM.
+        # Unfortunately, for some instructions this depends
+        # on the registers used and, hence, adjusting the input to
+        # SLOTHY is not sufficient.
+        # As we currently don't have a model of the instruction encodings,
+        # there is no principled way to reason about it.
+        if thumb:
+            for line in source:
+                instruction = line.text
+                instruction = instruction.replace(".w ", " ")
+                instruction = instruction.replace(".n ", " ")
+                line.set_text(instruction)
+
         code = SourceLine.write_multiline(source)
 
         log.debug(f"Calling LLVM MC assmelber on the following code")
@@ -1585,7 +1601,7 @@ def extract(source, lbl, forced_loop_type=None):
         """
         Find a loop with start label `lbl` in `source` and return it together
         with its type.
-        
+
             Args:
                 source: list of SourceLine objects
                 lbl: label of the loop to extract
diff --git a/slothy/targets/arm_v7m/arch_v7m.py b/slothy/targets/arm_v7m/arch_v7m.py
index 1de96011..35eaceb2 100644
--- a/slothy/targets/arm_v7m/arch_v7m.py
+++ b/slothy/targets/arm_v7m/arch_v7m.py
@@ -277,7 +277,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
         # if new_fixup != 0:
         #     yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{new_fixup}"
         if fixup != 0:
-            yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
+            yield f"{indent}sub.w {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
         #if new_fixup != 0 or fixup != 0:
         if fixup != 0:
             yield f"{indent}vmov {self.additional_data['endf']}, {self.additional_data['end']}"
@@ -383,7 +383,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
             yield f"{indent}vmov {loop_end_reg}, {loop_end_reg_fpr}"
 
         if fixup != 0:
-            yield f"{indent}sub {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}"
+            yield f"{indent}sub.w {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}"
 
         if fixup != 0 and loop_end_reg_fpr is not None:
             yield f"{indent}vmov {loop_end_reg_fpr}, {loop_end_reg}"
@@ -457,7 +457,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
         #     yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{new_fixup}"
 
         if fixup != 0:
-            yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
+            yield f"{indent}sub.w {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
 
         if jump_if_empty is not None:
             yield f"cbz {loop_cnt}, {jump_if_empty}"
@@ -499,7 +499,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
             assert unroll in [1,2,4,8,16,32]
             yield f"{indent}lsr {loop_cnt}, {loop_cnt}, #{int(math.log2(unroll))}"
         if fixup != 0:
-            yield f"{indent}sub {loop_cnt}, {loop_cnt}, #{fixup}"
+            yield f"{indent}sub.w {loop_cnt}, {loop_cnt}, #{fixup}"
         if jump_if_empty is not None:
             yield f"cbz {loop_cnt}, {jump_if_empty}"
         yield f"{self.lbl_start}:"
@@ -1079,6 +1079,10 @@ def make(cls, src):
         return Armv7mInstruction.build(cls, src)
 
     def write(self):
+        # Default to .w for all instructions for better performance
+        # TODO: find a more principled way to do this
+        self.width = ".w"
+
         out = self.pattern
         l = list(zip(self.args_in, self.pattern_inputs))     + \
             list(zip(self.args_out, self.pattern_outputs))   + \