diff --git a/slothy/helper.py b/slothy/helper.py
index 3d386d09..a481e2fc 100644
--- a/slothy/helper.py
+++ b/slothy/helper.py
@@ -1212,12 +1212,28 @@ def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_pa
                                               include=include_paths)
             except subprocess.CalledProcessError as exc:
                 log.error("CPreprocessor failed on the following input")
-                log.error(SouceLine.write_multiline(source))
+                log.error(SourceLine.write_multiline(source))
                 raise LLVM_Mc_Error from exc
 
         if platform.system() == "Darwin":
             source = list(filter(lambda s: s.text.strip().startswith(".type") is False, source))
 
+
+        # Remove all width information - LLVM cannot handle .w for
+        # some instructions that only have a 32-bit encoding,
+        # e.g., uadd16.w works in gcc, but not LLVM.
+        # Unfortunately, for some instructions this depends
+        # on the registers used and, hence, adjusting the input to
+        # SLOTHY is not sufficient.
+        # Since we currently don't have a model of the instruction encodings,
+        # there is no principled way to reason about it.
+        if thumb:
+            for line in source:
+                instruction = line.text
+                instruction = instruction.replace(".w ", " ")
+                instruction = instruction.replace(".n ", " ")
+                line.set_text(instruction)
+
         code = SourceLine.write_multiline(source)
 
         log.debug(f"Calling LLVM MC assmelber on the following code")
@@ -1585,7 +1601,7 @@ def extract(source, lbl, forced_loop_type=None):
         """
         Find a loop with start label `lbl` in `source` and return it together
         with its type.
-        
+
             Args:
                 source: list of SourceLine objects
                 lbl: label of the loop to extract
diff --git a/slothy/targets/arm_v7m/arch_v7m.py b/slothy/targets/arm_v7m/arch_v7m.py
index 1de96011..35eaceb2 100644
--- a/slothy/targets/arm_v7m/arch_v7m.py
+++ b/slothy/targets/arm_v7m/arch_v7m.py
@@ -277,7 +277,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
         # if new_fixup != 0:
         #     yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{new_fixup}"
         if fixup != 0:
-            yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
+            yield f"{indent}sub.w {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
         #if new_fixup != 0 or fixup != 0:
         if fixup != 0:
             yield f"{indent}vmov {self.additional_data['endf']}, {self.additional_data['end']}"
@@ -383,7 +383,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
             yield f"{indent}vmov {loop_end_reg}, {loop_end_reg_fpr}"
 
         if fixup != 0:
-            yield f"{indent}sub {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}"
+            yield f"{indent}sub.w {loop_end_reg}, {loop_end_reg}, #{fixup*inc_per_iter}"
 
         if fixup != 0 and loop_end_reg_fpr is not None:
             yield f"{indent}vmov {loop_end_reg_fpr}, {loop_end_reg}"
@@ -457,7 +457,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
         #     yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{new_fixup}"
 
         if fixup != 0:
-            yield f"{indent}sub {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
+            yield f"{indent}sub.w {self.additional_data['end']}, {self.additional_data['end']}, #{fixup*inc_per_iter}"
 
         if jump_if_empty is not None:
             yield f"cbz {loop_cnt}, {jump_if_empty}"
@@ -499,7 +499,7 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
             assert unroll in [1,2,4,8,16,32]
             yield f"{indent}lsr {loop_cnt}, {loop_cnt}, #{int(math.log2(unroll))}"
         if fixup != 0:
-            yield f"{indent}sub {loop_cnt}, {loop_cnt}, #{fixup}"
+            yield f"{indent}sub.w {loop_cnt}, {loop_cnt}, #{fixup}"
         if jump_if_empty is not None:
             yield f"cbz {loop_cnt}, {jump_if_empty}"
         yield f"{self.lbl_start}:"
@@ -1079,6 +1079,10 @@ def make(cls, src):
         return Armv7mInstruction.build(cls, src)
 
     def write(self):
+        # Default to .w for all instructions for better performance
+        # TODO: find a more principled way to do this
+        self.width = ".w"
+
         out = self.pattern
         l = list(zip(self.args_in, self.pattern_inputs))     + \
             list(zip(self.args_out, self.pattern_outputs))   + \