diff --git a/tests/unit/compiler/venom/test_duplicate_operands.py b/tests/unit/compiler/venom/test_duplicate_operands.py
index fbff0835d2..ab55649dae 100644
--- a/tests/unit/compiler/venom/test_duplicate_operands.py
+++ b/tests/unit/compiler/venom/test_duplicate_operands.py
@@ -1,6 +1,8 @@
 from vyper.compiler.settings import OptimizationLevel
 from vyper.venom import generate_assembly_experimental
+from vyper.venom.analysis.analysis import IRAnalysesCache
 from vyper.venom.context import IRContext
+from vyper.venom.passes.store_expansion import StoreExpansionPass
 
 
 def test_duplicate_operands():
@@ -13,7 +15,7 @@ def test_duplicate_operands():
     %3 = mul %1, %2
     stop
 
-    Should compile to: [PUSH1, 10, DUP1, DUP1, DUP1, ADD, MUL, POP, STOP]
+    Should compile to: [PUSH1, 10, DUP1, DUP2, ADD, MUL, POP, STOP]
     """
     ctx = IRContext()
     fn = ctx.create_function("test")
@@ -23,5 +25,9 @@ def test_duplicate_operands():
     bb.append_instruction("mul", sum_, op)
     bb.append_instruction("stop")
 
-    asm = generate_assembly_experimental(ctx, optimize=OptimizationLevel.GAS)
-    assert asm == ["PUSH1", 10, "DUP1", "DUP1", "ADD", "MUL", "POP", "STOP"]
+    ac = IRAnalysesCache(fn)
+    StoreExpansionPass(ac, fn).run_pass()
+
+    optimize = OptimizationLevel.GAS
+    asm = generate_assembly_experimental(ctx, optimize=optimize)
+    assert asm == ["PUSH1", 10, "DUP1", "DUP2", "ADD", "MUL", "POP", "STOP"]
diff --git a/tests/unit/compiler/venom/test_stack_cleanup.py b/tests/unit/compiler/venom/test_stack_cleanup.py
index 6015cf1c41..7198861771 100644
--- a/tests/unit/compiler/venom/test_stack_cleanup.py
+++ b/tests/unit/compiler/venom/test_stack_cleanup.py
@@ -9,7 +9,8 @@ def test_cleanup_stack():
     bb = fn.get_basic_block()
     ret_val = bb.append_instruction("param")
     op = bb.append_instruction("store", 10)
-    bb.append_instruction("add", op, op)
+    op2 = bb.append_instruction("store", op)
+    bb.append_instruction("add", op, op2)
     bb.append_instruction("ret", ret_val)
 
     asm = generate_assembly_experimental(ctx, optimize=OptimizationLevel.GAS)
diff --git a/vyper/venom/__init__.py b/vyper/venom/__init__.py
index afd79fc44f..a5f51b787d 100644
--- a/vyper/venom/__init__.py
+++ b/vyper/venom/__init__.py
@@ -12,13 +12,13 @@
 from vyper.venom.passes.algebraic_optimization import AlgebraicOptimizationPass
 from vyper.venom.passes.branch_optimization import BranchOptimizationPass
 from vyper.venom.passes.dft import DFTPass
-from vyper.venom.passes.extract_literals import ExtractLiteralsPass
 from vyper.venom.passes.make_ssa import MakeSSA
 from vyper.venom.passes.mem2var import Mem2Var
 from vyper.venom.passes.remove_unused_variables import RemoveUnusedVariablesPass
 from vyper.venom.passes.sccp import SCCP
 from vyper.venom.passes.simplify_cfg import SimplifyCFGPass
 from vyper.venom.passes.store_elimination import StoreElimination
+from vyper.venom.passes.store_expansion import StoreExpansionPass
 from vyper.venom.venom_to_assembly import VenomCompiler
 
 DEFAULT_OPT_LEVEL = OptimizationLevel.default()
@@ -54,8 +54,9 @@ def _run_passes(fn: IRFunction, optimize: OptimizationLevel) -> None:
     SimplifyCFGPass(ac, fn).run_pass()
     AlgebraicOptimizationPass(ac, fn).run_pass()
     BranchOptimizationPass(ac, fn).run_pass()
-    ExtractLiteralsPass(ac, fn).run_pass()
     RemoveUnusedVariablesPass(ac, fn).run_pass()
+
+    StoreExpansionPass(ac, fn).run_pass()
     DFTPass(ac, fn).run_pass()
 
 
diff --git a/vyper/venom/analysis/equivalent_vars.py b/vyper/venom/analysis/equivalent_vars.py
new file mode 100644
index 0000000000..9b0c03e3d1
--- /dev/null
+++ b/vyper/venom/analysis/equivalent_vars.py
@@ -0,0 +1,41 @@
+from vyper.venom.analysis.analysis import IRAnalysis
+from vyper.venom.analysis.dfg import DFGAnalysis
+from vyper.venom.basicblock import IRVariable
+
+
+class VarEquivalenceAnalysis(IRAnalysis):
+    """
+    Generate equivalence sets of variables. This is used to avoid swapping
+    variables which are the same during venom_to_assembly. Theoretically,
+    the DFTPass should order variable declarations optimally, but it is
+    not aware of the "pickaxe" heuristic in venom_to_assembly, so they can
+    interfere.
+    """
+
+    def analyze(self):
+        """Map every `store` output and its source operand to a bag id."""
+        dfg = self.analyses_cache.request_analysis(DFGAnalysis)
+
+        equivalence_set: dict[IRVariable, int] = {}
+
+        for bag, (var, inst) in enumerate(dfg._dfg_outputs.items()):
+            if inst.opcode != "store":
+                continue
+
+            source = inst.operands[0]
+
+            # invariant: `var` was not the source of an earlier-visited store
+            assert var not in equivalence_set
+            # join the source's bag, opening a new one if it has none yet
+            equivalence_set[var] = equivalence_set.setdefault(source, bag)
+
+        self._equivalence_set = equivalence_set
+
+    def equivalent(self, var1, var2):
+        """Return True if the operands are equal or share a bag."""
+        if var1 == var2:
+            return True  # reflexive, even for operands no store touches
+        bags = self._equivalence_set
+        if var1 not in bags or var2 not in bags:
+            return False
+        return bags[var1] == bags[var2]
diff --git a/vyper/venom/passes/extract_literals.py b/vyper/venom/passes/store_expansion.py
similarity index 73%
rename from vyper/venom/passes/extract_literals.py
rename to vyper/venom/passes/store_expansion.py
index 91c0813e67..7718e67d33 100644
--- a/vyper/venom/passes/extract_literals.py
+++ b/vyper/venom/passes/store_expansion.py
@@ -1,12 +1,13 @@
 from vyper.venom.analysis.dfg import DFGAnalysis
 from vyper.venom.analysis.liveness import LivenessAnalysis
-from vyper.venom.basicblock import IRInstruction, IRLiteral
+from vyper.venom.basicblock import IRInstruction, IRLiteral, IRVariable
 from vyper.venom.passes.base_pass import IRPass
 
 
-class ExtractLiteralsPass(IRPass):
+class StoreExpansionPass(IRPass):
     """
-    This pass extracts literals so that they can be reordered by the DFT pass
+    This pass extracts literals and variables so that they can be
+    reordered by the DFT pass
     """
 
     def run_pass(self):
@@ -20,7 +21,7 @@ def _process_bb(self, bb):
         i = 0
         while i < len(bb.instructions):
             inst = bb.instructions[i]
-            if inst.opcode in ("store", "offset"):
+            if inst.opcode in ("store", "offset", "phi", "param"):
                 i += 1
                 continue
 
@@ -29,9 +30,11 @@ def _process_bb(self, bb):
                 if inst.opcode == "log" and j == 0:
                     continue
 
-                if isinstance(op, IRLiteral):
+                if isinstance(op, (IRVariable, IRLiteral)):
                     var = self.function.get_next_variable()
                     to_insert = IRInstruction("store", [op], var)
                     bb.insert_instruction(to_insert, index=i)
                     inst.operands[j] = var
+                    i += 1
+
             i += 1
diff --git a/vyper/venom/venom_to_assembly.py b/vyper/venom/venom_to_assembly.py
index 390fab8e7c..45b307d7b3 100644
--- a/vyper/venom/venom_to_assembly.py
+++ b/vyper/venom/venom_to_assembly.py
@@ -12,6 +12,7 @@
 )
 from vyper.utils import MemoryPositions, OrderedSet
 from vyper.venom.analysis.analysis import IRAnalysesCache
+from vyper.venom.analysis.equivalent_vars import VarEquivalenceAnalysis
 from vyper.venom.analysis.liveness import LivenessAnalysis
 from vyper.venom.basicblock import (
     IRBasicBlock,
@@ -25,6 +26,10 @@
 from vyper.venom.passes.normalization import NormalizationPass
 from vyper.venom.stack_model import StackModel
 
+DEBUG_SHOW_COST = False
+if DEBUG_SHOW_COST:
+    import sys
+
 # instructions which map one-to-one from venom to EVM
 _ONE_TO_ONE_INSTRUCTIONS = frozenset(
     [
@@ -152,6 +157,7 @@ def generate_evm(self, no_optimize: bool = False) -> list[str]:
 
                 NormalizationPass(ac, fn).run_pass()
                 self.liveness_analysis = ac.request_analysis(LivenessAnalysis)
+                self.equivalence = ac.request_analysis(VarEquivalenceAnalysis)
 
                 assert fn.normalized, "Non-normalized CFG!"
 
@@ -220,7 +226,11 @@ def _stack_reorder(
             if depth == final_stack_depth:
                 continue
 
-            if op == stack.peek(final_stack_depth):
+            to_swap = stack.peek(final_stack_depth)
+            if self.equivalence.equivalent(op, to_swap):
+                # perform a "virtual" swap
+                stack.poke(final_stack_depth, op)
+                stack.poke(depth, to_swap)
                 continue
 
             cost += self.swap(assembly, stack, depth)
@@ -240,19 +250,14 @@ def _emit_input_operands(
         # been scheduled to be killed. now it's just a matter of emitting
         # SWAPs, DUPs and PUSHes until we match the `ops` argument
 
-        # dumb heuristic: if the top of stack is not wanted here, swap
-        # it with something that is wanted
-        if ops and stack.height > 0 and stack.peek(0) not in ops:
-            for op in ops:
-                if isinstance(op, IRVariable) and op not in next_liveness:
-                    self.swap_op(assembly, stack, op)
-                    break
+        # to validate store expansion invariant -
+        # each op is emitted at most once.
+        seen: set[IROperand] = set()
 
-        emitted_ops = OrderedSet[IROperand]()
         for op in ops:
             if isinstance(op, IRLabel):
-                # invoke emits the actual instruction itself so we don't need to emit it here
-                # but we need to add it to the stack map
+                # invoke emits the actual instruction itself so we don't need
+                # to emit it here but we need to add it to the stack map
                 if inst.opcode != "invoke":
                     assembly.append(f"_sym_{op.value}")
                 stack.push(op)
@@ -267,13 +272,12 @@ def _emit_input_operands(
                 stack.push(op)
                 continue
 
-            if op in next_liveness and op not in emitted_ops:
-                self.dup_op(assembly, stack, op)
-
-            if op in emitted_ops:
+            if op in next_liveness:
                 self.dup_op(assembly, stack, op)
 
-            emitted_ops.add(op)
+            # guaranteed by store expansion
+            assert op not in seen, (op, seen)
+            seen.add(op)
 
     def _generate_evm_for_basicblock_r(
         self, asm: list, basicblock: IRBasicBlock, stack: StackModel
@@ -282,6 +286,12 @@ def _generate_evm_for_basicblock_r(
             return
         self.visited_basicblocks.add(basicblock)
 
+        if DEBUG_SHOW_COST:
+            print(basicblock, file=sys.stderr)
+
+        ref = asm
+        asm = []
+
         # assembly entry point into the block
         asm.append(f"_sym_{basicblock.label}")
         asm.append("JUMPDEST")
@@ -297,8 +307,14 @@ def _generate_evm_for_basicblock_r(
 
             asm.extend(self._generate_evm_for_instruction(inst, stack, next_liveness))
 
+        if DEBUG_SHOW_COST:
+            print(" ".join(map(str, asm)), file=sys.stderr)
+            print("\n", file=sys.stderr)
+
+        ref.extend(asm)
+
         for bb in basicblock.reachable:
-            self._generate_evm_for_basicblock_r(asm, bb, stack.copy())
+            self._generate_evm_for_basicblock_r(ref, bb, stack.copy())
 
     # pop values from stack at entry to bb
     # note this produces the same result(!) no matter which basic block
@@ -380,6 +396,7 @@ def _generate_evm_for_instruction(
             # find an instance of %13 *or* %14 in the stack and replace it with %56.
             to_be_replaced = stack.peek(depth)
             if to_be_replaced in next_liveness:
+                # this branch seems unreachable (maybe due to make_ssa)
                 # %13/%14 is still live(!), so we make a copy of it
                 self.dup(assembly, stack, depth)
                 stack.poke(0, ret)
@@ -421,6 +438,13 @@ def _generate_evm_for_instruction(
             if cost_with_swap > cost_no_swap:
                 operands[-1], operands[-2] = operands[-2], operands[-1]
 
+        cost = self._stack_reorder([], stack, operands, dry_run=True)
+        if DEBUG_SHOW_COST and cost:
+            print("ENTER", inst, file=sys.stderr)
+            print("  HAVE", stack, file=sys.stderr)
+            print("  WANT", operands, file=sys.stderr)
+            print("  COST", cost, file=sys.stderr)
+
         # final step to get the inputs to this instruction ordered
         # correctly on the stack
         self._stack_reorder(assembly, stack, operands)
@@ -537,10 +561,21 @@ def _generate_evm_for_instruction(
             if inst.output not in next_liveness:
                 self.pop(assembly, stack)
             else:
-                # peek at next_liveness to find the next scheduled item,
-                # and optimistically swap with it
+                # heuristic: peek at next_liveness to find the next scheduled
+                # item, and optimistically swap with it
+                if DEBUG_SHOW_COST:
+                    stack0 = stack.copy()
+
                 next_scheduled = next_liveness.last()
-                self.swap_op(assembly, stack, next_scheduled)
+                cost = 0
+                if not self.equivalence.equivalent(inst.output, next_scheduled):
+                    cost = self.swap_op(assembly, stack, next_scheduled)
+
+                if DEBUG_SHOW_COST and cost != 0:
+                    print("ENTER", inst, file=sys.stderr)
+                    print("  HAVE", stack0, file=sys.stderr)
+                    print("  NEXT LIVENESS", next_liveness, file=sys.stderr)
+                    print("  NEW_STACK", stack, file=sys.stderr)
 
         return apply_line_numbers(inst, assembly)
 
@@ -562,7 +597,7 @@ def dup(self, assembly, stack, depth):
         assembly.append(_evm_dup_for(depth))
 
     def swap_op(self, assembly, stack, op):
-        self.swap(assembly, stack, stack.get_depth(op))
+        return self.swap(assembly, stack, stack.get_depth(op))  # propagate swap's cost
 
     def dup_op(self, assembly, stack, op):
         self.dup(assembly, stack, stack.get_depth(op))