linkedin · shimizust · Aug 30, 2024 · Aug 30, 2024 · Aug 30, 2024
diff --git a/benchmark/benchmark_layer_norm.py b/benchmark/benchmark_layer_norm.py
@@ -15,9 +15,9 @@
             x_vals=[2**i for i in range(10, 15)],
             xlabel="hidden size",
             line_arg="provider",
-            line_vals=["liger", "huggingface", "torch_compile"],
-            line_names=["Liger", "Hugging Face", "Torch Compile"],
-            styles=[("blue", "solid"), ("orange", "solid"), ("green", "solid")],
+            line_vals=["liger", "huggingface"],
+            line_names=["Liger", "Hugging Face"],
+            styles=[("blue", "solid"), ("orange", "solid")],
             ylabel="time (ms)",
             plot_name="layernorm-fwd-speed-benchmark",
             args={"M": 4096, "dtype": torch.float32, "mode": "forward"},
@@ -27,9 +27,9 @@
             x_vals=[2**i for i in range(10, 15)],
             xlabel="hidden size",
             line_arg="provider",
-            line_vals=["liger", "huggingface", "torch_compile"],
-            line_names=["Liger", "Hugging Face", "Torch Compile"],
-            styles=[("blue", "solid"), ("orange", "solid"), ("green", "solid")],
+            line_vals=["liger", "huggingface"],
+            line_names=["Liger", "Hugging Face"],
+            styles=[("blue", "solid"), ("orange", "solid")],
             ylabel="time (ms)",
             plot_name="layernorm-full-speed-benchmark",
             args={"M": 4096, "dtype": torch.float32, "mode": "full"},
@@ -40,7 +40,6 @@ def bench_speed_layer_norm(M, N, dtype, provider, mode, eps=1e-6, device="cuda")
     x_shape = (M, N)
     triton_ln = LigerLayerNorm(hidden_size=N).to("cuda")
     torch_ln = torch.nn.LayerNorm(N, eps=eps).to("cuda")
-    torch_compile_ln = torch.compile(torch_ln)
 
     x = torch.randn(x_shape, dtype=dtype, device="cuda")
     dy = torch.randn_like(x)
@@ -52,8 +51,6 @@ def y_fwd():
             return triton_ln(x)
         if provider == "huggingface":
             return torch_ln(x)
-        if provider == "torch_compile":
-            return torch_compile_ln(x)
 
     if mode == "forward":
         ms, min_ms, max_ms = triton.testing.do_bench(
@@ -98,9 +95,9 @@ def benchmark_speed_layer_norm_wrapper():
             x_vals=[2**i for i in range(10, 15)],
             xlabel="hidden size",
             line_arg="provider",
-            line_vals=["liger", "huggingface", "torch_compile"],
-            line_names=["Liger", "Hugging Face", "Torch Compile"],
-            styles=[("blue", "solid"), ("orange", "solid"), ("green", "solid")],
+            line_vals=["liger", "huggingface"],
+            line_names=["Liger", "Hugging Face"],
+            styles=[("blue", "solid"), ("orange", "solid")],
             ylabel="GPU memory usage (MB)",
             plot_name="layernorm-full-memory-benchmark",
             args={"M": 4096, "dtype": torch.float32, "mode": "full"},
@@ -112,7 +109,6 @@ def bench_memory_layer_norm(M, N, dtype, provider, mode, eps=1e-6, device="cuda"
 
     triton_ln = LigerLayerNorm(hidden_size=N).to("cuda")
     torch_ln = torch.nn.LayerNorm(N, eps=eps).to("cuda")
-    torch_compile_ln = torch.compile(torch_ln)
 
     x = torch.randn(x_shape, dtype=dtype, device="cuda")
     dy = torch.randn_like(x)
@@ -123,8 +119,6 @@ def y_fwd():
             return triton_ln(x)
         if provider == "huggingface":
             return torch_ln(x)
-        if provider == "torch_compile":
-            return torch_compile_ln(x)
 
     def full():
         y = y_fwd()

diff --git a/benchmark/layer_norm_memory/layernorm-full-memory-benchmark.csv b/benchmark/layer_norm_memory/layernorm-full-memory-benchmark.csv
@@ -1,6 +1,6 @@
-N,Liger,Hugging Face,Torch Compile
-1024.000000,79.828906,79.188281,79.434375
-2048.000000,158.915625,157.634375,157.384375
-4096.000000,316.346875,313.784375,314.268750
-8192.000000,632.693750,627.568750,627.568750
-16384.000000,1265.387500,1255.137500,1255.137500
+N,Liger,Hugging Face
+1024.000000,79.305469,78.461719
+2048.000000,158.579688,156.892188
+4096.000000,317.128125,313.753125
+8192.000000,634.225000,627.475000
+16384.000000,1268.418750,1254.918750
diff --git a/benchmark/layer_norm_memory/layernorm-full-memory-benchmark.png b/benchmark/layer_norm_memory/layernorm-full-memory-benchmark.png
diff --git a/benchmark/layer_norm_speed/layernorm-full-speed-benchmark.csv b/benchmark/layer_norm_speed/layernorm-full-speed-benchmark.csv
@@ -1,6 +1,6 @@
-N,Liger,Hugging Face,Torch Compile
-1024.000000,0.294912,0.188416,0.269312
-2048.000000,0.535552,0.388096,0.314368
-4096.000000,0.413696,0.776192,0.697344
-8192.000000,0.819200,1.538048,1.406976
-16384.000000,2.682800,3.057728,3.043328
+N,Liger,Hugging Face
+1024.000000,0.309568,0.118880
+2048.000000,0.301280,0.196256
+4096.000000,0.309920,0.432384
+8192.000000,0.442400,0.830368
+16384.000000,1.154208,1.619392
diff --git a/benchmark/layer_norm_speed/layernorm-full-speed-benchmark.png b/benchmark/layer_norm_speed/layernorm-full-speed-benchmark.png
diff --git a/benchmark/layer_norm_speed/layernorm-fwd-speed-benchmark.csv b/benchmark/layer_norm_speed/layernorm-fwd-speed-benchmark.csv
@@ -1,6 +1,6 @@
-N,Liger,Hugging Face,Torch Compile
-1024.000000,0.041984,0.050176,0.043008
-2048.000000,0.079872,0.113664,0.084992
-4096.000000,0.158720,0.234496,0.203840
-8192.000000,0.316416,0.465920,0.455680
-16384.000000,0.628736,0.928768,0.923648
+N,Liger,Hugging Face
+1024.000000,0.031776,0.036512
+2048.000000,0.049696,0.057216
+4096.000000,0.088448,0.121440
+8192.000000,0.166176,0.257728
+16384.000000,0.322592,0.506080
diff --git a/benchmark/layer_norm_speed/layernorm-fwd-speed-benchmark.png b/benchmark/layer_norm_speed/layernorm-fwd-speed-benchmark.png