Merge branch 'main' into embedding
AndreSlavescu authored Aug 29, 2024
2 parents f331bc7 + e5d6ad7 commit f0f4389
Showing 7 changed files with 33 additions and 4 deletions.
9 changes: 9 additions & 0 deletions examples/huggingface/README.md
@@ -31,3 +31,12 @@ Throughput improves by around 10%, while GPU memory usage drops by 50%.

![Throughput](img/qwen_tps.png)
![GPU Memory Allocated](img/qwen_mem_alloc.png)


### Gemma 7B
Benchmark conditions: Gemma-7B, Alpaca Dataset, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.

Throughput improves by around 24%, while GPU memory usage drops by 33%.

![Throughput](img/gemma_7b_tp.png)
![GPU Memory Allocated](img/gemma_7b_mem.png)
Binary file added examples/huggingface/img/gemma_7b_mem.png
Binary file added examples/huggingface/img/gemma_7b_tp.png
20 changes: 20 additions & 0 deletions examples/huggingface/run_gemma.sh
@@ -0,0 +1,20 @@
torchrun --nnodes=1 --nproc-per-node=4 training.py \
--model_name "google/gemma-7b-it" \
--bf16 \
--max_steps 20 \
--per_device_train_batch_size 24 \
--per_device_eval_batch_size 1 \
--eval_strategy "no" \
--save_strategy "no" \
--learning_rate 6e-6 \
--weight_decay 0.05 \
--warmup_ratio 0.1 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--include_num_input_tokens_seen \
--report_to none \
--fsdp "full_shard auto_wrap" \
--fsdp_config config/fsdp_config.json \
--seed 42 \
--use_liger True \
--output_dir alpaca_finetuning
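
The launcher points --fsdp_config at config/fsdp_config.json, which ships with the example but is not part of this diff. As a rough sketch only, the same settings can be passed to the Hugging Face Trainer programmatically; the wrapped layer class and checkpointing flag below are assumptions chosen to match the benchmark conditions, not the contents of the actual file.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="alpaca_finetuning",
    bf16=True,
    fsdp="full_shard auto_wrap",  # matches the flags in run_gemma.sh
    fsdp_config={
        # Assumed values; the shipped config/fsdp_config.json may differ.
        "transformer_layer_cls_to_wrap": ["GemmaDecoderLayer"],
        "activation_checkpointing": True,  # gradient checkpointing, per the README
    },
)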
4 changes: 2 additions & 2 deletions examples/huggingface/training.py
@@ -53,8 +53,8 @@ def train():
torch_dtype=torch.bfloat16,
# These args will get passed to the appropriate apply_liger_kernel_to_* function
# to override the default settings
- cross_entropy=True,
- fused_linear_cross_entropy=False,
+ # cross_entropy=True,
+ # fused_linear_cross_entropy=False,
)
else:
model = transformers.AutoModelForCausalLM.from_pretrained(
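
In this example script, kernel-override kwargs are forwarded to the matching apply_liger_kernel_to_* function, so commenting them out (as this hunk does) falls back to the per-model defaults. A minimal sketch of that call site, assuming the example's AutoLigerKernelForCausalLM loader:

import torch
from liger_kernel.transformers import AutoLigerKernelForCausalLM

model = AutoLigerKernelForCausalLM.from_pretrained(
    "google/gemma-7b-it",
    torch_dtype=torch.bfloat16,
    # Uncomment to override the per-model defaults; these kwargs are
    # forwarded to the appropriate apply_liger_kernel_to_* function.
    # cross_entropy=True,
    # fused_linear_cross_entropy=False,
)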
2 changes: 1 addition & 1 deletion setup.py
@@ -1,6 +1,6 @@
from setuptools import find_namespace_packages, setup

__version__ = "0.2.0"
__version__ = "0.2.1"

setup(
name="liger_kernel",
2 changes: 1 addition & 1 deletion src/liger_kernel/transformers/monkey_patch.py
@@ -128,7 +128,7 @@ def apply_liger_kernel_to_mixtral(

def apply_liger_kernel_to_gemma(
rope: bool = True,
- cross_entropy: bool = True,
+ cross_entropy: bool = False,
fused_linear_cross_entropy: bool = True,
rms_norm: bool = True,
geglu: bool = True,
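
With this change, Gemma defaults to the fused linear + cross-entropy kernel (fused_linear_cross_entropy=True) instead of the standalone cross-entropy kernel. A minimal sketch of calling the patch API directly with the signature shown above; patching must happen before the model is instantiated:

import torch
import transformers
from liger_kernel.transformers import apply_liger_kernel_to_gemma

# Defaults now apply rope, rms_norm, geglu, and the fused linear
# cross-entropy kernel; the standalone cross-entropy kernel stays off.
apply_liger_kernel_to_gemma()

model = transformers.AutoModelForCausalLM.from_pretrained(
    "google/gemma-7b-it", torch_dtype=torch.bfloat16
)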
