lora微调时，将batch_size调为64后，学习率应该怎么调，我的学习率10点多，下不去。并且训练途中在验证数据集测试时，模型输出为空，然后就会报错： #1279

Jasmine0224 · 2024-06-25T09:57:49Z

Jasmine0224
Jun 25, 2024

╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /data/zll/ChatGLM3/finetune_demo/finetune_hf.py:536 in main │
│ │
│ 533 │ ) │
│ 534 │ │
│ 535 │ if auto_resume_from_checkpoint.upper() == "" or auto_resume_from_c │
│ ❱ 536 │ │ trainer.train() │
│ 537 │ else: │
│ 538 │ │ output_dir = ft_config.training_args.output_dir │
│ 539 │ │ dirlist = os.listdir(output_dir) │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/transformers/trainer.py │
│ :1859 in train │
│ │
│ 1856 │ │ │ finally: │
│ 1857 │ │ │ │ hf_hub_utils.enable_progress_bars() │
│ 1858 │ │ else: │
│ ❱ 1859 │ │ │ return inner_training_loop( │
│ 1860 │ │ │ │ args=args, │
│ 1861 │ │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1862 │ │ │ │ trial=trial, │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/transformers/trainer.py │
│ :2278 in _inner_training_loop │
│ │
│ 2275 │ │ │ │ │ self.state.epoch = epoch + (step + 1 + steps_skip │
│ 2276 │ │ │ │ │ self.control = self.callback_handler.on_step_end( │
│ 2277 │ │ │ │ │ │
│ ❱ 2278 │ │ │ │ │ self._maybe_log_save_evaluate(tr_loss, grad_norm, │
│ 2279 │ │ │ │ else: │
│ 2280 │ │ │ │ │ self.control = self.callback_handler.on_substep_e │
│ 2281 │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/transformers/trainer.py │
│ :2662 in _maybe_log_save_evaluate │
│ │
│ 2659 │ │ │
│ 2660 │ │ metrics = None │
│ 2661 │ │ if self.control.should_evaluate: │
│ ❱ 2662 │ │ │ metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) │
│ 2663 │ │ │ self._report_to_hp_search(trial, self.state.global_step, │
│ 2664 │ │ │ │
│ 2665 │ │ │ # Run delayed LR scheduler now that metrics are populated │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/transformers/trainer_se │
│ q2seq.py:180 in evaluate │
│ │
│ 177 │ │ # We don't want to drop samples in general │
│ 178 │ │ self.gather_function = self.accelerator.gather │
│ 179 │ │ self._gen_kwargs = gen_kwargs │
│ ❱ 180 │ │ return super().evaluate(eval_dataset, ignore_keys=ignore_keys, │
│ 181 │ │
│ 182 │ def predict( │
│ 183 │ │ self, │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/transformers/trainer.py │
│ :3467 in evaluate │
│ │
│ 3464 │ │ start_time = time.time() │
│ 3465 │ │ │
│ 3466 │ │ eval_loop = self.prediction_loop if self.args.use_legacy_pred │
│ ❱ 3467 │ │ output = eval_loop( │
│ 3468 │ │ │ eval_dataloader, │
│ 3469 │ │ │ description="Evaluation", │
│ 3470 │ │ │ # No point gathering the predictions if there are no metr │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/transformers/trainer.py │
│ :3719 in evaluation_loop │
│ │
│ 3716 │ │ │ │ │ EvalPrediction(predictions=all_preds, label_ids=a │
│ 3717 │ │ │ │ ) │
│ 3718 │ │ │ else: │
│ ❱ 3719 │ │ │ │ metrics = self.compute_metrics(EvalPrediction(predict │
│ 3720 │ │ else: │
│ 3721 │ │ │ metrics = {} │
│ 3722 │
│ │
│ /data/zll/ChatGLM3/finetune_demo/finetune_hf.py:429 in compute_metrics │
│ │
│ 426 │ │ pred_tokens = list(jieba.cut(pred_txt)) │
│ 427 │ │ label_tokens = list(jieba.cut(label_txt)) │
│ 428 │ │ rouge = Rouge() │
│ ❱ 429 │ │ scores = rouge.get_scores(' '.join(pred_tokens), ' '.join(labe │
│ 430 │ │ for k, v in scores[0].items(): │
│ 431 │ │ │ metrics_dct[k].append(round(v['f'] * 100, 4)) │
│ 432 │ │ metrics_dct['bleu-4'].append( │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/rouge_chinese/rouge.py: │
│ 116 in get_scores │
│ │
│ 113 │ │ assert(len(hyps) == len(refs)) │
│ 114 │ │ │
│ 115 │ │ if not avg: │
│ ❱ 116 │ │ │ return self._get_scores(hyps, refs) │
│ 117 │ │ return self._get_avg_scores(hyps, refs) │
│ 118 │ │
│ 119 │ def _get_scores(self, hyps, refs): │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/rouge_chinese/rouge.py: │
│ 129 in _get_scores │
│ │
│ 126 │ │ │ │
│ 127 │ │ │ for m in self.metrics: │
│ 128 │ │ │ │ fn = Rouge.AVAILABLE_METRICS[m] │
│ ❱ 129 │ │ │ │ sc = fn( │
│ 130 │ │ │ │ │ hyp, │
│ 131 │ │ │ │ │ ref, │
│ 132 │ │ │ │ │ raw_results=self.raw_results, │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/rouge_chinese/rouge.py: │
│ 54 in <lambda> │
│ │
│ 51 class Rouge: │
│ 52 │ DEFAULT_METRICS = ["rouge-1", "rouge-2", "rouge-l"] │
│ 53 │ AVAILABLE_METRICS = { │
│ ❱ 54 │ │ "rouge-1": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, │
│ 55 │ │ "rouge-2": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, │
│ 56 │ │ "rouge-3": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, │
│ 57 │ │ "rouge-4": lambda hyp, ref, **k: rouge_score.rouge_n(hyp, ref, │
│ │
│ /root/anaconda3/envs/llm/lib/python3.9/site-packages/rouge_chinese/rouge_sco │
│ re.py:253 in rouge_n │
│ │
│ 250 │ ValueError: raises exception if a param has len <= 0 │
│ 251 │ """ │
│ 252 │ if len(evaluated_sentences) <= 0: │
│ ❱ 253 │ │ raise ValueError("Hypothesis is empty.") │
│ 254 │ if len(reference_sentences) <= 0: │
│ 255 │ │ raise ValueError("Reference is empty.") │
│ 256

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

lora微调时，将batch_size调为64后，学习率应该怎么调，我的学习率10点多，下不去。并且训练途中在验证数据集测试时，模型输出为空，然后就会报错： #1279

lora微调时，将batch_size调为64后，学习率应该怎么调，我的学习率10点多，下不去。并且训练途中在验证数据集测试时，模型输出为空，然后就会报错： #1279

Replies: 0 comments

Select a reply

lora微调时，将batch_size调为64后，学习率应该怎么调，我的学习率10点多，下不去。并且训练途中在验证数据集测试时，模型输出为空，然后就会报错： #1279

Jasmine0224 Jun 25, 2024

Replies: 0 comments

Jasmine0224
Jun 25, 2024