Commit

Merge pull request #15 from PicoCreator/dev-infctx-hyperparam-helper
Dev infctx hyperparam helper
PicoCreator authored Jul 11, 2023
2 parents f4da8ae + 584cdfb commit 86751c2
Showing 4 changed files with 26 additions and 19 deletions.
4 changes: 2 additions & 2 deletions RWKV-v4neo/src/data.py
@@ -322,8 +322,8 @@ def __init__(
        self._loaded_dataset = None

        # Log to wandb
-       wandb.config.update({ "data":dict(self._init_locals) })
-
+       if wandb.run is not None:
+           wandb.config.update({ "data":dict(self._init_locals) })

    # Called once for initial setup
    def prepare_data(self):
3 changes: 2 additions & 1 deletion RWKV-v4neo/src/model.py
@@ -521,7 +521,8 @@ def configure_optimizers(self):
        model_args["__lr_final"] = lr_final

        # Update WANDB
-       wandb.config.update({ "model": model_args })
+       if wandb.run is not None:
+           wandb.config.update({ "model": model_args })

        # Setup layerwise learning rate
        if self.layerwise_lr:
13 changes: 7 additions & 6 deletions RWKV-v4neo/src/trainer.py
@@ -69,12 +69,13 @@ def __init__(

        # Update WANDB config
        # ---
-       trainer_config["target_batch_size"] = target_batch_size
-       del trainer_config["logger"]
-       del trainer_config["callbacks"]
-       wandb.config.update({
-           "trainer": trainer_config
-       })
+       if wandb.run is not None:
+           trainer_config["target_batch_size"] = target_batch_size
+           del trainer_config["logger"]
+           del trainer_config["callbacks"]
+           wandb.config.update({
+               "trainer": trainer_config
+           })

        # Call the parent constructor
        super().__init__(*args, **kwargs)
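
All three Python files make the same fix: wandb.config.update now runs only when a wandb run is actually active, so training with wandb logging disabled no longer trips over the config update. A minimal sketch of the pattern, assuming only that the wandb package is installed; the log_section helper and its example arguments are illustrative, not part of the repository:

import wandb

def log_section(section: str, values: dict) -> None:
    """Record one config section to wandb, but only when a run is active."""
    # wandb.run is None when wandb.init() has not been called,
    # for example when the logger is disabled for a quick local run.
    # Updating wandb.config in that state can raise an error, so the
    # guard simply skips the update instead.
    if wandb.run is not None:
        wandb.config.update({section: values})

# With no active run this is a no-op; after wandb.init() it records the values.
log_section("trainer", {"target_batch_size": 128})
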
25 changes: 15 additions & 10 deletions notebook/trainer-validation/config/baseline-1024.yaml
@@ -140,9 +140,10 @@ trainer:
  # This number is divided by the number of GPUs, and nodes configured
  # So if you have 4 GPUs, and 2 nodes, and this is configured as 128
  # Each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches
- target_batch_size: 16
+ target_batch_size: 128

- # Number of datasamples to accumulate before backproping
+ # # Number of datasamples to accumulate before backproping, per GPU
+ # # this can't be used with target_batch_size.
  # accumulate_grad_batches: -1

  # Various other settings, you probably want to leave alone
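
The target_batch_size comment above spells out how the effective batch is split across devices. A quick sketch of that arithmetic, following the comment's own interpretation; the function name and the divisibility check are illustrative and not taken from the trainer code:

def per_gpu_samples(target_batch_size: int, num_gpus: int, num_nodes: int) -> int:
    """Datasamples each GPU accumulates per optimizer step (via accumulate_grad_batches)."""
    devices = num_gpus * num_nodes
    if target_batch_size % devices != 0:
        raise ValueError("target_batch_size should divide evenly across all devices")
    return target_batch_size // devices

# The example from the comment: 4 GPUs x 2 nodes with target_batch_size 128
# gives 128 / 4 / 2 = 16 datasamples per GPU per step.
print(per_gpu_samples(128, num_gpus=4, num_nodes=2))  # 16
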
@@ -209,19 +210,23 @@ model:
  # Learning rate of the training process
  # ---

- # Learing rate on a per data sample basis
- # this is automatically multiplied by accumulate_grad_batches
- # to compute the actual lr_init and lr_final values
- sample_lr_init: 8e-5
- sample_lr_final: 5e-5
+ # Learing rate normalized to the target_batch_size
+ #
+ # If number of nodes & gpus is 1, this is the actual learning rate
+ # as the accumulate_grad_batches == target_batch_size
+ #
+ # Otherwise this is normalized using the following formula
+ # lr = lr / target_batch_size * accumulate_grad_batches
+ #
+ # This reduces the need to readjust the learning rate when changing
+ # the number of nodes & gpus used for training.
+ target_lr_init: 0.001
+ target_lr_final: 0.0001

  # # Initia learning rate of the process
  # lr_init: 6e-4
  # # Final learning rate after the learning rate period
  # # learning rate will stay at final value from then onwards
  # #
  # # NOTE: lr_final / lr_period does not work with warmup_steps
  # # and will be ignored (or replaced) with the warmup_steps logic instead
  # lr_final: 4e-4

  # Number of epoch to reduce the learning rate from lr_init to lr_final
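
The new target_lr_init / target_lr_final comments describe a learning rate that is normalized against target_batch_size. A small sketch of the formula exactly as the comment states it; the helper name is illustrative and the repository's actual implementation may differ in detail:

def normalized_lr(target_lr: float, target_batch_size: int, accumulate_grad_batches: int) -> float:
    """Scale the configured lr by the fraction of the target batch each step covers."""
    # From the config comment: lr = lr / target_batch_size * accumulate_grad_batches
    return target_lr / target_batch_size * accumulate_grad_batches

# Single GPU, single node: accumulate_grad_batches == target_batch_size,
# so the configured value is used as-is.
print(normalized_lr(0.001, 128, 128))  # 0.001

# 4 GPUs x 2 nodes: each GPU accumulates 128 / 8 = 16 batches per step,
# so the per-step learning rate is scaled down accordingly.
print(normalized_lr(0.001, 128, 16))   # 0.000125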