main.py
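"""Entry point for SSL training/evaluation.

Builds the data module and learner from a hydra config, trains with
pytorch_lightning, averages the last few epoch checkpoints, and finally
tests the averaged model (WER computation) on a single device.
"""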
import logging
import os

import hydra
import torch
from hydra.utils import instantiate
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

from data.data_module import DataModule
from semi_learner import SSLLearner
from utils.utils import average_checkpoints

# global setup: silence wandb and stop lightning logs propagating to the root logger
os.environ["WANDB_SILENT"] = "true"
logging.getLogger("lightning").propagate = False


@hydra.main(config_path="conf", config_name="config")
def main(cfg):
    if cfg.fix_seed:
        seed_everything(42, workers=True)

    # record the SLURM job ID and the number of visible GPUs in the config
    print("The SLURM job ID for this run is {}".format(os.environ["SLURM_JOB_ID"]))
    cfg.slurm_job_id = os.environ["SLURM_JOB_ID"]
    cfg.gpus = torch.cuda.device_count()
    print("num gpus:", cfg.gpus)

    # optional wandb logger, instantiated from the hydra config
    wandb_logger = None
    if cfg.log_wandb:
        wandb_logger = instantiate(cfg.logger)

    torch.set_float32_matmul_precision(precision=cfg.matmul_precision)

    data_module = DataModule(cfg)
    learner = SSLLearner(cfg)

    # keep per-epoch checkpoints; "{epoch}" expands to filenames like "epoch=3.ckpt",
    # which the averaging step below depends on
    ckpt_callback = ModelCheckpoint(
        monitor=cfg.checkpoint.monitor,
        mode=cfg.checkpoint.mode,
        dirpath=os.path.join(cfg.checkpoint.dirpath, cfg.experiment_name) if cfg.checkpoint.dirpath else None,
        save_last=True,
        filename="{epoch}",
        save_top_k=cfg.checkpoint.save_top_k,
    )

    # checkpointing and LR monitoring are only attached when wandb logging is enabled
    callbacks = []
    if cfg.log_wandb:
        callbacks = [
            ckpt_callback,
            LearningRateMonitor(logging_interval=cfg.logging.logging_interval),
        ]

    trainer = Trainer(
        **cfg.trainer,
        logger=wandb_logger,
        callbacks=callbacks,
    )

    if cfg.test:
        trainer.test(learner, datamodule=data_module)
    else:
        if not cfg.test_avg:
            trainer.fit(learner, data_module, ckpt_path=cfg.ckpt_path)
            torch.distributed.destroy_process_group()

        if trainer.is_global_zero:
            # average the weights of the last `cfg.model.avg_ckpts` epoch checkpoints
            last = [
                os.path.join(
                    cfg.checkpoint.dirpath, cfg.experiment_name, f"epoch={n}.ckpt"
                )
                for n in range(trainer.max_epochs - cfg.model.avg_ckpts, trainer.max_epochs)
            ]
            avg = average_checkpoints(last)
            model_path = os.path.join(
                cfg.checkpoint.dirpath, cfg.experiment_name, f"model_avg_{cfg.model.avg_ckpts}.pth"
            )
            torch.save(avg, model_path)

            # compute WER with the averaged model on a single device
            cfg.gpus = cfg.trainer.devices = cfg.trainer.num_nodes = 1
            cfg.model.pretrained_model_path = model_path
            cfg.model.transfer_only_encoder = False
            data_module = DataModule(cfg)
            learner = SSLLearner(cfg)
            trainer = Trainer(**cfg.trainer, logger=wandb_logger)
            trainer.test(learner, datamodule=data_module)


if __name__ == "__main__":
    main()
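

# Example hydra launches (hypothetical override values; the actual keys and
# defaults live in conf/config.yaml and must match that file):
#   python main.py experiment_name=ssl_run log_wandb=true
#   python main.py test=true  # evaluate only, skipping training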