diff --git a/README.ch.md b/README.ch.md
index a9b4e24f..7269dd67 100644
--- a/README.ch.md
+++ b/README.ch.md
@@ -17,6 +17,7 @@
- 数据: 数据集构建流程抽象、组合多个 DataLoader、...
- 分布式训练:同样支持多种训练加速框架,统一抽象,方便随时切换
- 更多工具类...
+- 尽可能多地支持现代 IDE 的自动补全
![lumo-framework](./images/lumo-intro.png)
@@ -33,6 +34,7 @@
- [More](#more)
- :pencil: [Acknowledge](#pencil-acknowledge)
- :scroll: [License](#scroll-license)
+- [完整文档](https://pytorch-lumo.github.io/lumo/)
# :cloud: 安装
diff --git a/README.md b/README.md
index 78ad25c3..f940e920 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ and focuses on enhancing the experience of deep learning practitioners.
- **Distributed Training:** Also supports multiple training acceleration frameworks, unified abstraction, and easy
switching at any time.
- More utilities...
+- **Type Hints:** Supports auto-completion in modern IDEs as much as possible.
![lumo-framework](./images/lumo-intro.png)
@@ -38,6 +39,7 @@ and focuses on enhancing the experience of deep learning practitioners.
- :small_orange_diamond: [re-run](#small_orange_diamond-re-run)
- :small_orange_diamond: [backup](#small_orange_diamond-backup)
- :scroll: [License](#scroll-license)
+- [Full Documentation](https://pytorch-lumo.github.io/lumo/)
# :cloud: Installation
@@ -65,9 +67,9 @@ Here are two classic scenarios:
## :small_orange_diamond: Embedding into Existing Projects
-For existing projects, you can quickly embed Lumo by following these steps:
+For existing projects, you can quickly embed `lumo` by following these steps:
-- Import Lumo and initialize Logger and Experiment:
+- Import `lumo` and initialize Logger and Experiment:
```python
import random
@@ -117,8 +119,9 @@ exp.end()
## :small_orange_diamond: Building from Scratch
-If you want to start a new deep learning experiment from scratch, you can use Lumo to accelerate your code development.
-Below are examples of Lumo training at different scales:
+If you want to start a new deep learning experiment from scratch, you can use `lumo` to accelerate your code
+development.
+Below are examples of `lumo` training at different scales:
one-file training:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 458e7782..a5fca1e0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -31,7 +31,9 @@ def extract_version():
import os
import sys
-sys.path.insert(0, os.path.abspath('../../src/'))
+from pathlib import Path
+
+# conf.py lives in docs/source/, so the repo's src/ directory is three levels up
+sys.path.insert(0, Path(__file__).parent.parent.parent.joinpath('src').as_posix())
+
+# sys.path.insert(0, os.path.abspath('../../src/'))
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -130,7 +132,7 @@ def setup(app: Sphinx):
# '.md': 'markdown',
# }
#
-commonmark_suffixes = ['.rst']
+# commonmark_suffixes = ['.rst']
source_parsers = {
'.md': CommonMarkParser,
diff --git a/docs/source/custom_rtd_theme/versions.html b/docs/source/custom_rtd_theme/versions.html
index b5921c17..b95d995f 100644
--- a/docs/source/custom_rtd_theme/versions.html
+++ b/docs/source/custom_rtd_theme/versions.html
@@ -5,7 +5,7 @@
Read the Docs
- v: {{ current_version }}
+ v: {{ current_version.name }}
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8774ecfb..64875423 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,16 +1,28 @@
.. lumo documentation master file, created by
   sphinx-quickstart on Sat Mar 18 14:41:26 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
Welcome to lumo's documentation!
================================
+.. toctree::
+ :maxdepth: 1
+ :caption: Tutorial
+
+ tutorial/reproducibility.md
+ tutorial/configuration.md
+ tutorial/dataset_builder.md
+
+
+
+
.. toctree::
:maxdepth: 2
-   :caption: Contents
-   ../tutorial/getting_start.md
+   :caption: Development
+
+
Indices and tables
==================
diff --git a/docs/source/others/why_lumo.md b/docs/source/others/why_lumo.md
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/source/tutorial/configuration.md b/docs/source/tutorial/configuration.md
new file mode 100644
index 00000000..6620484c
--- /dev/null
+++ b/docs/source/tutorial/configuration.md
@@ -0,0 +1,94 @@
+# Runtime Configuration and Params
+
+## Params
+
+`~lumo.Params` is used to specify the configuration required for the current experiment. Besides letting you define parameters with IDE autocompletion, it supports command-line arguments, inheritance, and reading from multiple configuration files.
+
+The simplest usage is as follows:
+
+```python
+from lumo import Params
+
+params = Params()
+params.lr = 1e-3
+params.dataset = 'cifar10'
+params.from_args() # python main.py --dataset=cifar100
+
+print(params.dataset)
+>>> "cifar100"
+```
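+
+Since parameters are plain attributes, they can also be declared in a `Params` subclass, which is what enables the IDE autocompletion mentioned above. A minimal sketch (`MyParams` is a hypothetical name):
+
+```python
+from lumo import Params
+
+class MyParams(Params):
+    def __init__(self):
+        super().__init__()
+        self.lr = 1e-3
+        self.dataset = 'cifar10'
+
+params = MyParams()
+params.from_args()
+```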
+
+Limit the value of parameters:
+
+```python
+params.dataset = params.choice('cifar10', 'cifar100')
+print(params.dataset)
+>>> "cifar10" # by default is the first value
+
+params.dataset = "imagenet"
+>>> raise BoundCheckError: value of param 'dataset' should in values ('cifar10',
+'cifar100'), but got imagenet
+```
+
+Read from other locations:
+
+```python
+params.from_json("*.json")
+params.from_yaml("*.yaml")
+params.from_yaml("*.yml")
+params.from_dict({})
+```
+
+`params.config` or `params.c` is a built-in reserved parameter. When its value is a string (or a list of strings) that points to an existing yaml or json file, the configuration is read from that location:
+
+```json
+# cfg.json
+{
+ "dataset": "cifar100"
+}
+```
+
+```python
+params.from_args(['--c', 'cfg.json'])
+print(params.dataset)
+>>> "cifar100"
+```
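+
+Based on the comma-splitting behavior of `from_args`, multiple configuration files can be passed as a comma-separated list; each file is loaded in order, so later files should override earlier ones for the same keys (the file names below are illustrative):
+
+```python
+params.from_args(['--c', 'base.yaml,override.json'])
+```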
+
+## Configuration
+
+`lumo` provides a multi-level configuration system, including three file locations:
+
+```
+~/.lumorc.json -> user-level
+/.lumorc.json -> repo-level, private
+/.lumorc.public.json -> repo-level, public
+```
+
+All configurations are loaded into `lumo.glob` at runtime for global settings:
+
+```python
+from lumo import glob
+
+glob['xxx']
+```
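+
+Assuming `glob` behaves like a dict, as the indexing above suggests, values loaded from `.lumorc.json` can be read or overridden at runtime, e.g. with the keys listed in the table below:
+
+```python
+from lumo import glob
+
+glob['timezone'] = 'Asia/Shanghai'             # override a setting at runtime
+token = glob.get('github_access_token', None)  # read with a fallback (assumes dict-like .get)
+```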
+
+## Difference between Configuration and Hyperparameters
+
+In `lumo`, configurations mostly cover things unrelated to the experiment itself but tied to the machine environment and `lumo`'s behavior, such as dataset locations and GitHub access tokens. All optional behaviors supported by `lumo` can be controlled by modifying the configuration in `glob`. The currently supported configuration items are:
+
+| Configuration | Description |
+| --- | --- |
+| github_access_token | Replaces the access_token parameter of the exp.backup() method. |
+| exp_root | One of several initial paths. |
+| db_root | One of several initial paths. |
+| progress_root | One of several initial paths. |
+| metric_root | One of several initial paths. |
+| cache_dir | One of several initial paths. |
+| blob_root | One of several initial paths. |
+| timezone | Determines the timezone used by lumo. Default is 'Asia/Shanghai'. |
+| TRAINER_LOGGER_STDIO | Controls whether the Logger outputs to the standard output stream. |
+| dev_branch | The branch used for saving code snapshots during version control. Default is 'lumo_experiments'. |
+| HOOK_LOCKFILE | Behavior control for loading LockFile ExpHook. |
+| HOOK_RECORDABORT | Behavior control for loading RecordAbort ExpHook. |
+| HOOK_GITCOMMIT | Behavior control for loading GitCommit ExpHook. |
+
diff --git a/docs/source/tutorial/dataset_builder.md b/docs/source/tutorial/dataset_builder.md
new file mode 100644
index 00000000..a6e9219c
--- /dev/null
+++ b/docs/source/tutorial/dataset_builder.md
@@ -0,0 +1,54 @@
+# Build your Dataset Easily
+
+`lumo` provides `~lumo.DatasetBuilder` as a unified interface for constructing datasets, which in most cases greatly reduces repetitive dataset design.
+
+Taking the CIFAR10 dataset as an example, if the dataset requires images to be output with two augmentations, either the Dataset class needs to be modified or the transform function needs to be rewritten:
+
+```python
+class MyCIFAR(CIFAR10):
+ ...
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        label = self.targets[index]
+        return self.transform1(sample), self.transform2(sample), label
+
+
+# or
+
+def two_transform(sample):
+ ...
+ return transform1(sample), transform2(sample)
+```
+
+When several datasets need such changes, this rewriting approach becomes time-consuming, especially while the output format is still unsettled and likely to change frequently.
+
+To solve this, `lumo` offers a universal, streaming solution through `DatasetBuilder`. You only need to prepare the raw data in the standard format, along with standard one-to-one augmentation functions:
+
+```python
+...
+
+source = CIFAR10()
+transform1 = ...
+transform2 = ...
+```
+
+Then, any output format can be defined through `DatasetBuilder`:
+
+```python
+from lumo import DatasetBuilder
+
+ds = (
+ DatasetBuilder()
+ # Define input stream
+ .add_input('xs', source.data)
+ .add_input('ys', source.targets)
+ # Define output stream
+    .add_output('xs', 'xs1', transform1)
+    .add_output('xs', 'xs2', transform2)
+    .add_output('ys', 'ys')
+)
+
+print(ds[0])
+>>> {'xs1': ..., 'xs2': ..., "ys": ...}
+```
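+
+Since the built dataset is indexable like a standard `torch.utils.data.Dataset` (as `ds[0]` above suggests), it should work directly with a regular `DataLoader`; a minimal sketch:
+
+```python
+from torch.utils.data import DataLoader
+
+loader = DataLoader(ds, batch_size=128, shuffle=True)
+for batch in loader:
+    xs1, xs2, ys = batch['xs1'], batch['xs2'], batch['ys']
+    break
+```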
diff --git a/docs/source/tutorial/getting_start.md b/docs/source/tutorial/getting_start.md
deleted file mode 100644
index 83799930..00000000
--- a/docs/source/tutorial/getting_start.md
+++ /dev/null
@@ -1 +0,0 @@
-# Getting Start
\ No newline at end of file
diff --git a/docs/source/tutorial/images/2023-03-24-15-57-11.png b/docs/source/tutorial/images/2023-03-24-15-57-11.png
new file mode 100644
index 00000000..3a74bf15
Binary files /dev/null and b/docs/source/tutorial/images/2023-03-24-15-57-11.png differ
diff --git a/docs/source/tutorial/reproducibility.md b/docs/source/tutorial/reproducibility.md
new file mode 100644
index 00000000..075b966c
--- /dev/null
+++ b/docs/source/tutorial/reproducibility.md
@@ -0,0 +1,202 @@
+# Experiment Management
+
+In `lumo`, the `Experiment` class provides strong guarantees of experiment reproducibility. Specifically, `Experiment` approaches reproducibility from four angles: path management, version control, parameter recording, and backup. It also lowers the barrier to entry through visual panels, a command-line interface, and other tools.
+
+## Path Management
+
+To ensure that paths never collide, `Experiment` assigns a unique experiment ID (`test_name`) to each run. It also provides three types of storage paths, for information (`info_dir`), binary files (`blob_dir`), and temporary files (`cache_dir`), laid out as follows:
+
+
+```
+- {info_root}
+  - {exp_name}
+    - {test_name}    # info_dir
+
+- {blob_root}
+  - {exp_name}
+    - {test_name}    # blob_dir
+
+- {cache_root}
+  - {exp_name}
+    - {test_name}    # cache_dir
+```
+
+## Version Control
+
+The lifecycle of `Experiment` includes start/progress/end, and a series of `ExpHook` classes perform specific operations at each stage. Among them, `~lumo.exp.exphook.GitCommit` handles git commits: at `on_start` it checks for file changes and, if any exist, commits a snapshot of the current files to the `lumo_experiments` branch. The commit information corresponding to the current code is recorded in the `info_dir` of the `Experiment` instance and can be viewed through `exp.properties['git']`.
+
+## Information Recording
+
+Recorded information includes startup parameters such as hyperparameters and program execution arguments, runtime and post-run values such as metrics, and metadata such as execution time. Everything mentioned except the hyperparameters is recorded automatically by `Experiment` at `.start()`. The experiment's hyperparameters can be recorded with `exp.dump_info('params', params_dict)`.
+
+> When training with `lumo.Trainer`, the hyperparameters used are recorded automatically under the `params` key.
+
+
+For metrics, values can be recorded on the `Experiment` instance using `.dump_metric()` and `.dump_metrics()`, for example:
+
+
+```python
+max_acc = exp.dump_metric("acc",acc, "cls_acc", cls_acc)
+```
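+
+`.dump_metrics()` accepts a dict instead; given its signature `dump_metrics(dic: dict, cmp: str)`, a call might look like this (assuming `cmp` takes a comparison name such as `'max'`):
+
+```python
+exp.dump_metrics({'acc': acc, 'cls_acc': cls_acc}, cmp='max')
+```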
+
+Below is an example of the contents of `exp.properties`:
+
+```python
+{'agent': nan,
+ 'backup': {'23-03-17-161847': {'backend': 'github',
+ 'number': 4,
+ 'repo': 'pytorch-lumo/image-classification'}},
+ 'deprecated': nan,
+ 'exception': nan,
+ 'execute': {'cwd': '~/python/image-classification-private',
+ 'exec_argv': ['train_ssl.py',
+ 'train_ssl.py',
+ '--module=simclr',
+ '--device=2',
+ '--config=config/ssl/simclr/cifar100.yaml',
+ '--model=wrn282',
+ '--scan=ssl-2023.02.28'],
+ 'exec_bin': '~/miniconda3/bin/python3',
+ 'exec_file': 'train_ssl.py',
+ 'repo': '~/python/image-classification-private'},
+ 'exp_name': 'simclr.simclrexp',
+ 'git': {'commit': '294ccdac',
+ 'dep_hash': '404fc6044b2119d56a5e8b92ac02fc1c',
+ 'repo': '~/python/image-classification-private'},
+ 'hooks': {'Diary': {'loaded': True, 'msg': ''},
+ 'FinalReport': {'loaded': True, 'msg': ''},
+ 'GitCommit': {'loaded': True, 'msg': ''},
+ 'LastCmd': {'loaded': True, 'msg': ''},
+ 'LockFile': {'loaded': True, 'msg': ''},
+ 'RecordAbort': {'loaded': True, 'msg': ''}},
+ 'lock': {'accelerate': '0.16.0',
+ 'decorator': '5.1.1',
+ 'fire': '0.5.0',
+ 'hydra': '1.3.1',
+ 'joblib': '1.2.0',
+ 'lumo': '0.15.0',
+ 'numpy': '1.24.2',
+ 'omegaconf': '2.3.0',
+ 'psutil': '5.9.4',
+ 'torch': '1.8.1+cu101',
+ 'torch.version.cuda': '10.1'},
+ 'note': '',
+ 'params': {'apply_mixco': False,
+ 'apply_unmix': False,
+ 'config': 'config/ssl/simclr/cifar100.yaml',
+ 'dataset': 'cifar100',
+ 'detach_cls': True,
+ 'device': 2,
+ 'ema': True,
+ 'ema_alpha': 0.99,
+ 'epoch': 1000,
+ 'eval': {'batch_size': 512,
+ 'num_workers': 8,
+ 'pin_memory': True,
+ 'shuffle': True},
+ 'feature_dim': 128,
+ 'hidden_feature_size': 128,
+ 'knn': True,
+ 'knn_k': 200,
+ 'knn_t': 0.1,
+ 'linear_eval': False,
+ 'lr_decay_end': 0.0005,
+ 'method': 'simclr',
+ 'model': 'wrn282',
+ 'module': 'simclr',
+ 'more_sample': True,
+ 'n_classes': 100,
+ 'optim': {'lr': 0.06,
+ 'momentum': 0.9,
+ 'name': 'SGD',
+ 'weight_decay': 0.0005},
+ 'pretrain_path': None,
+ 'scan': 'ssl-2023.02.28',
+ 'seed': 1,
+ 'semi_eval': False,
+ 'stl10_unlabeled': True,
+ 'temperature': 0.1,
+ 'test': {'batch_size': 512,
+ 'num_workers': 8,
+ 'pin_memory': True,
+ 'shuffle': False},
+ 'train': {'batch_size': 512,
+ 'num_workers': 8,
+ 'pin_memory': True,
+ 'shuffle': True},
+ 'train_ending': 10,
+ 'train_linear': True,
+ 'train_strategy': 'ending',
+ 'warmup_epochs': 0,
+ 'warmup_from': 0.01,
+ 'with_bn': False},
+ 'pinfo': {'hash': '62ee6de98b381872e200e82901ad51f7',
+ 'obj': {'argv': ['~/miniconda3/bin/python3',
+ 'train_ssl.py',
+ 'train_ssl.py',
+ '--module=simclr',
+ '--device=2',
+ '--config=config/ssl/simclr/cifar100.yaml',
+ '--model=wrn282',
+ '--scan=ssl-2023.02.28'],
+ 'pid': 27687,
+ 'pname': 'python3',
+ 'pstart': 1678763482.5},
+ 'pid': 27687},
+ 'progress': {'finished': False,
+ 'last_edit_time': '23-03-14-212932',
+ 'ratio': 1.0,
+ 'start': '23-03-14-111124',
+ 'update_from': None},
+ 'rerun': {'from': '230313.015.99t', 'repeat': 1},
+ 'test_name': '230314.000.a3t',
+ ...
+ }
+```
+
+## Retrieve Experiment
+
+`Watcher` consolidates the information of all experiments, allowing users to search for a specific one.
+
+
+```python
+from lumo import Watcher, Experiment
+
+w = Watcher()
+df = w.load() # all experiments
+
+exp = Experiment.from_cache(df.iloc[0].to_dict())
+```
+For an experiment with a known `test_name`, the `Experiment` instance can be retrieved directly with the `retrieve` method:
+
+
+```python
+w.retrieve('230306.012.d5t')
+>>> Experiment(info_dir=".../.lumo/experiments/moco.mocoexp/230306.012.d5t")
+```
+
+## Visual Panel
+
+A fixed-style panel can never satisfy everyone's needs, so `lumo` provides dynamic panels built on pandas and panel: apart from a few fixed parts, all styling is added by the user:
+
+
+```python
+from lumo import Watcher
+w = Watcher()
+df = w.load()
+
+... filter operations ...
+
+new_df = ...
+
+w.panel(new_df)
+```
+![](images/2023-03-24-15-57-11.png)
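+
+The filtering step is ordinary pandas; for example, restricting the table to a single experiment family might look like this (the column name is an assumption based on the recorded properties shown earlier):
+
+```python
+new_df = df[df['exp_name'] == 'simclr.simclrexp']
+w.panel(new_df)
+```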
+
+## Repeating Experiments
+
+Rerunning an experiment mainly happens in two scenarios:
+
+- To verify the stability of the results, the experiment is rerun with the same parameters but a different random seed.
+- The experiment failed midway (out of memory, disk space, or similar) and needs to be rerun with similar parameters.
+
+Especially when scanning parameters, if only one or two runs fail, it is hard to recover the failed runs' arguments from the logs alone. `lumo` provides the `rerun` command: given the ID (`test_name`) of a failed experiment, obtained from the visual panel or elsewhere, the run can be repeated directly with overridden parameters, as shown below.
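+
+```bash
+lumo rerun 230315.017.bbt --device=0
+lumo rerun 230315.012.29t --seed=1
+```
+
+Besides keeping the run arguments fully controllable, `rerun` also creates a bidirectional link between the original and the new experiment, recording e.g. how many times a run has been repeated. This can be observed in the `rerun` property of both experiments:
+
+```python
+exp.properties['rerun']
+>>> {'from': '230310.002.f5t', 'repeat': 1}
+
+exp2.properties['rerun']
+>>> {'rerun_at': {'230311.004.87t': True}}
+```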
+
diff --git a/docs/source/tutorial_zh/configuration.md b/docs/source/tutorial_zh/configuration.md
new file mode 100644
index 00000000..992429b5
--- /dev/null
+++ b/docs/source/tutorial_zh/configuration.md
@@ -0,0 +1,40 @@
+
+
+# Configuration
+
+lumo 提供了多级作用域的配置,这包括:
+
+```
+~/.lumorc.json -> user-level
+/.lumorc.json -> repo-level, private
+/.lumorc.public.json -> repo-level, public
+```
+
+所有的配置会在运行时加载到 `lumo.glob` 中,用以全局设置:
+
+```
+from lumo import glob
+
+glob['xxx']
+```
+
+## 配置和超参数的区别
+
+在 lumo 中,配置大多用于和实验完全无关,但跟电脑环境和 lumo 行为有关的内容,如数据集的存放路径、GitHub 的 access token 等。 `lumo` 中所有支持的可选行为都可以通过更改 glob 的配置进行控制,以下是目前支持变更的配置项:
+
+| 配置项 | 描述 |
+| --- | --- |
+| github_access_token | 代替 exp.backup() 方法的 access_token 参数。 |
+| exp_root | 一些初始路径之一。 |
+| db_root | 一些初始路径之一。 |
+| progress_root | 一些初始路径之一。 |
+| metric_root | 一些初始路径之一。 |
+| cache_dir | 一些初始路径之一。 |
+| blob_root | 一些初始路径之一。 |
+| timezone | 决定 lumo 使用的时区,默认为 'Asia/Shanghai'。 |
+| TRAINER_LOGGER_STDIO | 控制 Logger 是否向标准输出流输出内容。 |
+| dev_branch | 版本控制时代码快照的保存分支,默认为 'lumo_experiments'。 |
+| HOOK_LOCKFILE | 控制加载 LockFile ExpHook 的行为。 |
+| HOOK_RECORDABORT | 控制加载 RecordAbort ExpHook 的行为。 |
+| HOOK_GITCOMMIT | 控制加载 GitCommit ExpHook 的行为。 |
+
diff --git a/docs/source/tutorial_zh/dataset_builder.md b/docs/source/tutorial_zh/dataset_builder.md
new file mode 100644
index 00000000..03885db3
--- /dev/null
+++ b/docs/source/tutorial_zh/dataset_builder.md
@@ -0,0 +1,54 @@
+# DatasetBuilder
+
+lumo 设计了 `~lumo.DatasetBuilder` 用于提供一个统一的构建数据集的接口。可以在大多数情况下,减少重复的数据集形态设计。
+
+以 CIFAR10 数据集为例,如果需要数据集的图片输出两次增广,那么要么要更改 Dataset 类,要么重写 transform 函数:
+
+```python
+class MyCIFAR(CIFAR10):
+ ...
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        label = self.targets[index]
+        return self.transform1(sample), self.transform2(sample), label
+
+
+# or
+
+def two_transform(sample):
+ ...
+ return transform1(sample), transform2(sample)
+```
+
+在多个数据集都面临这种更改时候,这种重写的方式会消耗较多的精力。尤其是在输出形式还不确定,可能面临频繁更改的时候。
+
+对此,lumo 通过 `DatasetBuilder` 提供了一个通用的流式的解决方案,你只需要按标准形式准备好原始数据,和标准的 one-to-one 的增广函数:
+
+```python
+...
+
+source = CIFAR10()
+transform1 = ...
+transform2 = ...
+```
+
+随后,任意的输出形式均可以通过 `DatasetBuilder` 来定义
+
+```python
+from lumo import DatasetBuilder
+
+ds = (
+ DatasetBuilder()
+ # 定义输入流
+ .add_input('xs', source.data)
+ .add_input('ys', source.targets)
+ # 定义输出流
+    .add_output('xs', 'xs1', transform1)
+    .add_output('xs', 'xs2', transform2)
+    .add_output('ys', 'ys')
+)
+
+print(ds[0])
+>>> {'xs1': ..., 'xs2': ..., "ys": ...}
+```
diff --git a/docs/source/tutorial_zh/images/2023-03-24-15-57-11.png b/docs/source/tutorial_zh/images/2023-03-24-15-57-11.png
new file mode 100644
index 00000000..3a74bf15
Binary files /dev/null and b/docs/source/tutorial_zh/images/2023-03-24-15-57-11.png differ
diff --git a/docs/source/tutorial_zh/reproducibility.md b/docs/source/tutorial_zh/reproducibility.md
new file mode 100644
index 00000000..de79ee83
--- /dev/null
+++ b/docs/source/tutorial_zh/reproducibility.md
@@ -0,0 +1,147 @@
+# Reproducibility
+
+在 lumo 中,`Experiment` 为保证实验可复现提供了足够的保障。具体的,Experiment 从路径管理、版本控制、参数记录、备份四个角度来保证。并通过可视化面板、命令行接口等简化了操作门槛。
+
+## 路径管理
+
+为了保证路径不重复,`Experiment` 会为每次实验运行分配一个唯一实验 ID (`test_name`)。同时,`Experiment` 提供三种不同类型的数据存储路径,分别用于存储信息(info_dir)、二进制文件(blob_dir)、临时文件(cache_dir),三者的路径关系如下:
+
+```
+- {info_root}
+  - {exp_name}
+    - {test_name}    # info_dir
+
+- {blob_root}
+  - {exp_name}
+    - {test_name}    # blob_dir
+
+- {cache_root}
+  - {exp_name}
+    - {test_name}    # cache_dir
+```
+
+## 版本控制
+
+`Experiment` 的生命周期包括 start/progress/end,在每个生命周期,都设置了一系列 `ExpHook` 类来执行部分操作。其中,负责 git 提交的是 `~lumo.exp.exphook.GitCommit`,会在 `on_start` 时检查文件的更改,如果存在,则向 `lumo_experiments` 分支提交一份当前的文件快照。当前代码所对应的 commit 等信息会全部记录到该 `Experiment` 实例的 `info_dir` 中。可以通过 `exp.properties['git']` 查看。
+
+## 参数记录
+
+参数记录包括超参数、程序执行参数等启动时参数,Metric 等运行时和运行后参数以及执行时间等元信息。除了超参数外,所提到的全部信息都会通过 `Experiment` 在 `.start()` 时自动记录。而实验的超参数,则可以通过`exp.dump_info('params',params_dict)` 自行记录。
+
+> 使用 lumo.Trainer 训练时,使用到的超参数会自动记录到 `params` key 中。
+
+对于 Metric,`Experiment` 实例可以通过 `.dump_metric` 和 `.dump_metrics()` 进行记录,如:
+
+```python
+max_acc = exp.dump_metric("acc",acc, "cls_acc", cls_acc)
+```
+
+## Retrieve
+
+`Watcher` 会将所有实验的信息进行整合,从而允许用户全盘检索某次实验。
+
+```python
+from lumo import Watcher, Experiment
+
+w = Watcher()
+df = w.load() # all experiments
+
+exp = Experiment.from_cache(df.iloc[0].to_dict())
+```
+
+对某个已知 `test_name` 的实验,可以通过 `retrieve` 方法直接获取 `Experiment` 实例:
+
+```python
+w.retrieve('230306.012.d5t')
+>>> Experiment(info_dir=".../.lumo/experiments/moco.mocoexp/230306.012.d5t")
+```
+
+## 可视化面板
+
+一个固定样式的面板永远不能满足所有人的需要,所以,lumo 基于 pandas 和 panel 提供了动态面板,除了固定的几个部分外,其余所有样式均由使用者自行添加:
+
+```python
+from lumo import Watcher
+w = Watcher()
+df = w.load()
+
+... filter operations ...
+
+new_df = ...
+
+w.panel(new_df)
+```
+
+![](images/2023-03-24-15-57-11.png)
+
+## 重复实验
+
+重复实验主要存在于两个场景:
+
+- 为了验证结果稳定性而使用其他随机种子重新以相同参数运行实验
+- 在实验中途因为显存、内存等原因实验出错需要以类似参数运行实验
+
+尤其是在扫参时,如果仅有一两个实验出现了问题,直接基于日志观察,很难快速的知道失败的实验的运行参数。lumo 提供了 `rerun` 这一命令行参数,对于通过可视化面板或其他方式得到的失败的实验 ID (test_name),可以直接通过如下的命令重跑,并重新指定参数:
+
+```bash
+lumo rerun 230315.017.bbt --device=0
+lumo rerun 230315.012.29t --seed=1
+```
+
+`rerun`除了保证运行参数完全可控,还会在原实验和新实验之间建立一个双向链接,用于提示实验的重复次数等,可以分别在原实验和新实验的 `rerun` 参数中观察到:
+
+```python
+exp.properties['rerun']
+>>> {'from': '230310.002.f5t', 'repeat': 1}
+
+exp2.properties['rerun']
+>>> {'rerun_at': {'230311.004.87t': True}}
+```
+
+## 备份
+
+`Experiment` 额外提供了备份方法,可以向不同的途径记录实验。
+
+### GitHub
+
+```python
+from lumo import glob
+glob['github_access_token'] = '...' # or store in .lumorc.json
+exp.backup('github', repo='pytorch-lumo/lumo')
+```
+
+或者,直接在调用时传入 access_token:
+
+```python
+exp.backup('github', repo='pytorch-lumo/lumo', access_token='...')
+```
+
+### 本地/远程备份
+
+- [ ] TODO
+
+### 代码快照
+
+- [ ] TODO
+
+# 其他
+
+## 生命周期
+
+所有经过 `Experiment` 对实验信息的改动都会自动触发一次变更记录,会在 `exp.heartbeat_fn` 位置创建一个文件。该文件会被 Watcher 在 `.load()` 的时候检测到,并增量更新该实验的内容。`Experiment` 实例在创建时,如果是通过 `.from_cache()` 创建的,也会检测该文件是否存在,如果存在,则从原目录中重新加载,而忽略 cache 的内容。
+
+```
+- progress
+ -
+ - {test-1}.heartbeat
+ - {test-1}.pid
+```
+
+## ExpHook
+
+目前支持的 ExpHook 有:
+
+- `~lumo.exp.exphook.LastCmd` : 在项目根目录(或运行目录)记录运行命令的 history。
+- `~lumo.exp.exphook.RecordAbort` : 在项目抛出异常时在 `exp.properties["exception"]` 记录异常内容。
+- `~lumo.exp.exphook.GitCommit` : 在每次实验开始时保存实验快照,并记录 commit 信息到 `exp.properties['git']`。
+- `~lumo.exp.exphook.LockFile` : 在每次实验开始时保存相关库的版本信息,记录到 `exp.properties['lock']`。
diff --git a/src/lumo/__init__.py b/src/lumo/__init__.py
index 35697cca..1d8f32dd 100644
--- a/src/lumo/__init__.py
+++ b/src/lumo/__init__.py
@@ -1,7 +1,7 @@
"""
"""
-__version__ = "1.0.0"
+__version__ = "1.1.0"
from .core import Params, ParamsType, MetricType, Meter, Record, TrainStage, BaseParams
from .proc import glob
diff --git a/src/lumo/cli/__init__.py b/src/lumo/cli/__init__.py
index e9a39df9..eeeb8781 100644
--- a/src/lumo/cli/__init__.py
+++ b/src/lumo/cli/__init__.py
@@ -56,10 +56,24 @@ def board(port=11606, address=None, open=True):
print(f"Starting server on port {port}")
+def backup_local(test_name, target_dir, with_blob=False, with_cache=False):
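+    """Back up the experiment identified by `test_name` into `target_dir` as a tar archive,
+    optionally including its blob and cache directories."""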
+ from lumo.exp.watch import Watcher
+ from lumo import Experiment
+ w = Watcher()
+ exp = w.retrieve(test_name)
+ if exp is None:
+ print(f'{test_name} not found')
+ exit(1)
+ else:
+ print(exp.backup('local', target_dir=target_dir, with_blob=with_blob, with_cache=with_cache))
+
+
def main():
"""the entry"""
fire.Fire({
'rerun': rerun,
'note': note,
'board': board,
+ 'backup_local': backup_local,
})
+ exit(0)
diff --git a/src/lumo/core/params.py b/src/lumo/core/params.py
index c3384f9a..5e0b021a 100644
--- a/src/lumo/core/params.py
+++ b/src/lumo/core/params.py
@@ -8,7 +8,7 @@
import fire
from joblib import hash
-from omegaconf import DictConfig, OmegaConf, DictKeyType
+from omegaconf import DictConfig, OmegaConf, DictKeyType, ListConfig
from omegaconf._utils import _ensure_container
# from .attr import safe_update_dict, set_item_iterative
@@ -393,11 +393,19 @@ def func(*args, **kwargs):
config = kwargs.get('config')
if config is None:
config = kwargs.get('c')
- if config is not None and isinstance(config, str) and os.path.exists(config):
- if config.endswith('yaml') or config.endswith('yml'):
- self.from_yaml(config)
- elif config.endswith('json'):
- self.from_json(config)
+
+ if config is not None:
+ if isinstance(config, str):
+ config = config.split(',')
+ if isinstance(config, (list, ListConfig)):
+ for config_fn in config:
+ if not (isinstance(config_fn, str) and os.path.exists(config_fn)):
+ continue
+ if config_fn.endswith('yaml') or config_fn.endswith('yml'):
+ self.from_yaml(config_fn)
+ elif config_fn.endswith('json'):
+ self.from_json(config_fn)
dic = BaseParams()
for k, v in kwargs.items():
diff --git a/src/lumo/exp/backup.py b/src/lumo/exp/backup.py
index 7dd4c379..c35fa6a2 100644
--- a/src/lumo/exp/backup.py
+++ b/src/lumo/exp/backup.py
@@ -1,8 +1,11 @@
-from .experiment import Experiment
import os
+import re
+import warnings
from pprint import pformat
+from pathlib import Path
from lumo.utils.fmt import strftime
-import re
+from .experiment import Experiment
+from lumo.utils.compress import compress_dpath
issue_title = """Test {test_name}"""
@@ -44,6 +47,9 @@
{properties}
+
+
+> Powered by [lumo](https://github.com/pytorch-lumo/lumo)
"""
@@ -150,38 +156,29 @@ def backup_github_issue(exp: Experiment, repo: str, access_token: str,
return issue
- # There is no way to upload files by GitHub Api
- # filter backuped file sizes
- # files = []
- # for root, dirs, fs in os.walk(exp.mk_ipath()):
- # for f in fs:
- # absf = os.path.join(root, f)
- # file_size = os.path.getsize(absf) / (1024 * 1024) # Mb
- # if file_size > size_limit:
- # continue
- # files.append(absf)
- #
- # for root, dirs, fs in os.walk(exp.mk_bpath()):
- # for f in fs:
- # absf = os.path.join(root, f)
- # file_size = os.path.getsize(absf) / (1024 * 1024) # Mb
- # if file_size > size_limit:
- # continue
- # files.append(absf)
-
- # repo.create_git_blob()
- # {files}
- # issue.create_comment()
-
def backup_ssh(exp: Experiment, host, username, root, size_limit):
"""compress backup zip files to target server with replacement"""
pass
-def backup_local(exp: Experiment, target: str):
+def backup_local(exp: Experiment, target_dir: str, with_code=False, with_blob=False, with_cache=False):
"""backup in local dist"""
- pass
+ dpath = {'info': exp.info_dir}
+ if with_blob:
+ dpath['blob'] = exp.blob_dir
+ if with_cache:
+ dpath['cache'] = exp.cache_dir
+ if with_code:
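+        # exp.archive() writes the code snapshot into exp.blob_dir by default,
+        # so the snapshot is only included in the tar when with_blob is set as well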
+ res = exp.archive()
+ if res is None:
+ warnings.warn(f'git commit is not recorded in {exp}.')
+
+ names, paths = zip(*list(dpath.items()))
+
+ target_fn = os.path.join(target_dir, f'{exp.test_name}.tar')
+
+ return compress_dpath(paths, names, target_fn, root_name=exp.test_name)
backup_regist = {
diff --git a/src/lumo/exp/experiment.py b/src/lumo/exp/experiment.py
index 0e128a97..d55f8b2c 100644
--- a/src/lumo/exp/experiment.py
+++ b/src/lumo/exp/experiment.py
@@ -23,6 +23,7 @@
from .base import BaseExpHook
from ..proc.pid import pid_hash, runtime_pid_obj
from .metric import Metric
+from lumo.utils import repository as git_repo
class Experiment:
@@ -92,7 +93,7 @@ class Experiment:
ENV_TEST_NAME_KEY = 'LUMO_EXP_TEST_NAME'
- def __init__(self, exp_name: str = None, test_name=None, paths=None, info_dir=None):
+ def __init__(self, exp_name: str = None, test_name=None, roots=None, info_dir=None, blob_dir=None, cache_dir=None):
"""
Initializes a new instance of the Experiment class.
@@ -103,7 +104,7 @@ def __init__(self, exp_name: str = None, test_name=None, paths=None, info_dir=No
Raises:
ValueError: If the experiment name is not a legal filename.
"""
- if info_dir is not None:
+ if exp_name is None and info_dir is not None and os.path.exists(info_dir):
exp = self.__class__.from_disk(info_dir=info_dir)
self._prop = exp._prop
self._hooks = exp._hooks
@@ -117,14 +118,20 @@ def __init__(self, exp_name: str = None, test_name=None, paths=None, info_dir=No
if test_name is None:
test_name = os.environ.get(Experiment.ENV_TEST_NAME_KEY, None)
self._prop['test_name'] = test_name
- if paths is None:
- paths = {}
- self._prop['paths'] = paths
+ if roots is None:
+ roots = {}
+ self._prop['paths'] = roots
self._prop['note'] = ''
self._hooks = {}
self._metric = None
+ if info_dir is not None:
+ self._prop['info_dir'] = info_dir
+ if blob_dir is not None:
+ self._prop['blob_dir'] = blob_dir
+ if cache_dir is not None:
+ self._prop['cache_dir'] = cache_dir
# wrap
self.dump_string = self._trigger_change(self.dump_string)
self.dump_note = self._trigger_change(self.dump_note)
@@ -190,7 +197,7 @@ def __repr__(self):
Returns:
str: A string representation of the Experiment object.
"""
- return f'{self.__class__.__name__}(info_dir="{self.info_dir})"'
+ return f'{self.__class__.__name__}(info_dir="{self.info_dir}")'
def __str__(self):
"""
@@ -330,7 +337,7 @@ def metric(self):
return self._metric
@property
- def paths(self) -> dict:
+ def roots(self) -> dict:
"""
Gets a dictionary containing the paths to various directories associated with the experiment.
@@ -583,15 +590,15 @@ def dump_metrics(self, dic: dict, cmp: str):
@property
def info_root(self):
- return self.paths['info_root']
+ return self.roots['info_root']
@property
def cache_root(self):
- return self.paths['cache_root']
+ return self.roots['cache_root']
@property
def blob_root(self):
- return self.paths['blob_root']
+ return self.roots['blob_root']
@property
def pid_fn(self):
@@ -613,20 +620,26 @@ def exp_dir(self):
@property
def info_dir(self):
- d = os.path.join(self.info_root, self.exp_name, self.test_name)
- os.makedirs(d, exist_ok=True)
+ d = self.properties.get('info_dir')
+ if d is None:
+ d = os.path.join(self.info_root, self.exp_name, self.test_name)
+ os.makedirs(d, exist_ok=True)
return d
@property
def cache_dir(self):
- d = os.path.join(self.cache_root, self.exp_name, self.test_name)
- os.makedirs(d, exist_ok=True)
+ d = self.properties.get('cache_dir')
+ if d is None:
+ d = os.path.join(self.cache_root, self.exp_name, self.test_name)
+ os.makedirs(d, exist_ok=True)
return d
@property
def blob_dir(self):
- d = os.path.join(self.blob_root, self.exp_name, self.test_name)
- os.makedirs(d, exist_ok=True)
+ d = self.properties.get('blob_dir')
+ if d is None:
+ d = os.path.join(self.blob_root, self.exp_name, self.test_name)
+ os.makedirs(d, exist_ok=True)
return d
def _mk_path(self, *path: str, is_dir: bool) -> str:
@@ -709,7 +722,7 @@ def rerun(self, arg_list: List[str]):
# self.properties['']
new_test_name = self._create_test_name(self.exp_dir)
new_exp = Experiment(self.exp_name, test_name=new_test_name)
- self.dump_info('deprecated', {'rerun_at': {new_exp.test_name: True}}, append=True)
+ self.dump_info('rerun', {'rerun_at': {new_exp.test_name: True}}, append=True)
old_rerun_info = self.properties.get('rerun', {})
count = 1
if isinstance(old_rerun_info, dict):
@@ -729,7 +742,7 @@ def initial(self):
"""
self.dump_info('exp_name', self.exp_name)
self.dump_info('test_name', self.test_name)
- self.dump_info('paths', self.paths)
+ self.dump_info('roots', self.roots)
self.dump_info('execute', {
'repo': self.project_root,
@@ -831,25 +844,30 @@ def exp_func():
@classmethod
def from_cache(cls, dic: dict):
"""
- Creates an Experiment object from a cached dictionary.
+ Create an Experiment object from cached Experiment data.
- The cached dictionary should have the same format as the one returned by the Experiment.to_cache() method.
+ If the disk has been modified (as detected by `~Experiment.heartbeat_fn`),
+ reload the Experiment from disk. Otherwise, create a new Experiment
+ object with the cached data.
Args:
- cls: the Experiment class.
- dic: a dictionary with the cached Experiment data.
+ cls: The Experiment class.
+ dic: A dictionary containing cached Experiment data.
Returns:
An Experiment object.
"""
paths = dic.pop('paths', {})
_ = dic.pop('metrics')
- self = cls(exp_name=dic['exp_name'], test_name=dic['test_name'], paths=paths)
+ self = cls(exp_name=dic['exp_name'], test_name=dic['test_name'], roots=paths)
+ if os.path.exists(self.heartbeat_fn):
+ return cls.from_disk(self.info_dir)
self._prop.update(dic)
+
return self
@classmethod
- def from_disk(cls, info_dir):
+ def from_disk(cls, info_dir, blob_dir=None, cache_dir=None):
"""
Creates an Experiment object from a test root directory on disk.
@@ -866,7 +884,8 @@ def from_disk(cls, info_dir):
if not is_test_root(info_dir):
raise ValueError(f'{info_dir} is not a valid test_root')
info_dir = os.path.abspath(info_dir)
- exp_dir = os.path.dirname(info_dir)
+ exp_name = io.load_json(os.path.join(info_dir, 'info', 'exp_name.json'))
+ test_name = io.load_json(os.path.join(info_dir, 'info', 'test_name.json'))
paths_fn = os.path.join(info_dir, 'info', f'paths.json')
if os.path.exists(paths_fn):
@@ -877,7 +896,9 @@ def from_disk(cls, info_dir):
else:
paths = {}
- self = cls(os.path.basename(exp_dir), test_name=os.path.basename(info_dir), paths=paths)
+    # passing exp_name prevents __init__ from re-loading the experiment via .from_disk()
+ self = cls(exp_name, test_name=test_name, roots=paths,
+ info_dir=info_dir, blob_dir=blob_dir, cache_dir=cache_dir)
# load prop
for f in os.listdir(self.mk_ipath('info', is_dir=True)):
@@ -907,17 +928,35 @@ def dict(self):
'metrics': self.metric.value,
}
+ @overload
+ def backup(self, backend: str = 'local',
+ target_dir: str = None, with_code=False, with_blob=False, with_cache=False):
+ ...
+
+ @overload
+ def backup(self, backend: str = 'github', access_token: str = None,
+ labels: list = None, update: bool = True,
+ **kwargs):
+ ...
+
def backup(self, backend: str = 'github', **kwargs):
"""
Backup this experiment to the given backend. Currently `github` and `local` are supported;
you can implement your own backend using the information provided by Experiment.
"""
from .backup import backup_regist
- if backend == 'github':
- kwargs.setdefault('access_token', glob['github_access_token'])
-
return backup_regist[backend](exp=self, **kwargs)
+ def archive(self, target_dir=None):
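+        """Export an archive of the code snapshot committed for this experiment
+        (see properties['git']) into `target_dir`, which defaults to `blob_dir`.
+        Returns None if no git commit was recorded."""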
+ if 'git' not in self.properties:
+ return None
+
+ if target_dir is None:
+ target_dir = self.blob_dir
+
+ repo = git_repo.load_repo(self.project_root)
+ return git_repo.git_archive(target_dir, repo, self.properties['git']['commit'])
+
class SimpleExperiment(Experiment):
"""
@@ -925,8 +964,8 @@ class SimpleExperiment(Experiment):
execute before and after the experiment.
"""
- def __init__(self, exp_name: str = None, test_name=None, paths=None, info_dir=None):
- super().__init__(exp_name, test_name, paths, info_dir)
+ def __init__(self, exp_name: str = None, test_name=None, roots=None, info_dir=None):
+ super().__init__(exp_name, test_name, roots, info_dir)
from . import exphook
self.set_hook(exphook.LastCmd())
self.set_hook(exphook.LockFile())
diff --git a/src/lumo/exp/watch.py b/src/lumo/exp/watch.py
index fc07ea2d..cb3fcfe9 100644
--- a/src/lumo/exp/watch.py
+++ b/src/lumo/exp/watch.py
@@ -496,8 +496,11 @@ def is_test_root(path: str) -> bool:
Returns:
True if the path is a valid test root, False otherwise.
"""
- test_name = os.path.basename(path.rstrip('/'))
- return is_test_name(test_name)
+    if os.path.exists(os.path.join(path, 'info', 'test_name.json')):
+        return True
+
+    # test_name = os.path.basename(path.rstrip('/'))
+    # return is_test_name(test_name)
+    return False
def is_test_name(test_name: str) -> bool:
diff --git a/src/lumo/analyse/__init__.py b/src/lumo/sketch/analyse/__init__.py
similarity index 100%
rename from src/lumo/analyse/__init__.py
rename to src/lumo/sketch/analyse/__init__.py
diff --git a/src/lumo/analyse/collect.py b/src/lumo/sketch/analyse/collect.py
similarity index 100%
rename from src/lumo/analyse/collect.py
rename to src/lumo/sketch/analyse/collect.py
diff --git a/src/lumo/analyse/condition.py b/src/lumo/sketch/analyse/condition.py
similarity index 100%
rename from src/lumo/analyse/condition.py
rename to src/lumo/sketch/analyse/condition.py
diff --git a/src/lumo/exp/finder.py b/src/lumo/sketch/analyse/finder.py
similarity index 99%
rename from src/lumo/exp/finder.py
rename to src/lumo/sketch/analyse/finder.py
index dec377f6..af783881 100644
--- a/src/lumo/exp/finder.py
+++ b/src/lumo/sketch/analyse/finder.py
@@ -180,7 +180,7 @@ def summary_experiment(test_name: str = None, test_root: str = None):
print('Tags:')
indent_print(pformat(exp.tags))
print('Use paths:')
- indent_print(pformat(exp.paths))
+ indent_print(pformat(exp.roots))
print('Execute:')
indent_print(' '.join(exp.exec_argv))
print('-----------------------------------')
@@ -199,7 +199,7 @@ def format_experiment(exp: Experiment) -> Dict[str, Any]:
return {
'Properties': exp.properties,
'tags': exp.tags,
- 'paths': exp.paths,
+ 'paths': exp.roots,
'exec_argv': exp.exec_argv,
}
diff --git a/src/lumo/sketch/memory_grab.py b/src/lumo/sketch/memory_grab.py
deleted file mode 100644
index 857b3e5e..00000000
--- a/src/lumo/sketch/memory_grab.py
+++ /dev/null
@@ -1,278 +0,0 @@
-"""
-A hack function to monitor GPU memory usage, then occupy its access.
-"""
-import functools
-import os
-import subprocess
-import time
-from functools import partial
-
-import torch
-
-from lumo.core.tree import tree
-from lumo.utils import re
-
-match_mem = re.compile('([0-9]+) +([0-9]+)[^|]* ([0-9]+)MiB')
-
-
-class DeviceMem:
- def __init__(self):
- self.line_mem = tree()
-
- def _parse_device_pid_mem_pair(self, lines):
- for lid, line in enumerate(lines):
- res = re.search(match_mem, line)
- if res is not None:
- _device, _pid, _mib = [int(i) for i in res.groups()]
- self.line_mem[_device][_pid] = lid
- yield _device, _pid, _mib
-
- def try_parse(self, lines, pid, device):
- """ try parse mem from cached lid directly.
- Returns:
- -1 means failed.
- others means successd and its memory.
-
- """
- lid = self.line_mem[device][pid]
- if isinstance(lid, dict):
- return -1
- elif lid > len(lines):
- return -1
- else:
- res = re.search(match_mem, lines[lid])
- if res is None:
- return -1
- else:
- _device, _pid, _mib = [int(i) for i in res.groups()]
- if _pid == pid and _device == device:
- return _mib
- else:
- return -1
-
- def re_parse(self, lines, pid, device):
- _res = self.try_parse(lines, pid, device)
- if _res != -1:
- return _res
-
- for _device, _pid, _mib in self._parse_device_pid_mem_pair(lines):
- if _pid == pid and _device == device:
- return _mib
-
- return 0
-
- def _get_nvidia_smi(self):
- proc = subprocess.Popen(['nvidia-smi'], stdout=subprocess.PIPE)
- lines = proc.stdout.readlines()
- return [i.decode() for i in lines]
-
- def _device_equal(self, da, db):
- if isinstance(da, (int, str)):
- da = torch.device(da)
- if isinstance(db, (int, str)):
- db = torch.device(db)
- return da == db
-
- def get_device_release_mem(self, device):
- """ get device memory left."""
- s_pid = os.getpid()
- total = self.get_device_mem(device)
- for _device, _pid, _mib in self._parse_device_pid_mem_pair(self._get_nvidia_smi()):
- if self._device_equal(device, _device):
- total -= _mib
-
- return total
-
- def choice_free_gpu(self, non_block=False):
- """choice a free gpu"""
- pass
-
- def get_device_mem(self, device):
- """ returns device total memory(unit: MB) """
- return torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)
-
- def get_pid_device_mem(self, pid, device):
- """
- 尽可能有效率的得到进程在某设备下占用的显存(通过命令行程序调用获取)
- :param pid:
- :param device:
- :return:
- """
- if isinstance(device, torch.device):
- device = device.index
- else:
- device = torch.device(device).index
-
- lines = self._get_nvidia_smi()
- mib = self.try_parse(lines, pid, device)
- if mib == -1:
- mib = self.re_parse(lines, pid, device)
-
- return mib
-
-
-_memer = DeviceMem()
-_pid = os.getpid()
-
-if torch.cuda.is_available():
- get_pid_device_mem = partial(_memer.get_pid_device_mem, pid=_pid, device=torch.cuda.current_device())
-
-
-class memory(object):
- r"""
- 优雅的抢卡
- Args:
- memory: 需要占用的内存,以 MB 为单位
- device: 需要占用内存的设备
- hold:
- unit:
- Example::
- >>> import lumo
- >>> with lumo.memory(5000):
- ... y = x * 2
-
- >>> @lumo.memory(1024)
- ... def doubler(x):
- ... ...
-
- >>> lumo.memory(10000).start()
- ... # do something
-
- Why use nvidia-smi to get memory useage? see:
- https://github.com/pytorch/pytorch/issues/12873
- """
-
- def __init__(self, memory, device=None, hold=False) -> None:
- super().__init__()
- if device is None:
- device = torch.cuda.current_device()
- if isinstance(device, (str, int)):
- device = torch.device(device)
-
- self.need = memory
- self.device = device
- self.hold = hold
- self.exc_time = 0
- self.acc = 5
- self.mem = []
- self.last_success = _memer.get_pid_device_mem(_pid, self.device)
-
- def copy(self, pid: int, wait: bool = True):
- self.need = _memer.get_pid_device_mem(pid, self.device)
- if wait:
- self.wait(pid)
- else:
- self.start()
-
- def wait(self, pid):
- while _memer.get_pid_device_mem(pid, self.device) > 0:
- time.sleep(0.5)
- self.start()
-
- def immediately(self, pre_init=False):
- """
- 等待,直到内存有空间后,开始申请相应显存,优雅,礼貌,推荐
- Args:
- pre_init: 是否初始化 CUDA(这将在一开始消耗一定显存),默认为 False,即不抢占任何内存,
- 直到设备释放足够空间后开始抢占。
- """
- while True:
- _left = _memer.get_device_release_mem(self.device)
- _allocated = _memer.get_pid_device_mem(_pid, self.device)
-
- if pre_init and _allocated == 0:
- self._malloc(1, init=True)
- continue
-
- _need = self.need - _allocated
- if _need < 0:
- return self.end()
-
- if _left > _need:
- if _allocated == 0:
- self._malloc(1, init=True)
- continue
-
- res = self._malloc(_need)
- time.sleep(0.5)
- if res:
- return self.end()
-
- print("need {}Mb, {}Mb allocated, "
- "waiting for {}Mb released, "
- "but only {}Mb left.".format(self.need,
- _allocated, _need,
- _left), end='\r')
-
- def _malloc(self, size, init=False):
- """ unit: mb """
- try:
- tmp = torch.rand(size, 1048576 // 4, device=self.device)
- if not init:
- self.mem.append(tmp)
- return True
- except Exception as e:
- return False
-
- def end(self):
- print()
- if self.hold:
- print('press keyboardinterrupt to end')
- try:
- while True:
- # do some Fake thing
- self.mem[-1].random_()
- time.sleep(0.1)
- except KeyboardInterrupt:
- print('continue')
-
- def start(self, immediately=True):
- if immediately:
- self.immediately()
- else:
- self.invade()
-
- def invade(self, unit=5):
- """一点一点的侵占,有多少占用多少,直到申请满为止,比较粗鲁,不友好,不推荐"""
- try:
- while self.last_success < self.need:
- res = self._malloc(unit + self.acc)
- if res:
- self.acc += unit
- self.last_success = _memer.get_pid_device_mem(_pid, self.device)
- time.sleep(0.1)
- else:
- self.exc_time += 1
- self.acc = max(self.acc - unit, 0)
- time.sleep(0.5)
- print('{}/{}Mb, try={}, pid={}'.format(self.last_success,
- self.need,
- self.exc_time,
- os.getpid()), end='\r')
- self.end()
- except KeyboardInterrupt:
- print('\nabort.')
-
- def __enter__(self):
- self.invade()
-
- def __exit__(self, *args):
- del self.mem[:]
- torch.cuda.empty_cache()
- return True
-
- def __call__(self, func):
- @functools.wraps(func)
- def decorate_no_grad(*args, **kwargs):
- with self:
- return func(*args, **kwargs)
-
- return decorate_no_grad
-
- @staticmethod
- def hold_current():
- count = torch.cuda.device_count()
- mems = [_memer.get_pid_device_mem(_pid, i) for i in range(count)]
- for i, mem in enumerate(mems):
- if mem > 0:
- memory(mem, device=i, hold=(i == count - 1)).start()
diff --git a/src/lumo/sketch/vis/__main__.py b/src/lumo/sketch/vis/__main__.py
index 526c02ba..152a2f97 100644
--- a/src/lumo/sketch/vis/__main__.py
+++ b/src/lumo/sketch/vis/__main__.py
@@ -18,44 +18,45 @@ class Main():
def __init__(self):
test_names = self.select_head()
for test_name in test_names:
- test_root = finder.retrieval_test_root(test_name)
- if test_root:
- # with st.expander(test_root):
- self.make_test(test_root)
- # st.write(test_root)
- # st.write(finder.format_experiment(Experiment.from_disk(test_root)))
- else:
-
- st.error(f'"{test_name}" is not a valid test name/ test root!')
-
- def make_test(self, test_root: str):
- exp = Experiment.from_disk(test_root)
- with st.expander("Experiment Info"):
- st.write(finder.format_experiment(exp))
- # with st.expander("Visualize Metrics"):
- if exp.has_prop('tensorboard_args'):
- tb = exp.properties.get('tensorboard_args')
- metrics = parser.parse_fron_tensorboard(tb['log_dir'])
- elif exp.has_prop('logger_args'):
- tb = exp.properties.get('logger_args')
- metrics = parser.parse_from_log(tb['log_dir'])
- else:
- metrics = {}
- metrics = list(metrics.items())
- for i in range(0, len(metrics), 2):
- l, m = st.columns(2)
- k, v = metrics[i]
- l.write(k)
- l.line_chart(np.array([vv.value for vv in v]))
- if i + 1 >= len(metrics):
- break
- k, v = metrics[i + 1]
- m.write(k)
- m.line_chart(np.array([vv.value for vv in v]))
- # if i + 2 >= len(metrics):
- # break
- # k, v = metrics[i + 2]
- # r.line_chart({'k': np.array([vv.value for vv in v])})
+ pass
+ # test_root = finder.retrieval_test_root(test_name)
+ # if test_root:
+ # # with st.expander(test_root):
+ # self.make_test(test_root)
+ # # st.write(test_root)
+ # # st.write(finder.format_experiment(Experiment.from_disk(test_root)))
+ # else:
+ #
+ # st.error(f'"{test_name}" is not a valid test name/ test root!')
+
+ # def make_test(self, test_root: str):
+ # exp = Experiment.from_disk(test_root)
+ # with st.expander("Experiment Info"):
+ # st.write(finder.format_experiment(exp))
+ # # with st.expander("Visualize Metrics"):
+ # if exp.has_prop('tensorboard_args'):
+ # tb = exp.properties.get('tensorboard_args')
+ # metrics = parser.parse_fron_tensorboard(tb['log_dir'])
+ # elif exp.has_prop('logger_args'):
+ # tb = exp.properties.get('logger_args')
+ # metrics = parser.parse_from_log(tb['log_dir'])
+ # else:
+ # metrics = {}
+ # metrics = list(metrics.items())
+ # for i in range(0, len(metrics), 2):
+ # l, m = st.columns(2)
+ # k, v = metrics[i]
+ # l.write(k)
+ # l.line_chart(np.array([vv.value for vv in v]))
+ # if i + 1 >= len(metrics):
+ # break
+ # k, v = metrics[i + 1]
+ # m.write(k)
+ # m.line_chart(np.array([vv.value for vv in v]))
+ # if i + 2 >= len(metrics):
+ # break
+ # k, v = metrics[i + 2]
+ # r.line_chart({'k': np.array([vv.value for vv in v])})
def select_head(self):
left, right = st.columns([1, 3])
diff --git a/src/lumo/trainer/accelerator.py b/src/lumo/trainer/accelerator.py
index 72de3996..90ec7ddc 100644
--- a/src/lumo/trainer/accelerator.py
+++ b/src/lumo/trainer/accelerator.py
@@ -1,293 +1,13 @@
-import warnings
-from torch import nn
-import torch
-from torch import distributed
-from torch.utils.data import DataLoader
-from lumo.data.loader import DataLoaderSide
-from lumo.proc.dist import gather
-
-
-class Accelerator:
- """
- A class to define the interface for various types of accelerator.
-
- Attributes:
- _prop (dict): A dictionary of keyword arguments.
-
- Methods:
- device: A property method to get the device.
- set_device: A method to set the device.
- prepare_data_loader: A method to prepare the data loader.
- prepare_model: A method to prepare the model.
- prepare_optimizer: A method to prepare the optimizer.
- unwrap_model: A method to unwrap the model.
- prepare: A method to prepare the inputs for training.
- wait_for_everyone: A method to wait for all processes to synchronize.
- gather: A method to gather the tensor data.
- backward: A method to compute the gradients using backpropagation.
- """
-
- def __init__(self, **kwargs):
- """
- Initialize the class with a dictionary of keyword arguments.
- """
- self._prop = kwargs
-
- @property
- def device(self) -> torch.device:
- """
- Get the device.
- """
- return self._prop.get('device', None)
-
- def set_device(self, device: torch.device):
- """
- Set the device.
-
- Args:
- device (torch.device): The device to be set.
- """
- assert isinstance(device, torch.device)
- self._prop['device'] = device
-
- def prepare_data_loader(self, dataloader):
- """
- Prepare the data loader.
-
- Args:
- dataloader: The data loader.
-
- Returns:
- The prepared data loader.
- """
- return dataloader
-
- def prepare_model(self, model: torch.nn.Module):
- """
- Prepare the model.
-
- Args:
- model (torch.nn.Module): The model.
-
- Returns:
- The prepared model.
- """
- return model.to(self.device)
-
- def prepare_optimizer(self, optimizer: torch.optim.Optimizer):
- """
- Prepare the optimizer.
-
- Args:
- optimizer (torch.optim.Optimizer): The optimizer.
-
- Returns:
- The prepared optimizer.
- """
- return optimizer
-
- def unwrap_model(self, model):
- """
- Unwrap the model.
-
- Args:
- model: The model.
-
- Returns:
- The unwrapped model.
- """
- return model
-
- def prepare(self, *args):
- """
- Prepare the inputs for training.
-
- Args:
- *args: The inputs.
-
- Returns:
- The prepared inputs.
- """
- res = []
- for item in args:
- if isinstance(item, nn.Module):
- res.append(self.prepare_model(item))
- elif isinstance(item, (DataLoader, DataLoaderSide)):
- res.append(self.prepare_data_loader(item))
- elif isinstance(item, torch.optim.Optimizer):
- res.append(self.prepare_optimizer(item))
- else:
- raise NotImplementedError()
- return res
-
- def wait_for_everyone(self):
- """
- Wait for all processes to synchronize.
- """
- torch.distributed.barrier()
-
- def gather(self, tensor: torch.Tensor):
- """
- Gather the tensor data.
-
- Args:
- tensor (torch.Tensor): The tensor to be gathered.
-
- Returns:
- The gathered tensor data.
- """
- return gather(tensor)
-
- def backward(self, loss: torch.Tensor, **kwargs):
- """
- Compute the gradients using backpropagation.
-
- Args:
- loss (torch.Tensor): The loss tensor.
- **kwargs: The additional keyword arguments.
- """
- loss.backward(**kwargs)
-
-class HugAccelerator(Accelerator):
- """
- A class to define the interface for Hugging Face accelerator.
-
- Methods:
- set_device: A method to set the device.
- prepare_data_loader: A method to prepare the data loader.
- prepare_model: A method to prepare the model.
- prepare_optimizer: A method to prepare the optimizer.
- unwrap_model: A method to unwrap the model.
- prepare: A method to prepare the inputs for training.
- wait_for_everyone: A method to wait for all processes to synchronize.
- gather: A method to gather the tensor data.
- backward: A method to compute the gradients using backpropagation.
- """
-
- def __init__(self, **kwargs):
- """
- Initialize the class with a dictionary of keyword arguments.
- """
- super().__init__(**kwargs)
- from .backend.accelerator import Accelerator
- self._backbone = Accelerator()
-
- @property
- def device(self):
- """
- Get the device.
- """
- return self._backbone.device
-
- def set_device(self, device: torch.device):
- """
- Set the device.
-
- Args:
- device (torch.device): The device to be set.
- """
- assert isinstance(device, torch.device)
- self._backbone.state.device = device
-
- def prepare_data_loader(self, loader):
- """
- Prepare the data loader.
-
- Args:
- loader: The data loader.
-
- Returns:
- The prepared data loader.
- """
- from accelerate.data_loader import DataLoaderShard, DataLoaderDispatcher
- if isinstance(loader, (DataLoaderShard, DataLoaderDispatcher)):
- warnings.warn('Duplicated prepare a same DataLoader twice, check your code.')
- return loader
- return self._backbone.prepare_data_loader(loader)
-
- def prepare_model(self, model):
- """
- Prepare the model.
-
- Args:
- model: The model.
-
- Returns:
- The prepared model.
- """
- return self._backbone.prepare_model(model)
-
- def prepare_optimizer(self, optimizer):
- """
- Prepare the optimizer.
-
- Args:
- optimizer: The optimizer.
-
- Returns:
- The prepared optimizer.
- """
- return self._backbone.prepare_optimizer(optimizer)
-
- def unwrap_model(self, model):
- """
- Unwrap the model.
-
- Args:
- model: The model.
-
- Returns:
- The unwrapped model.
- """
- return self._backbone.unwrap_model(model)
-
- def prepare(self, *args):
- """
- Prepare the inputs for training.
-
- Args:
- *args: The inputs.
-
- Returns:
- The prepared inputs.
- """
- return self._backbone.prepare(*args)
-
- def wait_for_everyone(self):
- """
- Wait for all processes to synchronize.
- """
- self._backbone.wait_for_everyone()
-
- def gather(self, tensor):
- """
- Gather the tensor data.
-
- Args:
- tensor: The tensor to be gathered.
-
- Returns:
- The gathered tensor data.
- """
- return self._backbone.gather(tensor)
-
- def backward(self, loss: torch.Tensor, **kwargs):
- """
- Compute the gradients using backpropagation.
-
- Args:
- loss (torch.Tensor): The loss tensor.
- **kwargs: The additional keyword arguments.
- """
- self._backbone.backward(loss, **kwargs)
-
+from lumo.trainer.backend.accelerator import HugAccelerator
+from lumo.trainer.backend.base import Accelerator
+from lumo.trainer.backend.horovod_accelerator import Horovod
+from lumo.trainer.backend.original import TorchDist
register = {
-
'none': Accelerator,
+ 'original': TorchDist,
'accelerator': HugAccelerator,
- 'deepspeed': None,
- 'horovod': None,
+ 'horovod': Horovod,
}
diff --git a/src/lumo/trainer/backend/accelerator.py b/src/lumo/trainer/backend/accelerator.py
index 5c64a757..ffa376bc 100644
--- a/src/lumo/trainer/backend/accelerator.py
+++ b/src/lumo/trainer/backend/accelerator.py
@@ -1,5 +1,9 @@
+import warnings
+
+import torch
from accelerate.accelerator import Accelerator as _Accelerator
from accelerate.data_loader import prepare_data_loader
+from lumo.trainer.backend.base import Accelerator as Base
class Accelerator(_Accelerator):
@@ -31,3 +35,136 @@ def prepare_data_loader(self, data_loader, **kwargs):
rng_types=self.rng_types.copy(),
dispatch_batches=self.dispatch_batches,
)
+
+
+class HugAccelerator(Base):
+ """
+ A class to define the interface for Hugging Face accelerator.
+
+ Methods:
+ set_device: A method to set the device.
+ prepare_data_loader: A method to prepare the data loader.
+ prepare_model: A method to prepare the model.
+ prepare_optimizer: A method to prepare the optimizer.
+ unwrap_model: A method to unwrap the model.
+ prepare: A method to prepare the inputs for training.
+ wait_for_everyone: A method to wait for all processes to synchronize.
+ gather: A method to gather the tensor data.
+ backward: A method to compute the gradients using backpropagation.
+ """
+
+ def __init__(self, **kwargs):
+ """
+ Initialize the class with a dictionary of keyword arguments.
+ """
+ super().__init__(**kwargs)
+ self._backbone = Accelerator()
+
+ @property
+ def device(self):
+ """
+ Get the device.
+ """
+ return self._backbone.device
+
+ def set_device(self, device: torch.device):
+ """
+ Set the device.
+
+ Args:
+ device (torch.device): The device to be set.
+ """
+ assert isinstance(device, torch.device)
+ self._backbone.state.device = device
+
+ def prepare_data_loader(self, loader):
+ """
+ Prepare the data loader.
+
+ Args:
+ loader: The data loader.
+
+ Returns:
+ The prepared data loader.
+ """
+ from accelerate.data_loader import DataLoaderShard, DataLoaderDispatcher
+ if isinstance(loader, (DataLoaderShard, DataLoaderDispatcher)):
+ warnings.warn('Duplicated prepare a same DataLoader twice, check your code.')
+ return loader
+ return self._backbone.prepare_data_loader(loader)
+
+ def prepare_model(self, model):
+ """
+ Prepare the model.
+
+ Args:
+ model: The model.
+
+ Returns:
+ The prepared model.
+ """
+ return self._backbone.prepare_model(model)
+
+ def prepare_optimizer(self, optimizer):
+ """
+ Prepare the optimizer.
+
+ Args:
+ optimizer: The optimizer.
+
+ Returns:
+ The prepared optimizer.
+ """
+ return self._backbone.prepare_optimizer(optimizer)
+
+ def unwrap_model(self, model):
+ """
+ Unwrap the model.
+
+ Args:
+ model: The model.
+
+ Returns:
+ The unwrapped model.
+ """
+ return self._backbone.unwrap_model(model)
+
+ def prepare(self, *args):
+ """
+ Prepare the inputs for training.
+
+ Args:
+ *args: The inputs.
+
+ Returns:
+ The prepared inputs.
+ """
+ return self._backbone.prepare(*args)
+
+ def wait_for_everyone(self):
+ """
+ Wait for all processes to synchronize.
+ """
+ self._backbone.wait_for_everyone()
+
+ def gather(self, tensor):
+ """
+ Gather the tensor data.
+
+ Args:
+ tensor: The tensor to be gathered.
+
+ Returns:
+ The gathered tensor data.
+ """
+ return self._backbone.gather(tensor)
+
+ def backward(self, loss: torch.Tensor, **kwargs):
+ """
+ Compute the gradients using backpropagation.
+
+ Args:
+ loss (torch.Tensor): The loss tensor.
+ **kwargs: The additional keyword arguments.
+ """
+ self._backbone.backward(loss, **kwargs)
diff --git a/src/lumo/trainer/backend/base.py b/src/lumo/trainer/backend/base.py
new file mode 100644
index 00000000..1f870eec
--- /dev/null
+++ b/src/lumo/trainer/backend/base.py
@@ -0,0 +1,147 @@
+import torch
+from lumo import DataLoaderSide
+from lumo.proc.dist import gather
+from torch import nn
+from torch.utils.data import DataLoader
+
+
+class Accelerator:
+ """
+    A base class that defines the common interface for accelerator backends.
+
+ Attributes:
+ _prop (dict): A dictionary of keyword arguments.
+
+ Methods:
+ device: A property method to get the device.
+ set_device: A method to set the device.
+ prepare_data_loader: A method to prepare the data loader.
+ prepare_model: A method to prepare the model.
+ prepare_optimizer: A method to prepare the optimizer.
+ unwrap_model: A method to unwrap the model.
+ prepare: A method to prepare the inputs for training.
+ wait_for_everyone: A method to wait for all processes to synchronize.
+ gather: A method to gather the tensor data.
+ backward: A method to compute the gradients using backpropagation.
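+
+    Example:
+
+    ```python
+    # A minimal sketch of the no-op base backend and its prepare() dispatch.
+    acc = Accelerator(device=torch.device('cpu'))
+    model, loader = acc.prepare(nn.Linear(2, 2), DataLoader([]))
+    ```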
+ """
+
+ def __init__(self, **kwargs):
+ """
+ Initialize the class with a dictionary of keyword arguments.
+ """
+ self._prop = kwargs
+
+ @property
+ def device(self) -> torch.device:
+ """
+ Get the device.
+ """
+ return self._prop.get('device', None)
+
+ def set_device(self, device: torch.device):
+ """
+ Set the device.
+
+ Args:
+ device (torch.device): The device to be set.
+ """
+ assert isinstance(device, torch.device)
+ self._prop['device'] = device
+
+ def prepare_data_loader(self, dataloader):
+ """
+ Prepare the data loader.
+
+ Args:
+ dataloader: The data loader.
+
+ Returns:
+ The prepared data loader.
+ """
+ return dataloader
+
+ def prepare_model(self, model: torch.nn.Module):
+ """
+ Prepare the model.
+
+ Args:
+ model (torch.nn.Module): The model.
+
+ Returns:
+ The prepared model.
+ """
+ return model.to(self.device)
+
+ def prepare_optimizer(self, optimizer: torch.optim.Optimizer):
+ """
+ Prepare the optimizer.
+
+ Args:
+ optimizer (torch.optim.Optimizer): The optimizer.
+
+ Returns:
+ The prepared optimizer.
+ """
+ return optimizer
+
+ def unwrap_model(self, model):
+ """
+ Unwrap the model.
+
+ Args:
+ model: The model.
+
+ Returns:
+ The unwrapped model.
+ """
+ return model
+
+ def prepare(self, *args):
+ """
+ Prepare the inputs for training.
+
+ Args:
+ *args: The inputs.
+
+ Returns:
+ The prepared inputs.
+ """
+ res = []
+ for item in args:
+ if isinstance(item, nn.Module):
+ res.append(self.prepare_model(item))
+ elif isinstance(item, (DataLoader, DataLoaderSide)):
+ res.append(self.prepare_data_loader(item))
+ elif isinstance(item, torch.optim.Optimizer):
+ res.append(self.prepare_optimizer(item))
+ else:
+                raise NotImplementedError(f'Cannot prepare object of type {type(item)!r}.')
+ return res
+
+ def wait_for_everyone(self):
+ """
+ Wait for all processes to synchronize.
+ """
+ torch.distributed.barrier()
+
+ def gather(self, tensor: torch.Tensor):
+ """
+ Gather the tensor data.
+
+ Args:
+ tensor (torch.Tensor): The tensor to be gathered.
+
+ Returns:
+ The gathered tensor data.
+ """
+ return gather(tensor)
+
+ def backward(self, loss: torch.Tensor, **kwargs):
+ """
+ Compute the gradients using backpropagation.
+
+ Args:
+ loss (torch.Tensor): The loss tensor.
+ **kwargs: The additional keyword arguments.
+ """
+ loss.backward(**kwargs)
diff --git a/src/lumo/trainer/backend/horovod_accelerator.py b/src/lumo/trainer/backend/horovod_accelerator.py
new file mode 100644
index 00000000..6b62db1c
--- /dev/null
+++ b/src/lumo/trainer/backend/horovod_accelerator.py
@@ -0,0 +1,32 @@
+from .base import Accelerator
+
+
+class Horovod(Accelerator):
+ """
+    A stub accelerator backend for Horovod.
+
+    Construction only verifies that `horovod` is importable; all other
+    behaviour is inherited from the base `Accelerator`.
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
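+        # Importing here makes construction fail fast when horovod is not
+        # installed; full Horovod support is not implemented yet.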
+ import horovod
+
diff --git a/src/lumo/trainer/backend/original.py b/src/lumo/trainer/backend/original.py
new file mode 100644
index 00000000..8cb0adc7
--- /dev/null
+++ b/src/lumo/trainer/backend/original.py
@@ -0,0 +1,27 @@
+from .base import Accelerator
+
+
+class TorchDist(Accelerator):
+ """
+    A stub accelerator backend for native `torch.distributed` training.
+
+    All behaviour is currently inherited from the base `Accelerator`.
+ """
diff --git a/src/lumo/trainer/saver.py b/src/lumo/trainer/saver.py
index 1088e03c..1f6c4df8 100644
--- a/src/lumo/trainer/saver.py
+++ b/src/lumo/trainer/saver.py
@@ -27,14 +27,16 @@ class state_dict_tuple:
IndexError: If the index is not 0 or 1.
Examples:
- # Create an instance of state_dict_tuple with state_dict and meta_info
- >>> sd = state_dict_tuple({'a': 1, 'b': 2}, 'meta')
-
- # Access the state_dict and meta_info using the [] operator
- >>> sd[0]
- {'a': 1, 'b': 2}
- >>> sd[1]
- 'meta'
+ ```python
+ # Create an instance of state_dict_tuple with state_dict and meta_info
+ >>> sd = state_dict_tuple({'a': 1, 'b': 2}, 'meta')
+
+ # Access the state_dict and meta_info using the [] operator
+ >>> sd[0]
+ {'a': 1, 'b': 2}
+ >>> sd[1]
+ 'meta'
+ ```
"""
def __init__(self, state_dict=None, meta_info=None):
@@ -75,8 +77,7 @@ class Saver:
"""
Write state_dict into test dirs, record save log into /.lumo/save..log
- format:
- ->
+ format: ->
Raises:
-        When save/load operations happend, you may meet Out Of Space, FileNotExist, or other problems,
+        When save/load operations happen, you may encounter Out-Of-Space, FileNotFound, or other problems,
diff --git a/src/lumo/trainer/trainer.py b/src/lumo/trainer/trainer.py
index c8307ff1..0196ab03 100644
--- a/src/lumo/trainer/trainer.py
+++ b/src/lumo/trainer/trainer.py
@@ -20,6 +20,7 @@
from lumo.utils import safe_io as IO
from lumo.trainer.rnd import RndManager
from lumo.utils.logger import Logger
+from .backend.base import Accelerator
from .base import _BaseTrainer
from .components import TrainerExperiment, TrainerParams
from .saver import Saver
@@ -89,12 +90,16 @@ def __init__(self, params: ParamsType, dm: DataModule = None, accelerator=None):
self.train_toggle = False
device = params.get('device', None) if not self.is_dist else None
- # self.accelerate = Accelerator(kwargs_handlers=[
- # DistributedDataParallelKwargs(find_unused_parameters=params.get('find_unused_parameters', False))
- # ])
- accelerator = glob.get('accelerator', 'accelerator')
- self.accelerate = get_accelerator(accelerator)
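+        # `accelerator` may be passed as a backend name (resolved through
+        # `get_accelerator`, defaulting to the `accelerator` glob key) or as
+        # an already-constructed `Accelerator` instance.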
+ if isinstance(accelerator, str) or accelerator is None:
+ accelerator = glob.get('accelerator', 'accelerator')
+ accelerate = get_accelerator(accelerator)
+ elif isinstance(accelerator, Accelerator):
+ accelerate = accelerator
+ else:
+            raise NotImplementedError(f'Unsupported accelerator type: {type(accelerator)!r}.')
+
+ self.accelerate = accelerate
self.accelerate.set_device(torch.device(device))
if dist.is_main():
diff --git a/src/lumo/utils/compress.py b/src/lumo/utils/compress.py
new file mode 100644
index 00000000..ff0c6aea
--- /dev/null
+++ b/src/lumo/utils/compress.py
@@ -0,0 +1,43 @@
+import os
+from typing import List
+import tarfile
+
+
+def get_path_suffix(path1, path2):
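+    """Return `path1` relative to `path2`, with its file extension stripped."""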
+ relpath = os.path.relpath(path1, path2)
+ return os.path.splitext(relpath)[0]
+
+
+def compress_dpath(paths: List[str], names: List[str], target, root_name=None):
+ """
+ Compress multiple directories into a single tar archive.
+
+ Args:
+ paths (List[str]): List of paths to the directories to be compressed.
+ names (List[str]): List of names to be used for each directory in the tar archive.
+        target (str): Path to the tar archive file to create.
+        root_name (str, optional): When given, every entry is nested under
+            this directory inside the archive.
+
+    Returns:
+        The path to the created archive file.
+
+ Raises:
+ IOError: If there is an error opening or writing to the target file.
+ tarfile.TarError: If there is an error adding a file to the tar archive.
+
+ Example:
+ >>> compress_dpath(['A/dir', 'B/dir'], ['dir_a', 'dir_b'], 'archive.tar.gz')
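+    >>> # With root_name, entries are nested one level deeper (paths are hypothetical):
+    >>> compress_dpath(['A/dir'], ['dir_a'], 'archive.tar.gz', root_name='backup')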
+
+ """
+    assert target.endswith(('.tar', '.tar.gz'))  # the archive is written gzip-compressed
+
+ if isinstance(root_name, str):
+ names = [os.path.join(root_name, i) for i in names]
+
+ with tarfile.open(target, "w:gz") as tar:
+ for path, name in zip(paths, names):
+
+ for root, dirs, files in os.walk(path):
+ for file in files:
+ file_path = os.path.join(root, file)
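+                    # Store the file under <name>/<its path relative to `path`>.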
+ tar.add(file_path, arcname=os.path.join(name, get_path_suffix(root, path), file))
+ return target
diff --git a/src/lumo/utils/device.py b/src/lumo/utils/device.py
index ef6ee945..65395685 100644
--- a/src/lumo/utils/device.py
+++ b/src/lumo/utils/device.py
@@ -37,9 +37,9 @@ def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_oth
The data on which to apply `func`
*args:
Positional arguments that will be passed to `func` when applied on the unpacked data.
- main_type (`type`, *optional*, defaults to `torch.Tensor`):
+ main_type (`type`, optional, defaults to `torch.Tensor`):
The base type of the objects to which apply `func`.
- error_on_other_type (`bool`, *optional*, defaults to `False`):
+ error_on_other_type (`bool`, optional, defaults to `False`):
-        Whether to return an error or not if after unpacking `data`, we get on an object that is not of type
-        `main_type`. If `False`, the function will leave objects of types different than `main_type` unchanged.
+        Whether to raise an error if, after unpacking `data`, we encounter an object that is not of type
+        `main_type`. If `False`, the function will leave objects of types other than `main_type` unchanged.
**kwargs:
diff --git a/src/lumo/utils/memory_grab.py b/src/lumo/utils/memory_grab.py
index a49c31d7..b96ddbc2 100644
--- a/src/lumo/utils/memory_grab.py
+++ b/src/lumo/utils/memory_grab.py
@@ -184,23 +184,26 @@ class memory(object):
A graceful memory allocator that optimizes GPU memory allocation by incrementally increasing the memory
footprint to minimize fragmentation.
+ ```python
+ import lumo
+ with lumo.memory(5000):
+ y = x * 2
+
+ @lumo.memory(1024)
+ def doubler(x):
+ ...
+
+ lumo.memory(10000).start()
+ ...
+ ```
+
Args:
memory (int): Memory size to be allocated in MB.
device (str or int, optional): Device to allocate memory on. Defaults to the current CUDA device.
hold (bool, optional): Whether to hold the memory after allocation. Defaults to False.
invade (bool, optional): Whether to use aggressive memory allocation. Defaults to False.
- Examples:
- >>> import lumo
- >>> with lumo.memory(5000):
- ... y = x * 2
-
- >>> @lumo.memory(1024)
- ... def doubler(x):
- ... ...
- >>> lumo.memory(10000).start()
- ... # do something
References:
To get GPU memory usage, we use nvidia-smi. Refer to this link for details:
diff --git a/src/lumo/utils/repository.py b/src/lumo/utils/repository.py
index fce12f8b..6b45dca6 100644
--- a/src/lumo/utils/repository.py
+++ b/src/lumo/utils/repository.py
@@ -185,14 +185,13 @@ def git_commit(repo=None, key=None, branch_name=None, info: str = None, filter_f
return commit_
-def git_archive(repo=None, commit_hex=None, commit: Commit = None):
+def git_archive(target_path, repo=None, commit_hex=None, commit: Commit = None):
"""
git archive -o
Returns:
An Experiment represents this archive operation
"""
- from lumo.exp import Experiment
if repo is None:
repo = load_repo()
@@ -201,18 +200,14 @@ def git_archive(repo=None, commit_hex=None, commit: Commit = None):
old_path = os.getcwd()
os.chdir(commit.tree.abspath)
- exp = Experiment('GitArchive')
- fn = exp.mk_bpath(f'{commit.hexsha[:8]}.tar')
- exp.dump_info('git_archive', {'file': fn,
- 'test_name': exp.test_name,
- 'commit_hex': commit.hexsha[:8]})
- exp.dump_string('archive_fn', fn)
+ fn = os.path.join(target_path, f'{commit.hexsha[:8]}.tar')
+
with open(fn, 'wb') as w:
repo.archive(w, commit.hexsha)
os.chdir(old_path)
- return exp
+ return fn
def git_checkout(repo=None, commit_hex=None, commit: Commit = None):
diff --git a/tests/core/test_params.py b/tests/core/test_params.py
index fa449f74..c264f7c9 100644
--- a/tests/core/test_params.py
+++ b/tests/core/test_params.py
@@ -76,12 +76,29 @@ def test_json():
def test_yaml():
res = get_res()
fn = tempfile.mktemp('.yaml')
+ res.yaml = 3
res.a = 3
res.to_yaml(fn)
res.from_args([f'--config={fn}', '--a=2'])
assert res.a == 2
res.from_args([f'--config={fn}'])
- assert res.a == 3
+ assert res.yaml == 3
+
+
+def test_multiple_config():
+ json_fn = tempfile.mktemp(".json")
+ with open(json_fn, 'w') as w:
+ json.dump({'json': {'a': 2}}, w)
+
+ yaml_pm = BaseParams()
+ yaml_fn = tempfile.mktemp('.yaml')
+ yaml_pm.yaml = 3
+ yaml_pm.to_yaml(yaml_fn)
+
+ res = MyParams()
+ res.from_args([f'--config={json_fn},{yaml_fn}'])
+ assert res.yaml == 3
+ assert res.json["a"] == 2
def test_copy():
diff --git a/tests/exp/test_exp.py b/tests/exp/test_exp.py
new file mode 100644
index 00000000..1aa3fd2d
--- /dev/null
+++ b/tests/exp/test_exp.py
@@ -0,0 +1,50 @@
+import os
+
+from lumo import Experiment, Params
+import tarfile
+from lumo.utils import safe_io as IO
+from lumo.proc.config import debug_mode
+import tempfile
+
+
+def get_experiment():
+ debug_mode()
+ exp = Experiment('test_exp')
+ exp.start()
+ exp.dump_info('temp_string', 'string')
+ exp.dump_info('temp_number', 123)
+ exp.dump_info('temp_list', [1, 2, 3])
+ exp.dump_info('temp_dict', {"a": [1, 2, 3]})
+
+ pfn = exp.mk_ipath('params.json')
+ Params.init_from_kwargs(a=1, b=2, c=3).to_json(pfn)
+ IO.dump_pkl({'a': 1}, exp.mk_bpath('bin.pkl'))
+ IO.dump_pkl({'a': 1}, exp.mk_cpath('bin.pkl'))
+
+ exp.end()
+
+ return exp
+
+
+def test_backup():
+ exp = get_experiment()
+ target_dir = tempfile.mkdtemp()
+
+ fpath = exp.backup('local', target_dir=target_dir, with_blob=True, with_cache=True)
+ file = tarfile.open(fpath, mode='r')
+ de_dir = tempfile.mkdtemp()
+ os.makedirs(de_dir, exist_ok=True)
+ file.extractall(de_dir)
+ bexp = Experiment.from_disk(
+ os.path.join(de_dir, exp.test_name, 'info'),
+ blob_dir=os.path.join(de_dir, exp.test_name, 'blob'),
+ cache_dir=os.path.join(de_dir, exp.test_name, 'cache'),
+ )
+
+ assert bexp.properties['temp_string'] == 'string'
+ assert bexp.properties['temp_number'] == 123
+ assert bexp.properties['temp_list'] == [1, 2, 3]
+ assert bexp.properties['temp_dict'] == {"a": [1, 2, 3]}
+
+ assert IO.load_pkl(bexp.mk_bpath('bin.pkl'))['a'] == 1
+ assert IO.load_pkl(bexp.mk_cpath('bin.pkl'))['a'] == 1
diff --git a/tests/utils/test_repository.py b/tests/utils/test_repository.py
index 6956bda0..742dd210 100644
--- a/tests/utils/test_repository.py
+++ b/tests/utils/test_repository.py
@@ -1,10 +1,11 @@
-import tempfile
import os
+import random
+import tempfile
+
import git
from lumo.proc.config import debug_mode
from lumo.utils import repository
-import random
def write(fn):
@@ -62,8 +63,7 @@ def test_git():
import tarfile
- exp = repository.git_archive(repo, b_hash)
- archived_fn = exp.load_string('archive_fn')
+ archived_fn = repository.git_archive(tempfile.mkdtemp(), repo, b_hash)
file = tarfile.open(archived_fn, mode='r')
assert file.extractfile('a.txt').read().decode() == b_str
assert file.extractfile('init.txt').read().decode() == f_str