diff --git a/DQN_README.md b/DQN_README.md new file mode 100644 index 0000000..a76bdf2 --- /dev/null +++ b/DQN_README.md @@ -0,0 +1,159 @@ +# 目录 + +## DQN +DQN是一种基于Q-learning的强化学习算法。 +[论文](https://arxiv.org/pdf/1312.5602.pdf):Playing Atari with Deep Reinforcement Learning, Mnih et al, 2013. +这是基于XingTian(刑天)框架以及Mindspore(昇思)框架实现的DQN算法 + +刑天 (XingTian) 是一个组件化强化学习库,用于开发、验证强化学习算法。它目前已支持包括DQN、DDPG、PPO和IMPALA等系列算法,可以在多种环境中训练智能体,如Gym、Atari、Torcs、StarCraftII等。 为了满足用户快速验证和解决RL问题的需求,刑天抽象出了四个模块:`Algorithm`,`Model`,`Agent`,`Environment`。它们的工作方式类似于"乐高"积木的组合。 +## 模型架构 + +DQN模型基于深度神经网络,将深度神经网络与Q-learning相结合,网络结构为三个卷积层和两个全连接层。使用DQN算法根据来自Replay Buffer的一批数据训练模型。 它首先使用参与者和目标参与者模型预测当前状态和下一个状态的 Q 值。 然后计算 TD 误差并使用预测 Q 值和目标 Q 值更新参与者模型。同时接收数据并将每个Item作为状态、动作、奖励、下一状态和完成的元组添加到Replay Buffer。代理在网络学习的驱动下进行探索,对环境的反馈进行处理做出Action和状态更新,触发训练过程。 + +![DQN网络架构](./figs/DQN.png) + +更多细节可以参考[原论文](https://arxiv.org/pdf/1312.5602.pdf)。 +## 数据集 + +DQN作为一种强化学习算法,模型通过和环境交互收集样本更新参数权重。我们在gym上训练智能体。Gym是一个最广泛使用的强化学习实验环境,内置上百种实验环境,比如一些简单几何体的运动,一些用文本表示的简单游戏,或者机械臂的抓取和控制等实验环境,我们用dqn训练了几种atari游戏,包括beamrider,breakout,qbert,spaceinvader。通过智能体与环境交互获得状态动作、奖励、下一状态,并将这些值代入神经网络训练,以获得理想结果。 + +## 环境要求 + +* 硬件(GPU or NPU) + * 使用GPU处理器或者NPU处理器来搭建硬件环境。 +* 框架 + * MindSpore(2.0.0),参考MindSpore-2.0.0版本的[安装教程](https://mindspore.cn/install) + * XingTian(0.3.0),参考XingTian的[安装教程](https://github.com/huawei-noah/xingtian) +* 其他第三方库参考`requirements.txt` +## 快速入门 + +完成框架安装和环境配之后,你可以按照如下步骤进行模型的训练和评估 + +``` +cd xingtian +# 训练 +xt_main -f examples/_dqn_ms.yaml -t train + +# 训练和评估 +xt_main -f examples/_dqn_ms.yaml -t train_with_evaluate +``` + +## 脚本说明 + +### 脚本和样例代码 + +```bash +. +xt/model/dqn +├── __init__.py +├── default_config.py # 参数配置 +├── dqn_cnn_pong_ms.py +├── dqn_cnn_pong.py +├── dqn_cnn.py +├── dqn_cnn_zeus.py +├── dqn_zeus.py +├── dqn_mlp.py +├── dqn_mlp_zeus.py +├── dqn_rec_model.py +├── dqn_mlp_ms.py # Mindspore实现采用多层感知机的DQN算法 +└── dqn_cnn_ms.py # Mindspore实现采用卷积神经网络的DQN算法 +``` + +### 脚本参数 + +下面是一个有关 DQN算法的参数示例,我们通过配置系统中已注册的算法,环境信息来组合训练任务。有关不同参数更详细的描述可以在[用户指导](./docs/user.cn.md) 中找到。基于Mindspore实现的DQN算法训练配置脚本位于```examples/_dqn_ms.yaml```中 + + + +```yaml +alg_para: + alg_name: DQN # 算法配置 + alg_config: { + 'train_per_checkpoint': 50, + 'prepare_times_per_train': 4, + 'learning_starts': 10000, + 'BUFFER_SIZE': 400000, + } + +env_para: + env_name: AtariEnv # 环境配置 + env_info: { 'name': SpaceInvadersNoFrameskip-v4, 'vision': False} + +agent_para: # agent配置 + agent_name: AtariDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000, + 'complete_step': 10000000, + 'episode_count': 200000 + } + +model_para: # 模型参数 + actor: + model_name: DqnCnnMS + state_dim: [84,84,4] + action_dim: 6 + model_config: { + 'LR': 0.00015, + } + +env_num: 2 # 启动的环境数量 + +``` + +另外在 [examples](./examples) 目录下,可以找到更加丰富的训练配置示例。 + + + + +## 训练过程 + +### 训练 + +通过以下命令训练 +``` +xt_main -f examples/_dqn_ms.yaml -t train > train.log 2>&1 & +``` +训练日志将会被保存到train.log中 +``` +# train.log示例 +... +INFO [My-Labtop 222.20.75.218] Apr 29 00:13:15: Task: T0 | Train_count: 249990 | Steps: 499981224 | Elapsed time: 9 days, 13 minutes, 43 seconds +mean_env_step_ms: 1.927544 mean_explore_ms: 16594.248891 mean_inference_ms: 80.986863 mean_loop_time_ms: 3086.343014 +mean_prepare_data_ms: 40.179479 mean_restore_model_ms: 1.536433 mean_train_time_ms: 589.716685 mean_wait_model_ms: 0.024343 +mean_wait_sample_ms: 206.692171 step_per_second: 642.0 train_loss: 7.732563 train_reward_avg: 420.73 +... 
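+# (assumed field meanings, for readability) train_reward_avg: recent average episode return;
+# step_per_second: environment steps consumed per second; train_loss: current training loss.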
+``` + +### 训练和评估 +通过以下命令训练和评估 +``` +xt_main -f examples/_dqn_ms.yaml -t train_with_evaluate > train.log 2>&1 & +``` + +## 精度和性能 + +* 10M step 之后的DQN 收敛回报 (40M frames) + | env | DQN On Mindspore(NPU) | DQN On Mindspore(GPU) |DQN On Tensorflow | + | ------------- | ------------ | --------- |-------------| + | BeamRider | 11010 | 10492 | 6706 | + | Breakout | 376 | 365 | 352 | + | QBert | 13746 | 11905 | 14087 | + | SpaceInvaders | 1410 | 1270 | 947 | + + +* 吞吐量(step per second) + | env | DQN On Mindspore(NPU) | DQN On Mindspore(GPU)| DQN On Tensorflow| + | ------------- | ------------ | --------- |--------------| + | BeamRider | 124 |133 |129 | + | Breakout | 128 |131 |117 | + | QBert | 139 |138 |111 | + | SpaceInvaders | 133 |136 |115 | + +> 实验硬件环境: +> +> TF 72 Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz with single Tesla V100 +> +> MS NPU 72 Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz with single Ascend 910 +> +> MS GPU 20 Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz *2 with single RTX2080Ti \ No newline at end of file diff --git a/Muzero_README.md b/Muzero_README.md new file mode 100644 index 0000000..bfcbfce --- /dev/null +++ b/Muzero_README.md @@ -0,0 +1,144 @@ +# 目录 +## Muzero +MuZero是DeepMind于19年发布的model based算法,是AlphaZero的又一升级。DeepMind将算法推广至更大的适用范围。这篇文章的主要贡献在于不需要规则这一先验信息,并且在MCTS的搜索过程完全建立在hidden state上。最终在雅达利的各类游戏上取得了重大突破,而不只是局限于棋类游戏。 +[论文:Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model](https://arxiv.org/abs/1911.08265) + +## 模型架构 +在每一个step中,隐藏状态执行一次蒙特卡洛树搜索的到下一个动作。 +![Muzero模型架构](./figs/Muzero.png) +- prediction:预测器。神经网络f,依据的“游戏状态”是一个隐藏表征,预测给定游戏状态下的策略p和价值v。 +- dynamics:生成器,表示系统中的动态变化。 神经网络g生成隐藏表征。动力学网络获取当前隐藏状态s和动作a,产生一个即时奖励r(immediate reward)和一个新的隐藏状态s。 +- representation:表征编码器,从历史观测,转换为初始状态。神经网络h 将当前观测到的游戏状态映射到初始表征。 + + +## 数据集 +MuZero模型在XingTian框架的游戏breakout、pong的环境下进行强化学习训练。通过智能体与环境交互获得状态动作、奖励、下一状态,并将这些值代入神经网络训练,以获得理想结果。 + +## 环境要求 +* 硬件(GPU or NPU) + * 使用GPU处理器或者NPU处理器来搭建硬件环境。 +* 框架 + * MindSpore(2.0.0),参考MindSpore-2.0.0版本的[安装教程](https://mindspore.cn/install) + * XingTian(0.3.0),参考XingTian的[安装教程](https://github.com/huawei-noah/xingtian) +* 其他第三方库参考`requirements.txt` + +## 快速入门 +完成框架安装和环境配之后,你可以按照如下步骤进行模型的训练和评估 +``` +cd xingtian +# 训练 +xt_main -f examples/muzero/muzero__ms.yaml -t train + +# 训练和评估 +xt_main -f examples/muzero/muzero__ms.yaml -t train_with_evaluate +``` + +## 文件结构说明 + +### 文件目录 +``` +xingtian/xt/model/muzero +├── __init__.py +├── default_config.py +├── muzero_atari.py +├── muzero_cnn_ms.py # Mindspore实现采用卷积神经网络的Muzero算法 +├── muzero_cnn.py +├── muzero_mlp_ms.py # Mindspore实现采用多层感知机的Muzero算法 +├── muzero_mlp.py +├── muzero_model_ms.py # Mindspore实现Muzero算法基类 +├── muzero_model.py +├── muzero_utils_ms.py +└── muzero_utils.pys +``` + +### 训练配置 +下面是一个有关Muzero算法的参数示例,我们通过配置系统中已注册的算法,环境信息来组合训练任务。有关不同参数更详细的描述可以在[用户指导](./docs/user.cn.md) 中找到。基于Mindspore实现的Muzero算法训练配置脚本位于```examples/muzero/muzero_breakout_ms.yaml```中 + +```yaml +# examples/muzero/muzero__ms.yaml +alg_para: # 算法配置 + alg_name: Muzero + alg_config: { + "train_per_checkpoint": 100, + "prepare_times_per_train": 10, + 'BUFFER_SIZE': 10000, + } + +env_para: # 环境配置 + env_name: AtariEnv + env_info: { 'name': BreakoutNoFrameskip-v4, vision': False} + +agent_para: # agent配置 + agent_name: MuzeroAtari + agent_num : 1 + agent_config: { + 'max_steps': 200 , # agent最大步数 + 'complete_step': 500000000, # agent训练完成帧数 + 'NUM_SIMULATIONS': 50 # 模型参数 + } + +model_para: + actor: + model_name: MuzeroCnnMS + state_dim: [84, 84, 4] + action_dim: 4 + max_to_keep: 500 + model_config: { + 'reward_min': 0, + 'reward_max': 50, + 
'value_min': 0, + 'value_max': 500, + 'obs_type': 'uint8' + } + +env_num: 50 # 启动的环境数量 +speedup: False # 是否使用核绑定来加速 +``` +另外在 [examples](./examples) 目录下,可以找到更加丰富的训练配置示例。 + +## 训练过程 + +### 训练 + +通过以下命令训练 +``` +xt_main -f examples/muzero/muzero__ms.yaml -t train > train.log 2>&1 & +``` +训练日志将会被保存到train.log中 +``` +# train.log示例 +... +INFO [My-Labtop 222.20.75.218] Apr 29 00:13:15: Task: T0 | Train_count: 249990 | Steps: 499981224 | Elapsed time: 9 days, 13 minutes, 43 seconds +mean_env_step_ms: 1.927544 mean_explore_ms: 16594.248891 mean_inference_ms: 80.986863 mean_loop_time_ms: 3086.343014 +mean_prepare_data_ms: 40.179479 mean_restore_model_ms: 1.536433 mean_train_time_ms: 589.716685 mean_wait_model_ms: 0.024343 +mean_wait_sample_ms: 206.692171 step_per_second: 642.0 train_loss: 7.732563 train_reward_avg: 420.73 +... +``` + +### 训练和评估 +通过以下命令训练和评估 +``` +xt_main -f examples/muzero/muzero__ms.yaml -t train_with_evaluate > train.log 2>&1 & +``` + +## 性能 + +* 500M step 之后的Muzero 收敛回报 (40M frames) + | env | Muzero On Mindspore(NPU) | Muzero On Mindspore(GPU) |Muzero On Tensorflow | + | ------------- | ------------ | --------- |-------------| + | Breakout | 388 | 417 | 413 | + + + +* 吞吐量(step per second) + | env | Muzero On Mindspore(NPU) | Muzero On Mindspore(GPU)| Muzero On Tensorflow| + | ------------- | ------------ | --------- |--------------| + | Breakout | 503 |642 |892 | + +> 实验硬件环境: +> +> TF 72 Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz with single Tesla V100 +> +> MS NPU 72 Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz with single Ascend 910 +> +> MS GPU 20 Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz *2 with single RTX2080Ti diff --git a/PPO_README.md b/PPO_README.md new file mode 100644 index 0000000..04f0955 --- /dev/null +++ b/PPO_README.md @@ -0,0 +1,154 @@ +# 目录 +## PPO +近端策略优化(PPO)算法是OpenAI在2017提出的一种强化学习算法,被认为是目前强化学习领域的SOTA方法,也是适用性最广的算法之一。 +[论文](https://arxiv.org/pdf/1707.06347) + +刑天 (XingTian) 是一个组件化强化学习库,用于开发、验证强化学习算法。它目前已支持包括DQN、DDPG、PPO和IMPALA等系列算法,可以在多种环境中训练智能体,如Gym、Atari、Torcs、StarCraftII等。 为了满足用户快速验证和解决RL问题的需求,刑天抽象出了四个模块:`Algorithm`,`Model`,`Agent`,`Environment`。它们的工作方式类似于"乐高" + +## 模型架构 +PPO网络模型采用Actor-Critic结构。Actor网络用于生成策略,根据当前状态生成一个动作概率分布,然后根据这个分布选择一个动作;Critic网络用于评估策略的好坏,根据当前状态和选择的动作评估该动作的价值,并输出一个价值估计。这个价值估计可以用来计算策略的优劣,从而对策略进行更新。 +PPO算法的核心是通过优化策略来最大化累积回报。为了实现这一点,PPO算法采用了一种叫做“近端策略优化”的技术,即通过限制新策略和旧策略之间的差异来控制更新的幅度,以避免更新过度。 + +![PPO网络架构](./figs/PPO.png) + +更多细节可以参考[原论文](https://arxiv.org/pdf/1707.06347)。 + +## 数据集 +PPO模型在XingTian框架的游戏beamrider、breakout、qbert、spaceinvader的环境下进行强化学习训练。通过智能体与环境交互获得状态动作、奖励、下一状态,并将这些值代入神经网络训练,以获得理想结果。 + +## 环境要求 +* 硬件(GPU or NPU) + * 使用GPU处理器或者NPU处理器来搭建硬件环境。 +* 框架 + * MindSpore(2.0.0),参考MindSpore-2.0.0版本的[安装教程](https://mindspore.cn/install) + * XingTian(0.3.0),参考XingTian的[安装教程](https://github.com/huawei-noah/xingtian) +* 其他第三方库参考`requirements.txt` + +## 快速入门 +完成框架安装和环境配之后,你可以按照如下步骤进行模型的训练和评估 +``` +cd xingtian +# 训练 +xt_main -f examples/_ppo_ms.yaml -t train + +# 训练和评估 +xt_main -f examples/_ppo_ms.yaml -t train_with_evaluate +``` + +## 文件结构说明 + +### 文件目录 +``` +xingtian/xt/model/ppo +├── default_config.py #参数配置 +├── __init__.py +├── ppo_cnn_pigs.py +├── ppo_mlp_zeus.py +├── ppo_cnn_zeus.py +├── ppo_cnn_ms.py # Mindspore实现采用卷积神经网络的PPO算法 +├── ppo_cnn.py +├── ppo_mlp_ms.py # Mindspore实现采用多层感知机的PPO算法 +├── ppo_mlp.py +├── ppo_ms.py # Mindspore实现的PPO算法基类 +└── ppo.py +``` + +### 训练配置 +下面是一个有关 PPO算法的参数示例,我们通过配置系统中已注册的算法,环境信息来组合训练任务。有关不同参数更详细的描述可以在[用户指导](./docs/user.cn.md) 中找到。基于Mindspore实现的PPO算法训练配置脚本位于```examples/_ppo_ms.yaml```中 + +```yaml +# 
examples/beamrider_ppo_ms.yaml +alg_para: # 算法配置 + alg_name: PPO + +env_para: # 环境配置 + env_name: AtariEnv + env_info: + name: BeamRiderNoFrameskip-v4 + vision: False + +agent_para: # agent配置 + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 128 # agent最大步数 + complete_step: 10000000 # agent训练完成帧数 + +model_para: # 模型参数 + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 4] + action_dim: 9 + input_dtype: uint8 + model_config: + BATCH_SIZE: 320 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.00025 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + USE_AMSGRAD: False +env_num: 10 # 启动的环境数量 +speedup: False # 是否使用核绑定来加速 + +benchmark: + log_interval_to_train: 10 +``` +另外在 [examples](./examples) 目录下,可以找到更加丰富的训练配置示例。 + +## 训练过程 + +### 训练 + +通过以下命令训练 +``` +xt_main -f examples/_ppo_ms.yaml -t train > train.log 2>&1 & +``` +训练日志将会被保存到train.log中 +``` +# train.log示例 +... +INFO [My-Labtop 222.20.75.218] Apr 29 00:13:15: Task: T0 | Train_count: 249990 | Steps: 499981224 | Elapsed time: 9 days, 13 minutes, 43 seconds +mean_env_step_ms: 1.927544 mean_explore_ms: 16594.248891 mean_inference_ms: 80.986863 mean_loop_time_ms: 3086.343014 +mean_prepare_data_ms: 40.179479 mean_restore_model_ms: 1.536433 mean_train_time_ms: 589.716685 mean_wait_model_ms: 0.024343 +mean_wait_sample_ms: 206.692171 step_per_second: 642.0 train_loss: 7.732563 train_reward_avg: 420.73 +... +``` + +### 训练和评估 +通过以下命令训练和评估 +``` +xt_main -f examples/_ppo_ms.yaml -t train_with_evaluate > train.log 2>&1 & +``` + +## 性能 + +* 10M step 之后的PPO 收敛回报 (40M frames) + | env | PPO On Mindspore(NPU) | PPO On Mindspore(GPU) |PPO On Tensorflow | + | ------------- | ------------ | --------- |-------------| + | BeamRider | 4424 | 4677 | 4877 | + | Breakout | 344 | 283 | 341 | + | QBert | 14246 | 14561 | 14771 | + | SpaceInvaders | 1279 | 974 | 1025 | + + +* 吞吐量(step per second) + | env | PPO On Mindspore(NPU) | PPO On Mindspore(GPU)| PPO On Tensorflow| + | ------------- | ------------ | --------- |--------------| + | BeamRider | 1788 |2053 |2422 | + | Breakout | 2081 |2347 |2497 | + | QBert | 1802 |2152 |2436 | + | SpaceInvaders | 1793 |2163 |2438 | + +> 实验硬件环境: +> +> TF 72 Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz with single Tesla V100 +> +> MS NPU 72 Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz with single Ascend 910 +> +> MS GPU 20 Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz *2 with single RTX2080Ti diff --git a/examples/ant_ppo_ms.yaml b/examples/ant_ppo_ms.yaml new file mode 100644 index 0000000..6dbe0f0 --- /dev/null +++ b/examples/ant_ppo_ms.yaml @@ -0,0 +1,44 @@ +alg_para: + alg_name: PPO + +env_para: + env_name: GymEnv + env_info: + name: MiniGrid-Ant-v0 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 200 + complete_step: 100000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 3] + action_dim: 4 + input_dtype: uint8 + model_config: + BATCH_SIZE: 10 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.00025 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + + +env_num: 1 +speedup: False + +benchmark: + log_interval_to_train: 5 + + + diff --git a/examples/beamrider_dqn_ms.yaml b/examples/beamrider_dqn_ms.yaml new file mode 100644 index 0000000..8d741ea --- /dev/null +++ b/examples/beamrider_dqn_ms.yaml @@ -0,0 +1,32 @@ +alg_para: + alg_name: DQN + alg_config: { + 
'train_per_checkpoint': 50, + 'prepare_times_per_train': 4, + 'learning_starts': 10000, + 'BUFFER_SIZE': 400000, + } + +env_para: + env_name: AtariEnv + env_info: { 'name': BeamRiderNoFrameskip-v4, 'vision': False} + +agent_para: + agent_name: AtariDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000, + 'complete_step': 10000000, + 'episode_count': 200000 + } + +model_para: + actor: + model_name: DqnCnnMS + state_dim: [84,84,4] + action_dim: 9 + model_config: { + 'LR': 0.00015, + } + +env_num: 2 diff --git a/examples/beamrider_ppo_ms.yaml b/examples/beamrider_ppo_ms.yaml new file mode 100644 index 0000000..26354c1 --- /dev/null +++ b/examples/beamrider_ppo_ms.yaml @@ -0,0 +1,40 @@ +alg_para: + alg_name: PPO + +env_para: + env_name: AtariEnv + env_info: + name: BeamRiderNoFrameskip-v4 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 128 + complete_step: 10000000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 4] + action_dim: 9 + input_dtype: uint8 + model_config: + BATCH_SIZE: 320 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.00025 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + USE_AMSGRAD: False +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 10 diff --git a/examples/breakout_dqn_ms.yaml b/examples/breakout_dqn_ms.yaml new file mode 100644 index 0000000..05700a9 --- /dev/null +++ b/examples/breakout_dqn_ms.yaml @@ -0,0 +1,32 @@ +alg_para: + alg_name: DQN + alg_config: { + 'train_per_checkpoint': 50, + 'prepare_times_per_train': 4, + 'learning_starts': 10000, + 'BUFFER_SIZE': 400000, + } + +env_para: + env_name: AtariEnv + env_info: { 'name': BreakoutNoFrameskip-v4, 'vision': False} + +agent_para: + agent_name: AtariDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000, + 'complete_step': 10000000, + 'episode_count': 200000 + } + +model_para: + actor: + model_name: DqnCnnMS + state_dim: [84,84,4] + action_dim: 4 + model_config: { + 'LR': 0.00015, + } + +env_num: 2 diff --git a/examples/breakout_ppo_ms.yaml b/examples/breakout_ppo_ms.yaml new file mode 100644 index 0000000..3a5b494 --- /dev/null +++ b/examples/breakout_ppo_ms.yaml @@ -0,0 +1,40 @@ +alg_para: + alg_name: PPO + +env_para: + env_name: AtariEnv + env_info: + name: BreakoutNoFrameskip-v4 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 128 + complete_step: 10000000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 4] + action_dim: 4 + input_dtype: uint8 + model_config: + BATCH_SIZE: 320 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.0004 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [256] + USE_AMSGRAD: False +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 10 diff --git a/examples/cartpole_dqn_ms.yaml b/examples/cartpole_dqn_ms.yaml new file mode 100644 index 0000000..1b42682 --- /dev/null +++ b/examples/cartpole_dqn_ms.yaml @@ -0,0 +1,29 @@ +alg_para: + alg_name: DQN + alg_config: { + 'train_per_checkpoint': 10, + 'prepare_times_per_train': 4, + 'learning_starts': 100, + 'save_model': True, + 'save_interval': 100 + } + +env_para: + env_name: GymEnv + env_info: { 'name': CartPole-v0, 'vision': False} + +agent_para: + agent_name: CartpoleDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000 , + 'complete_step': 5000000 + } + +model_para: + 
actor: + model_name: DqnMlpMS + state_dim: [4] + action_dim: 2 + +env_num: 1 diff --git a/examples/cartpole_ppo_ms.yaml b/examples/cartpole_ppo_ms.yaml new file mode 100644 index 0000000..076684d --- /dev/null +++ b/examples/cartpole_ppo_ms.yaml @@ -0,0 +1,52 @@ +alg_para: + alg_name: PPO + alg_config: + process_num: 1 + save_model: True # default False + save_interval: 100 + +env_para: + env_name: GymEnv + env_info: + name: CartPole-v0 + vision: False + +agent_para: + agent_name: PPO + agent_num : 1 + agent_config: + max_steps: 200 + complete_step: 2000000 + complete_episode: 9000 + +model_para: + actor: + model_name: PpoMlpMS + state_dim: [4] + action_dim: 2 + input_dtype: float32 + model_config: + BATCH_SIZE: 200 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.01 + LR: 0.0003 + LOSS_CLIPPING: 0.2 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 8 + SUMMARY: False + VF_SHARE_LAYERS: False + activation: tanh + hidden_sizes: [64, 64] + +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 20 + eval: + # model_path: /YOUR/PATH/TO/EVAL/models + gap: 100 + model_divided_freq: 1 # how many times to divided of each model + episodes_per_eval: 1 + evaluator_num: 1 + max_step_per_episode: 4000 diff --git a/examples/dog_ppo_ms.yaml b/examples/dog_ppo_ms.yaml new file mode 100644 index 0000000..93c38cd --- /dev/null +++ b/examples/dog_ppo_ms.yaml @@ -0,0 +1,44 @@ +alg_para: + alg_name: PPO + +env_para: + env_name: GymEnv + env_info: + name: MiniGrid-Dog-v0 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 200 + complete_step: 100000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 3] + action_dim: 4 + input_dtype: uint8 + model_config: + BATCH_SIZE: 10 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.00025 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + + +env_num: 1 +speedup: False + +benchmark: + log_interval_to_train: 5 + + + diff --git a/examples/muzero/muzero_breakout_ms.yaml b/examples/muzero/muzero_breakout_ms.yaml new file mode 100644 index 0000000..18f9295 --- /dev/null +++ b/examples/muzero/muzero_breakout_ms.yaml @@ -0,0 +1,37 @@ +alg_para: + alg_name: Muzero + alg_config: { + "train_per_checkpoint": 100, + "prepare_times_per_train": 10, + 'BUFFER_SIZE': 10000, + } + +env_para: + env_name: AtariEnv + env_info: { 'name': BreakoutNoFrameskip-v4, vision': False} + +agent_para: + agent_name: MuzeroAtari + agent_num : 1 + agent_config: { + 'max_steps': 200 , + 'complete_step': 500000000, + 'NUM_SIMULATIONS': 50 + } + +model_para: + actor: + model_name: MuzeroCnnMS + state_dim: [84, 84, 4] + action_dim: 4 + max_to_keep: 500 + model_config: { + 'reward_min': 0, + 'reward_max': 50, + 'value_min': 0, + 'value_max': 500, + 'obs_type': 'uint8' + } + +env_num: 50 +speedup: False \ No newline at end of file diff --git a/examples/muzero/muzero_pong_ms.yaml b/examples/muzero/muzero_pong_ms.yaml new file mode 100644 index 0000000..6d853c4 --- /dev/null +++ b/examples/muzero/muzero_pong_ms.yaml @@ -0,0 +1,37 @@ +alg_para: + alg_name: Muzero + alg_config: { + "train_per_checkpoint": 100, + "prepare_times_per_train": 10, + 'BUFFER_SIZE': 10000, + } + +env_para: + env_name: AtariEnv + env_info: { 'name': PongNoFrameskip-v4, vision': False} + +agent_para: + agent_name: MuzeroAtari + agent_num : 1 + agent_config: { + 'max_steps': 200 , + 'complete_step': 50000000, + 'NUM_SIMULATIONS': 50 + } + +model_para: + actor: + model_name: MuzeroCnnMS + 
state_dim: [84, 84, 4] + action_dim: 6 + max_to_keep: 500 + model_config: { + 'reward_min': -2, + 'reward_max': 2, + 'value_min': -21, + 'value_max': 21, + 'obs_type': 'int8' + } + +env_num: 50 +speedup: False \ No newline at end of file diff --git a/examples/pendulum_ppo_ms.yaml b/examples/pendulum_ppo_ms.yaml new file mode 100644 index 0000000..fdc97e3 --- /dev/null +++ b/examples/pendulum_ppo_ms.yaml @@ -0,0 +1,43 @@ +alg_para: + alg_name: PPO + alg_config: + process_num: 1 + only_save_best_model: True + +env_para: + env_name: GymEnv + env_info: + name: Pendulum-v0 + vision: False + +agent_para: + agent_name: PPO + agent_num : 1 + agent_config: + max_steps: 200 + complete_step: 2000000 + +model_para: + actor: + model_name: PpoMlpMS + state_dim: [3] + action_dim: 1 + input_dtype: float32 + model_config: + BATCH_SIZE: 200 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.01 + LR: 0.0003 + LOSS_CLIPPING: 0.2 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 8 + SUMMARY: False + VF_SHARE_LAYERS: False + activation: tanh + hidden_sizes: [64, 64] + +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 10 diff --git a/examples/pong_dqn_ms.yaml b/examples/pong_dqn_ms.yaml new file mode 100644 index 0000000..d0dfa44 --- /dev/null +++ b/examples/pong_dqn_ms.yaml @@ -0,0 +1,30 @@ +alg_para: + alg_name: DQN + alg_config: { + 'train_per_checkpoint': 50, + 'prepare_times_per_train': 4, + 'learning_starts': 10000, + 'BUFFER_SIZE': 10000, +# 'save_model': True, + 'save_interval': 100 + } + +env_para: + env_name: AtariEnv + env_info: { 'name': PongNoFrameskip-v4, 'vision': False} + +agent_para: + agent_name: AtariDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000, + 'complete_step': 10000000 + } + +model_para: + actor: + model_name: DqnCnnPongMS + state_dim: [84,84,4] + action_dim: 6 + +env_num: 2 diff --git a/examples/pong_ppo_ms.yaml b/examples/pong_ppo_ms.yaml new file mode 100644 index 0000000..723d6af --- /dev/null +++ b/examples/pong_ppo_ms.yaml @@ -0,0 +1,41 @@ +alg_para: + alg_name: PPO + +env_para: + env_name: AtariEnv + env_info: + name: PongNoFrameskip-v4 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 200 + complete_step: 10000000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 4] + action_dim: 6 + input_dtype: uint8 + model_config: + BATCH_SIZE: 200 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.00025 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + + +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 10 diff --git a/examples/qbert_dqn_ms.yaml b/examples/qbert_dqn_ms.yaml new file mode 100644 index 0000000..3654d25 --- /dev/null +++ b/examples/qbert_dqn_ms.yaml @@ -0,0 +1,32 @@ +alg_para: + alg_name: DQN + alg_config: { + 'train_per_checkpoint': 50, + 'prepare_times_per_train': 4, + 'learning_starts': 10000, + 'BUFFER_SIZE': 400000, + } + +env_para: + env_name: AtariEnv + env_info: { 'name': QbertNoFrameskip-v4, 'vision': False} + +agent_para: + agent_name: AtariDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000, + 'complete_step': 10000000, + 'episode_count': 200000 + } + +model_para: + actor: + model_name: DqnCnnMS + state_dim: [84,84,4] + action_dim: 6 + model_config: { + 'LR': 0.00015, + } + +env_num: 2 diff --git a/examples/qbert_ppo_ms.yaml b/examples/qbert_ppo_ms.yaml new file mode 100644 index 0000000..4111a84 --- /dev/null +++ b/examples/qbert_ppo_ms.yaml @@ -0,0 +1,40 @@ +alg_para: + 
alg_name: PPO + +env_para: + env_name: AtariEnv + env_info: + name: QbertNoFrameskip-v4 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 128 + complete_step: 10000000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 4] + action_dim: 6 + input_dtype: uint8 + model_config: + BATCH_SIZE: 320 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.0005 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + USE_AMSGRAD: True +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 10 diff --git a/examples/spaceinvader_dqn_ms.yaml b/examples/spaceinvader_dqn_ms.yaml new file mode 100644 index 0000000..c0815c8 --- /dev/null +++ b/examples/spaceinvader_dqn_ms.yaml @@ -0,0 +1,32 @@ +alg_para: + alg_name: DQN + alg_config: { + 'train_per_checkpoint': 50, + 'prepare_times_per_train': 4, + 'learning_starts': 10000, + 'BUFFER_SIZE': 400000, + } + +env_para: + env_name: AtariEnv + env_info: { 'name': SpaceInvadersNoFrameskip-v4, 'vision': False} + +agent_para: + agent_name: AtariDqn + agent_num : 1 + agent_config: { + 'max_steps': 2000, + 'complete_step': 10000000, + 'episode_count': 200000 + } + +model_para: + actor: + model_name: DqnCnnMS + state_dim: [84,84,4] + action_dim: 6 + model_config: { + 'LR': 0.00015, + } + +env_num: 2 diff --git a/examples/spaceinvader_ppo_ms.yaml b/examples/spaceinvader_ppo_ms.yaml new file mode 100644 index 0000000..8ccf75f --- /dev/null +++ b/examples/spaceinvader_ppo_ms.yaml @@ -0,0 +1,40 @@ +alg_para: + alg_name: PPO + +env_para: + env_name: AtariEnv + env_info: + name: SpaceInvadersNoFrameskip-v4 + vision: False + +agent_para: + agent_name: AtariPpo + agent_num : 1 + agent_config: + max_steps: 128 + complete_step: 10000000 + +model_para: + actor: + model_name: PpoCnnMS + state_dim: [84, 84, 4] + action_dim: 6 + input_dtype: uint8 + model_config: + BATCH_SIZE: 320 + CRITIC_LOSS_COEF: 1.0 + ENTROPY_LOSS: 0.003 + LOSS_CLIPPING: 0.1 + LR: 0.00025 + MAX_GRAD_NORM: 5.0 + NUM_SGD_ITER: 4 + SUMMARY: False + VF_SHARE_LAYERS: True + activation: relu + hidden_sizes: [512] + USE_AMSGRAD: False +env_num: 10 +speedup: False + +benchmark: + log_interval_to_train: 10 diff --git a/figs/DQN.png b/figs/DQN.png new file mode 100644 index 0000000..74c1583 Binary files /dev/null and b/figs/DQN.png differ diff --git a/figs/Muzero.png b/figs/Muzero.png new file mode 100644 index 0000000..261d1ab Binary files /dev/null and b/figs/Muzero.png differ diff --git a/figs/PPO.png b/figs/PPO.png new file mode 100644 index 0000000..65faf77 Binary files /dev/null and b/figs/PPO.png differ diff --git a/requirements.txt b/requirements.txt index 3428a12..bade52e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,18 @@ -zmq -h5py -gym[atari] -tqdm -imageio -matplotlib -Ipython -pyyaml -pyarrow -lz4 -fabric2 -absl-py -psutil -tensorboardX -setproctitle +zmq==0.0.0 +h5py==3.6.0 +gym[atari]==0.19.0 +gym_minigrid==1.0.3 +tensorflow==1.15.0 +tqdm==4.62.3 +imageio==2.12.0 +matplotlib==3.0.3 +Ipython==7.30.0 +pyyaml==6.0 +pyarrow==6.0.1 +lz4==3.1.10 +fabric2==2.6.0 +absl-py==1.0.0 +psutil==5.8.0 +tensorboardX==2.4.1 +setproctitle==1.2.2 +protobuf==3.19.0 diff --git a/xt/framework/explorer.py b/xt/framework/explorer.py index 51a3441..c0a738e 100644 --- a/xt/framework/explorer.py +++ b/xt/framework/explorer.py @@ -23,6 +23,10 @@ from copy import deepcopy from absl import logging import setproctitle +try: + from xt.model.ms_compat import 
ms +except: + pass from zeus.common.ipc.share_buffer import ShareBuf from xt.framework.agent_group import AgentGroup from zeus.common.ipc.uni_comm import UniComm @@ -58,6 +62,10 @@ def start_explore(self): """Start explore process.""" signal.signal(signal.SIGINT, signal.SIG_IGN) os.environ["CUDA_VISIBLE_DEVICES"] = str(-1) + try: + ms.set_context(device_target='CPU') + except: + pass explored_times = 0 try: diff --git a/xt/framework/predictor.py b/xt/framework/predictor.py index 7a7a367..1f28d37 100644 --- a/xt/framework/predictor.py +++ b/xt/framework/predictor.py @@ -23,6 +23,10 @@ from copy import deepcopy from xt.algorithm import alg_builder import setproctitle +try: + from xt.model.ms_compat import ms +except: + pass from zeus.common.ipc.uni_comm import UniComm from zeus.common.ipc.message import message, get_msg_data, set_msg_info, set_msg_data, get_msg_info from zeus.common.util.profile_stats import PredictStats, TimerRecorder @@ -86,6 +90,10 @@ def predict(self, recv_data): def start(self): os.environ["CUDA_VISIBLE_DEVICES"] = str(-1) + try: + ms.set_context(device_target='CPU') + except: + pass alg_para = self.config_info.get('alg_para') setproctitle.setproctitle("xt_predictor") diff --git a/xt/model/dqn/dqn_cnn.py b/xt/model/dqn/dqn_cnn.py index 93c672d..4730d95 100644 --- a/xt/model/dqn/dqn_cnn.py +++ b/xt/model/dqn/dqn_cnn.py @@ -24,7 +24,6 @@ from xt.model import XTModel from xt.model.tf_utils import TFVariables from zeus.common.util.common import import_config - from zeus.common.util.register import Registers diff --git a/xt/model/dqn/dqn_cnn_ms.py b/xt/model/dqn/dqn_cnn_ms.py new file mode 100644 index 0000000..4fa70a6 --- /dev/null +++ b/xt/model/dqn/dqn_cnn_ms.py @@ -0,0 +1,137 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
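+"""CNN-based DQN network implemented with MindSpore, with a custom train-one-step cell
+supporting dynamic loss scaling on Ascend and global-norm gradient clipping."""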
+ +from zeus.common.util.register import Registers +from xt.model.model_ms import XTModel_MS +from xt.model.ms_utils import MSVariables +from xt.model.dqn.default_config import LR +from xt.model.ms_compat import ms +from xt.model.ms_compat import Conv2d, Dense, Flatten, ReLU, Adam, MSELoss, WithLossCell, MultitypeFuncGraph, \ + DynamicLossScaleUpdateCell, Cast, Cell, Tensor +from zeus.common.util.common import import_config +import mindspore.ops as ops +import numpy as np +ms.set_context(mode =ms.GRAPH_MODE) +@Registers.model +class DqnCnnMS(XTModel_MS): + """Docstring for DqnCnn.""" + + def __init__(self, model_info): + model_config = model_info.get('model_config', None) + import_config(globals(), model_config) + + self.state_dim = model_info['state_dim'] + self.action_dim = model_info['action_dim'] + self.learning_rate = LR + self.dueling = model_config.get('dueling', False) + self.net = DqnCnnNet(state_dim=self.state_dim, action_dim=self.action_dim, dueling=self.dueling) + super().__init__(model_info) + self.net.compile(ms.Tensor(np.zeros((1, 84, 84, 4))).astype(ms.float32)) + + def create_model(self, model_info): + """Create Deep-Q CNN network.""" + loss_fn = MSELoss() + adam = Adam(params=self.net.trainable_params(), learning_rate=self.learning_rate, use_amsgrad=True) + loss_net = WithLossCell(self.net, loss_fn) + device_target = ms.get_context("device_target") + if device_target == 'Ascend': + manager = DynamicLossScaleUpdateCell(loss_scale_value=2 ** 12, scale_factor=2, scale_window=1000) + model = MyTrainOneStepCell(loss_net, adam, manager, grad_clip=True, clipnorm=10.) + else: + model = MyTrainOneStepCell(loss_net, adam, grad_clip=True, clipnorm=10.) + self.actor_var = MSVariables(self.net) + return model + + def predict(self, state): + state = Tensor(state, dtype=ms.float32) + return self.net(state).asnumpy() + + +class DqnCnnNet(Cell): + def __init__(self, **descript): + super(DqnCnnNet, self).__init__() + self.state_dim = descript.get("state_dim") + action_dim = descript.get("action_dim") + self.dueling = descript.get("dueling") + self.convlayer1 = Conv2d(self.state_dim[2], 32, kernel_size=8, stride=4, pad_mode='valid', + weight_init="xavier_uniform") + self.convlayer2 = Conv2d(32, 64, kernel_size=4, stride=2, pad_mode='valid', weight_init="xavier_uniform") + self.convlayer3 = Conv2d(64, 64, kernel_size=3, stride=1, pad_mode='valid', weight_init="xavier_uniform") + self.relu = ReLU() + self.flattenlayer = Flatten() + _dim = ( + (((self.state_dim[0] - 4) // 4 - 2) // 2 - 2) + * (((self.state_dim[1] - 4) // 4 - 2) // 2 - 2) + * 64 + ) + self.denselayer1 = Dense(_dim, 256, activation='relu', weight_init="xavier_uniform") + self.denselayer2 = Dense(256, action_dim, weight_init="xavier_uniform") + self.denselayer3 = Dense(256, 1, weight_init="xavier_uniform") + + def construct(self, x): + out = Cast()(x.transpose((0, 3, 1, 2)), ms.float32) / 255. 
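+        # Conv stack (8x8/stride 4 -> 4x4/stride 2 -> 3x3/stride 1), each followed by ReLU,
+        # then a 256-unit dense layer, matching the three-conv / two-dense architecture in the README.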
+ out = self.convlayer1(out) + out = self.relu(out) + out = self.convlayer2(out) + out = self.relu(out) + out = self.convlayer3(out) + out = self.relu(out) + out = self.flattenlayer(out) + out = self.denselayer1(out) + value = self.denselayer2(out) + if self.dueling: + adv = self.denselayer3(out) + mean = value.sub(value.mean(axis=1, keep_dims=True)) + value = adv.add(mean) + return value + + +_grad_scale = MultitypeFuncGraph("grad_scale") + + +@_grad_scale.register("Tensor", "Tensor") +def tensor_grad_scale(scale, grad): + return grad * ops.cast(ops.Reciprocal()(scale), ops.dtype(grad)) + + +class MyTrainOneStepCell(ms.nn.TrainOneStepWithLossScaleCell): + def __init__(self, network, optimizer, scale_sense=1, grad_clip=False, clipnorm=1.): + self.clipnorm = clipnorm + if isinstance(scale_sense, (int, float)): + scale_sense = Tensor(scale_sense, dtype=ms.float32) + super(MyTrainOneStepCell, self).__init__(network, optimizer, scale_sense) + self.grad_clip = grad_clip + + def construct(self,*inputs ): + weights = self.weights + loss = self.network(*inputs) + scaling_sens = self.scale_sense + status, scaling_sens = self.start_overflow_check(loss, scaling_sens) + scaling_sens_filled = ops.ones_like(loss) * ops.cast(scaling_sens, ops.dtype(loss)) + grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled) + grads = self.hyper_map(ops.partial(_grad_scale, scaling_sens), grads) + if self.grad_clip: + grads = ops.clip_by_global_norm(grads, self.clipnorm) + grads = self.grad_reducer(grads) + cond = self.get_overflow_status(status, grads) + overflow = self.process_loss_scale(cond) + if not overflow: + loss = ops.depend(loss, self.optimizer(grads)) + return loss \ No newline at end of file diff --git a/xt/model/dqn/dqn_cnn_pong_ms.py b/xt/model/dqn/dqn_cnn_pong_ms.py new file mode 100644 index 0000000..535ac84 --- /dev/null +++ b/xt/model/dqn/dqn_cnn_pong_ms.py @@ -0,0 +1,48 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
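+"""Pong-specific DQN CNN model for MindSpore; it reuses DqnCnnMS and only overrides model creation and prediction."""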
+from xt.model.dqn.default_config import LR +from xt.model.dqn.dqn_cnn_ms import DqnCnnMS +from xt.model.ms_utils import MSVariables +from xt.model.ms_compat import ms, Adam, MSELoss, WithLossCell, DynamicLossScaleUpdateCell, Tensor +from zeus.common.util.register import Registers +from xt.model.dqn.dqn_cnn_ms import MyTrainOneStepCell + + +@Registers.model +class DqnCnnPongMS(DqnCnnMS): + """Docstring for DqnPong.""" + + def create_model(self, model_info): + """Create Deep-Q CNN network.""" + loss_fn = MSELoss() + adam = Adam(params=self.net.trainable_params(), learning_rate=self.learning_rate, use_amsgrad=True) + loss_net = WithLossCell(self.net, loss_fn) + device_target = ms.get_context("device_target") + if device_target == 'Ascend': + manager = DynamicLossScaleUpdateCell(loss_scale_value=2 ** 12, scale_factor=2, scale_window=1000) + model = MyTrainOneStepCell(loss_net, adam, manager, grad_clip=True, clipnorm=10.) + else: + model = MyTrainOneStepCell(loss_net, adam, grad_clip=True, clipnorm=10.) + self.actor_var = MSVariables(self.net) + return model + + def predict(self, state): + state = Tensor(state, dtype=ms.float32) + return self.net(state).asnumpy() \ No newline at end of file diff --git a/xt/model/dqn/dqn_mlp.py b/xt/model/dqn/dqn_mlp.py index 6e809ea..905349d 100644 --- a/xt/model/dqn/dqn_mlp.py +++ b/xt/model/dqn/dqn_mlp.py @@ -84,4 +84,4 @@ def layer_normalize(x): def layer_add(x): """Compute Q given Advantage and V.""" - return x[0] + x[1] + return x[0] + x[1] \ No newline at end of file diff --git a/xt/model/dqn/dqn_mlp_ms.py b/xt/model/dqn/dqn_mlp_ms.py new file mode 100644 index 0000000..731a86f --- /dev/null +++ b/xt/model/dqn/dqn_mlp_ms.py @@ -0,0 +1,83 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
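+"""MLP-based DQN network implemented with MindSpore, with optional dueling value/advantage heads."""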
+from xt.model.dqn.default_config import HIDDEN_SIZE, NUM_LAYERS, LR +from xt.model.model_ms import XTModel_MS +from zeus.common.util.common import import_config +from zeus.common.util.register import Registers +from xt.model.ms_compat import Dense, Adam, DynamicLossScaleUpdateCell, MSELoss, Cell, Model, ms +import mindspore.ops as ops +from xt.model.ms_utils import MSVariables +from xt.model.dqn.dqn_cnn_ms import MyTrainOneStepCell + + +@Registers.model +class DqnMlpMS(XTModel_MS): + + def __init__(self, model_info): + model_config = model_info.get('model_config', None) + import_config(globals(), model_config) + + self.state_dim = model_info['state_dim'] + self.action_dim = model_info['action_dim'] + self.learning_rate = LR + self.dueling = model_config.get('dueling', False) + self.net = DqnMlpNet(state_dim=self.state_dim, action_dim=self.action_dim, dueling=self.dueling) + super().__init__(model_info) + + def create_model(self, model_info): + """Create Deep-Q CNN network.""" + loss_fn = MSELoss() + adam = Adam(params=self.net.trainable_params(), learning_rate=self.learning_rate) + loss_net = ms.nn.WithLossCell(self.net, loss_fn) + device_target = ms.get_context("device_target") + if device_target == 'Ascend': + manager = DynamicLossScaleUpdateCell(loss_scale_value=2 ** 12, scale_factor=2, scale_window=1000) + model = MyTrainOneStepCell(loss_net, adam, manager, grad_clip=True, clipnorm=10.) + else: + model = MyTrainOneStepCell(loss_net, adam, grad_clip=True, clipnorm=10.) + self.actor_var = MSVariables(self.net) + return model + + def predict(self, state): + state = ms.Tensor(state, dtype=ms.float32) + return self.net(state).asnumpy() + + +class DqnMlpNet(Cell): + def __init__(self, **descript): + super(DqnMlpNet, self).__init__() + self.state_dim = descript.get("state_dim") + self.action_dim = descript.get("action_dim") + self.dueling = descript.get("dueling") + self.denselayer1 = Dense(self.state_dim[-1], HIDDEN_SIZE, activation='relu', weight_init='xavier_uniform') + self.denselayer2 = Dense(HIDDEN_SIZE, HIDDEN_SIZE, activation='relu', weight_init='xavier_uniform') + self.denselayer3 = Dense(HIDDEN_SIZE, self.action_dim, weight_init='xavier_uniform') + self.denselayer4 = Dense(HIDDEN_SIZE, 1, weight_init='xavier_uniform') + + def construct(self, x): + out = self.denselayer1(x.astype("float32")) + for _ in range(NUM_LAYERS - 1): + out = self.denselayer2(out) + value = self.denselayer3(out) + if self.dueling: + adv = self.denselayer4(out) + mean = value.sub(value.mean(axis=1, keep_dims=True)) + value = adv.add(mean) + return value \ No newline at end of file diff --git a/xt/model/model_ms.py b/xt/model/model_ms.py new file mode 100644 index 0000000..b8edda0 --- /dev/null +++ b/xt/model/model_ms.py @@ -0,0 +1,93 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +"""MS_Model base.""" + +import os +import glob +import mindspore as ms +from xt.model.model import XTModel + + + +class XTModel_MS(XTModel): + + def __init__(self, model_info): + # User Could assign it within create model. + self.actor_var = None + self._summary = model_info.get("summary", False) + self.model_format = model_info.get('model_format') + self.max_to_keep = model_info.get("max_to_keep", 100) + self.model = self.create_model(model_info) + if 'init_weights' in model_info: + model_name = model_info['init_weights'] + try: + self.load_model(model_name) + print("load weight: {} success.".format(model_name)) + except BaseException: + print("load weight: {} failed!".format(model_name)) + + def predict(self, state): + """ + Do predict use the newest model. + + :param state: + :return: output tensor ref to policy.model + """ + return self.model.predict(state) + + def train(self, state, label): + """Train the model.""" + state = ms.Tensor(state, dtype=ms.float32) + label = ms.Tensor(label, dtype=ms.float32) + loss = self.model(state, label) + return loss.asnumpy().item() + + def set_weights(self, weights): + """Set weight with memory tensor.""" + self.actor_var.set_weights(weights) + + def get_weights(self): + """Get the weights.""" + return self.actor_var.get_weights() + + def get_grad(self, data): + self.model.get_grad(data) + + def save_model(self, file_name): + """Save weights into .h5 file.""" + # check max model file to keep + if self.max_to_keep > -1: + check_keep_model(os.path.dirname(file_name), self.max_to_keep) + self.actor_var.save_weights(file_name + ".npz") + + def load_model(self, model_name): + self.actor_var.set_weights_with_npz(model_name) + + +def check_keep_model(model_path, keep_num): + """Check model saved count under path.""" + target_file = glob.glob( + os.path.join( + model_path, + "actor*".format(model_path))) + if len(target_file) > keep_num: + to_rm_model = sorted(target_file, reverse=True)[keep_num:] + for item in to_rm_model: + os.remove(item) diff --git a/xt/model/model_utils_ms.py b/xt/model/model_utils_ms.py new file mode 100644 index 0000000..b4156fe --- /dev/null +++ b/xt/model/model_utils_ms.py @@ -0,0 +1,327 @@ +"""Retain model utils.""" + +import numpy as np + +from xt.model.ms_compat import ms, SequentialCell, Dense, Conv2d, Flatten,\ + get_activation, Cell +from mindspore._checkparam import twice + +ACTIVATION_MAP_MS = { + 'sigmoid': 'sigmoid', + 'tanh': 'tanh', + 'softsign': 'softsign', + 'softplus': 'softplus', + 'relu': 'relu', + 'leakyrelu': 'leakyrelu', + 'elu': 'elu', + 'selu': 'seLU', + 'hswish': 'hswish', + 'gelu': 'gelu' +} + + +def cal_shape(input_shape, kernel_size, stride): + kernel_size = twice(kernel_size) + stride = twice(stride) + return tuple( + (v - kernel_size[i]) // stride[i] + 1 for i, + v in enumerate(input_shape)) + + +class MlpBackbone(Cell): + def __init__(self, state_dim, act_dim, hidden_sizes, activation): + super().__init__() + self.dense_layer_pi = bulid_mlp_layers_ms( + state_dim[-1], hidden_sizes, activation) + self.dense_pi = Dense( + 
hidden_sizes[-1], act_dim, weight_init="XavierUniform") + self.dense_layer_v = bulid_mlp_layers_ms( + state_dim[-1], hidden_sizes, activation) + self.dense_out = Dense( + hidden_sizes[-1], 1, weight_init="XavierUniform") + + def construct(self, x): + if x.dtype == ms.float64: + x = x.astype(ms.float32) + pi_latent = self.dense_layer_pi(x) + pi_latent = self.dense_pi(pi_latent) + out_value = self.dense_layer_v(x) + out_value = self.dense_out(out_value) + + return [pi_latent, out_value] + + +class MlpBackboneShare(Cell): + def __init__(self, state_dim, act_dim, hidden_sizes, activation): + super().__init__() + self.dense_layer_share = bulid_mlp_layers_ms( + state_dim[-1], hidden_sizes, activation + ) + self.dense_pi = Dense( + hidden_sizes[-1], act_dim, weight_init="XavierUniform") + self.dense_out = Dense( + hidden_sizes[-1], 1, weight_init="XavierUniform") + + def construct(self, x): + if x.dtype == ms.float64: + x = x.astype(ms.float32) + share = self.dense_layer_share(x) + pi_latent = self.dense_pi(share) + out_value = self.dense_out(share) + + return [pi_latent, out_value] + + +class CnnBackbone(Cell): + def __init__( + self, + state_dim, + act_dim, + hidden_sizes, + activation, + filter_arches, + dtype, + ): + super().__init__() + self.dtype = dtype + self.conv_layer_pi = build_conv_layers_ms( + state_dim[-1], filter_arches, activation) + self.flatten_layer = Flatten() + height, width = state_dim[-3], state_dim[-2] + filters = 1 + for filters, kernel_size, strides in filter_arches: + height, width = cal_shape((height, width), kernel_size, strides) + dim = height * width * filters + self.dense_layer_pi = bulid_mlp_layers_ms( + dim, hidden_sizes, activation) + self.dense_pi = Dense( + hidden_sizes[-1], act_dim, weight_init="XavierUniform") + self.conv_layer_v = build_conv_layers_ms( + state_dim[-1], filter_arches, activation) + self.dense_layer_v = bulid_mlp_layers_ms(dim, hidden_sizes, activation) + self.dense_v = Dense(hidden_sizes[-1], 1, weight_init="XavierUniform") + + def construct(self, x): + x = x.transpose((0, 3, 1, 2)) + if self.dtype == "uint8": + x = layer_function_ms(x) + pi_latent = self.conv_layer_pi(x) + pi_latent = self.flatten_layer(pi_latent) + pi_latent = self.dense_layer_pi(pi_latent) + pi_latent = self.dense_pi(pi_latent) + out_value = self.conv_layer_v(x) + out_value = self.flatten_layer(out_value) + out_value = self.dense_layer_v(out_value) + out_value = self.dense_v(out_value) + + return [pi_latent, out_value] + + +class CnnBackboneShare(Cell): + def __init__( + self, + state_dim, + act_dim, + hidden_sizes, + activation, + filter_arches, + dtype, + ): + super().__init__() + self.dtype = dtype + self.conv_layer_share = build_conv_layers_ms( + state_dim[-1], filter_arches, activation + ) + self.flatten_layer = Flatten() + height, width = state_dim[-3], state_dim[-2] + filters = 1 + for filters, kernel_size, strides in filter_arches: + height, width = cal_shape((height, width), kernel_size, strides) + dim = height * width * filters + self.dense_layer_share = bulid_mlp_layers_ms( + dim, hidden_sizes, activation) + self.dense_pi = Dense( + hidden_sizes[-1], act_dim, weight_init="XavierUniform") + self.dense_v = Dense(hidden_sizes[-1], 1, weight_init="XavierUniform") + + def construct(self, x): + x = x.transpose((0, 3, 1, 2)) + if self.dtype == "uint8": + x = layer_function_ms(x) + share = self.conv_layer_share(x) + share = self.flatten_layer(share) + share = self.dense_layer_share(share) + pi_latent = self.dense_pi(share) + out_value = self.dense_v(share) + return 
[pi_latent, out_value] + + +def get_mlp_backbone_ms( + state_dim, + act_dim, + hidden_sizes, + activation, + vf_share_layers=False, + summary=False, + dtype='float32', +): + """Get mlp backbone.""" + if dtype != "float32": + raise ValueError( + 'dtype: {} not supported automatically, please implement it yourself'.format( + dtype + ) + ) + if not vf_share_layers: + return MlpBackbone(state_dim, act_dim, hidden_sizes, activation) + + return MlpBackboneShare(state_dim, act_dim, hidden_sizes, activation) + + +def get_cnn_backbone_ms( + state_dim, + act_dim, + hidden_sizes, + activation, + filter_arches, + vf_share_layers=True, + summary=False, + dtype='uint8', +): + """Get CNN backbone.""" + if dtype != "uint8" and dtype != "float32": + raise ValueError( + 'dtype: {} not supported automatically, \ + please implement it yourself'.format( + dtype + ) + ) + if vf_share_layers: + return CnnBackboneShare( + state_dim, + act_dim, + hidden_sizes, + activation, + filter_arches, + dtype, + ) + return CnnBackbone( + state_dim, + act_dim, + hidden_sizes, + activation, + filter_arches, + dtype, + ) + + +def bulid_mlp_layers_ms(input_size, hidden_sizes, activation): + build_block = SequentialCell() + for hidden_size in hidden_sizes: + build_block.append( + Dense( + input_size, + hidden_size, + activation=activation, + weight_init="XavierUniform", + ) + ) + input_size = hidden_size + return build_block + + +def build_conv_layers_ms(input_size, filter_arches, activation): + build_block = SequentialCell() + for filters, kernel_size, strides in filter_arches: + build_block.append( + Conv2d( + input_size, + filters, + kernel_size, + strides, + pad_mode="valid", + has_bias=True, + weight_init="XavierUniform", + ) + ) + build_block.append(get_activation(activation)) + input_size = filters + return build_block + + +def get_mlp_default_settings_ms(kind): + """Get default setting for mlp model.""" + if kind == "hidden_sizes": + return [64, 64] + elif kind == "activation": + return "tanh" + else: + raise KeyError("unknown type: {}".format(kind)) + + +def get_cnn_default_settings_ms(kind): + """Get default setting for mlp model.""" + if kind == 'hidden_sizes': + return [512] + elif kind == 'activation': + return 'relu' + else: + raise KeyError('unknown type: {}'.format(kind)) + + +def get_default_filters_ms(shape): + """Get default model set for atari environments.""" + shape = list(shape) + if len(shape) != 3: + raise ValueError( + 'Without default architecture for obs shape {}'.format(shape)) + filters_84x84 = [[32, (8, 8), (4, 4)], [32, (4, 4), (2, 2)], [ + 64, (3, 3), (1, 1)]] + filters_42x42 = [[32, (4, 4), (2, 2)], [32, (4, 4), (2, 2)], [ + 64, (3, 3), (1, 1)]] + filters_15x15 = [[32, (5, 5), (1, 1)], [64, (3, 3), (1, 1)], [ + 64, (3, 3), (1, 1)]] + if shape[:2] == [84, 84]: + return filters_84x84 + elif shape[:2] == [42, 42]: + return filters_42x42 + elif shape[:2] == [15, 15]: + return filters_15x15 + else: + filters = [] + input_w, input_h = shape[:2] + flat_flag_w, flat_flag_h = False, False + num_filters = 16 + while not flat_flag_w or not flat_flag_h: + filter_w, stride_w, flat_flag_w = _infer_stride_and_kernel_ms( + input_w, flat_flag_w + ) + filter_h, stride_h, flat_flag_h = _infer_stride_and_kernel_ms( + input_h, flat_flag_h + ) + filters.append( + (num_filters, (filter_w, filter_h), (stride_w, stride_h))) + num_filters *= 2 + input_w = input_w // stride_w + input_h = input_h // stride_h + return filters + + +def _infer_stride_and_kernel_ms(size, flat_flag): + if flat_flag or size <= 3: + return 1, 1, 
True + + if size <= 8: + return 3, 1, True + elif size <= 64: + return 5, 2, False + else: + power = int(np.floor(np.log2(size))) + stride = 2**power + return 2 * stride + 1, stride, False + + +def layer_function_ms(x): + """Normalize data.""" + return x.astype(ms.float32) / 255.0 + diff --git a/xt/model/ms_compat.py b/xt/model/ms_compat.py new file mode 100644 index 0000000..e40bbc4 --- /dev/null +++ b/xt/model/ms_compat.py @@ -0,0 +1,75 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import sys + + +def import_ms_compact(): + """Import mindspore with compact behavior.""" + if "mindspore" not in sys.modules: + try: + import mindspore.compat.v1 as ms + ms.disable_v2_behavior() + except ImportError: + import mindspore as ms + return ms + else: + return sys.modules["mindspore"] + + +ms = import_ms_compact() + + +# pylint: disable=W0611 +if ms.__version__ in ("2.0.0"): + from mindspore.nn import Adam + from mindspore.nn import Conv2d, Dense, Flatten, ReLU + from mindspore.nn import MSELoss + from mindspore.train import Model + from mindspore.nn import WithLossCell, TrainOneStepCell, SoftmaxCrossEntropyWithLogits, SequentialCell + from mindspore.nn import Cell, WithLossCell, DynamicLossScaleUpdateCell, get_activation, LossBase, FixedLossScaleUpdateCell + from mindspore import Model, Tensor + from mindspore.ops import Cast, MultitypeFuncGraph, ReduceSum, ReduceMax, ReduceMin, ReduceMean, Reciprocal + from mindspore.ops import Depend, clip_by_global_norm, Minimum, Maximum, Exp, Square, clip_by_value + from mindspore import History, value_and_grad + +if ms.__version__ in ("1.9.0"): + from mindspore.nn import Adam + from mindspore.nn import Conv2d, Dense, Flatten, ReLU + from mindspore.nn import MSELoss + from mindspore.train import Model + from mindspore.nn import WithLossCell, TrainOneStepCell, SoftmaxCrossEntropyWithLogits, SequentialCell + from mindspore.nn import Cell, WithLossCell, DynamicLossScaleUpdateCell, get_activation, LossBase, FixedLossScaleUpdateCell + from mindspore import Model, Tensor + from mindspore.ops import Cast, MultitypeFuncGraph, ReduceSum, ReduceMax, ReduceMin, ReduceMean, Reciprocal + from mindspore.ops import Depend, value_and_grad, clip_by_global_norm, Minimum, Maximum, Exp, Square, clip_by_value + from mindspore import History + +def loss_to_val(loss): + """Make keras instance into value.""" + if isinstance(loss, History): + loss = loss.history.get("loss")[0] + return loss + + 
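+# Map dtype strings used in model configs to MindSpore dtypes.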
+DTYPE_MAP = { + "float32": ms.float32, + "float16": ms.float16, +} diff --git a/xt/model/ms_dist.py b/xt/model/ms_dist.py new file mode 100644 index 0000000..1535ec8 --- /dev/null +++ b/xt/model/ms_dist.py @@ -0,0 +1,186 @@ +"""Action distribution with mindspore""" +import numpy as np +from xt.model.ms_compat import ms, Cast, ReduceSum, ReduceMax, Tensor +from mindspore import ops +from mindspore import ms_class +import mindspore.nn.probability.distribution as msd + +@ms_class +class ActionDist: + """Build base action distribution.""" + + def init_by_param(self, param): + raise NotImplementedError + + def flatparam(self): + raise NotImplementedError + + def sample(self, repeat): + """Sample action from this distribution.""" + raise NotImplementedError + + def sample_dtype(self): + raise NotImplementedError + + def get_shape(self): + return self.flatparam().shape.as_list() + + @property + def shape(self): + return self.get_shape() + + def __getitem__(self, idx): + return self.flatparam()[idx] + + def neglog_prob(self, x, logits): + raise NotImplementedError + + def log_prob(self, x, logits): + """Calculate the log-likelihood.""" + return -self.neglog_prob(x, logits) + + def mode(self): + raise NotImplementedError + + def entropy(self): + raise NotImplementedError + + def kl(self, other): + raise NotImplementedError + + +class DiagGaussianDist(ActionDist): + """Build Diagonal Gaussian distribution, each vector represented one distribution.""" + + def __init__(self, size): + self.size = size + self.reduce_sum = ReduceSum(keep_dims=True) + self.log = ops.Log() + self.shape = ops.Shape() + self.square = ops.Square() + self.normal = ops.StandardNormal() + self.cast = Cast() + + def init_by_param(self, param): + self.param = param + self.mean, self.log_std = ops.split(self.param, axis=-1, output_num=2) + self.std = ops.exp(self.log_std) + + def flatparam(self): + return self.param + + def sample_dtype(self): + return ms.float32 + + def log_prob(self, x, mean, sd=None): + if sd is not None: + log_sd = self.log(sd) + neglog_prob = 0.5 * self.log(2.0 * np.pi) * self.cast((self.shape(x)[-1]), ms.float32) + \ + 0.5 * self.reduce_sum(self.square((x - mean) / sd), axis=-1) + \ + self.reduce_sum(log_sd, axis=-1) + else: + neglog_prob = 0.5 * self.log(2.0 * np.pi) * self.cast((self.shape( + x)[-1]), ms.float32) + 0.5 * self.reduce_sum(self.square((x - mean) / sd), axis=-1) + return -neglog_prob + + def mode(self): + return self.mean + + def entropy(self, mean, sd=None): + if sd is not None: + log_sd = self.log(sd) + return self.reduce_sum( + log_sd + 0.5 * (self.log(2.0 * np.pi) + 1.0), axis=-1) + return 0.5 * (self.log(2.0 * np.pi) + 1.0) + + def kl(self, other): + assert isinstance( + other, DiagGaussianDist), 'Distribution type not match.' 
+ reduce_sum = ReduceSum(keep_dims=True) + return reduce_sum((self.square(self.std) + + self.square(self.mean - other.mean)) / + (2.0 * self.square(other.std)) + + other.log_std - self.log_std - 0.5, axis=-1) + + def sample(self, mean, sd=None): + if sd is not None: + return mean + sd * self.normal(self.shape(mean), dtype=ms.float32) + return mean + self.normal(self.shape(mean), dtype=ms.float32) + + +class CategoricalDist(ActionDist): + + def __init__(self, size): + self.size = size + self.oneHot = ops.OneHot() + self.softmax_cross = ops.SoftmaxCrossEntropyWithLogits() + self.reduce_max = ReduceMax(keep_dims=True) + self.reduce_sum = ReduceSum(keep_dims=True) + self.exp = ops.Exp() + self.log = ops.Log() + self.expand_dims = ops.ExpandDims() + self.random_categorical = ops.RandomCategorical(dtype=ms.int64) + self.on_value, self.off_value = Tensor( + 1.0, ms.float32), Tensor(0.0, ms.float32) + self.new_dist = msd.Categorical(seed =0,dtype=ms.int32) + self.softmax = ops.Softmax() + + + def init_by_param(self, logits): + self.logits = logits + + def flatparam(self): + return self.logits + + def sample_dtype(self): + return ms.int32 + + def log_prob(self, x, logits): + x = self.oneHot(x, self.size, self.on_value, self.off_value) + if x.dtype != logits.dtype: + logits = logits.astype(ms.float16) + x = x.astype(ms.float16) + loss, _ = self.softmax_cross(logits, x) + return -self.expand_dims(loss, -1) + + def entropy(self, logits): + rescaled_logits = logits - self.reduce_max(logits, -1) + exp_logits = self.exp(rescaled_logits) + + z = self.reduce_sum(exp_logits, -1) + p = exp_logits / z + return self.reduce_sum(p * (self.log(z) - rescaled_logits), -1) + + def kl(self, other): + assert isinstance( + other, CategoricalDist), 'Distribution type not match.' + reduce_max = ReduceMax(keep_dims=True) + reduce_sum = ReduceSum(keep_dims=True) + rescaled_logits_self = self.logits - reduce_max(self.logits, axis=-1) + rescaled_logits_other = other.logits - \ + reduce_max(other.logits, axis=-1) + exp_logits_self = self.exp(rescaled_logits_self) + exp_logits_other = self.exp(rescaled_logits_other) + z_self = reduce_sum(exp_logits_self, axis=-1) + z_other = reduce_sum(exp_logits_other, axis=-1) + p = exp_logits_self / z_self + return reduce_sum(p * + (rescaled_logits_self - + self.log(z_self) - + rescaled_logits_other + + self.log(z_other)), axis=-1) + + def sample(self, logits): + prob = self.softmax(logits) + samples = self.new_dist.sample((), prob) + return samples + + + +def make_dist(ac_type, ac_dim): + if ac_type == 'Categorical': + return CategoricalDist(ac_dim) + elif ac_type == 'DiagGaussian': + return DiagGaussianDist(ac_dim) + else: + raise NotImplementedError diff --git a/xt/model/ms_utils.py b/xt/model/ms_utils.py new file mode 100644 index 0000000..dc01f26 --- /dev/null +++ b/xt/model/ms_utils.py @@ -0,0 +1,68 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +"""Create tf utils for assign weights between learner and actor\ + and model utils for universal usage.""" + +import numpy as np +from mindspore import nn +import mindspore as ms +import copy +from collections import OrderedDict + + +class MSVariables: + def __init__(self, net: nn.Cell) -> None: + self.net = net + + def get_weights(self) -> OrderedDict: + _weights = OrderedDict((par_name, par.data.asnumpy()) + for par_name, par in + self.net.parameters_and_names()) + return _weights + + def save_weights(self, save_name: str): + _weights = OrderedDict((par_name, par.data.asnumpy()) + for par_name, par in + self.net.parameters_and_names()) + np.savez(save_name, **_weights) + + def set_weights(self, to_weights): + for _, param in self.net.parameters_and_names(): + if param.name in to_weights: + new_param_data = ms.Tensor( + copy.deepcopy(to_weights[param.name])) + param.set_data(new_param_data, param.sliced) + return + def read_weights(weight_file: str): + """Read weights with numpy.npz""" + np_file = np.load(weight_file) + return OrderedDict(**np_file) + + def set_weights_with_npz(self, npz_file: str): + """Set weight with numpy file.""" + weights = self.read_weights(npz_file) + self.set_weights(weights) + + def save_weight_with_checkpoint(self, filename: str): + ms.save_checkpoint(self._weights, filename) + + def load_weight_with_checkpoint(self, filename: str): + param_dict = ms.load_checkpoint(filename, self.net) + param_not_load = ms.load_param_into_net(self.net, param_dict) diff --git a/xt/model/muzero/muzero_cnn_ms.py b/xt/model/muzero/muzero_cnn_ms.py new file mode 100644 index 0000000..85c017c --- /dev/null +++ b/xt/model/muzero/muzero_cnn_ms.py @@ -0,0 +1,126 @@ +from xt.model.ms_compat import ms, Dense, Conv2d, Flatten, ReLU, Cell +from xt.model.muzero.muzero_model_ms import MuzeroModelMS +from xt.model.muzero.default_config import HIDDEN_OUT +from zeus.common.util.common import import_config +from zeus.common.util.register import Registers + +# pylint: disable=W0201 + + +@Registers.model +class MuzeroCnnMS(MuzeroModelMS): + """Docstring for ActorNetwork.""" + + def __init__(self, model_info): + model_config = model_info.get('model_config', None) + import_config(globals(), model_config) + + super().__init__(model_info) + + def create_rep_network(self): + return RepNet(self.state_dim) + + def create_policy_network(self): + return PolicyNet(self.value_support_size, self.action_dim) + + def create_dyn_network(self): + return DynNet(self.action_dim, self.reward_support_size) + + +class RepNet(Cell): + def __init__(self, state_dim): + super().__init__() + self.convlayer1 = Conv2d(state_dim[-1], + 32, + (8, + 8), + stride=(4, + 4), + pad_mode="valid", + has_bias=True, + weight_init="XavierUniform") + self.convlayer2 = Conv2d(32, 32, (4, 4), stride=(2, 2), + pad_mode="valid", has_bias=True, + weight_init="XavierUniform") + self.convlayer3 = Conv2d(32, 64, (3, 3), stride=(1, 1), + pad_mode="valid", has_bias=True, + weight_init="XavierUniform") + self.relu = ReLU() + self.flattenlayer = Flatten() + dim = ( + 
(((state_dim[0] - 4) // 4 - 2) // 2 - 2) + * (((state_dim[1] - 4) // 4 - 2) // 2 - 2) + * 64 + ) + self.denselayer = Dense( + dim, + HIDDEN_OUT, + activation="relu", + weight_init="XavierUniform") + + def construct(self, x: ms.Tensor): + out = x.transpose((0, 3, 1, 2)).astype("float32") / 255. + out = self.convlayer1(out) + out = self.relu(out) + out = self.convlayer2(out) + out = self.relu(out) + out = self.convlayer3(out) + out = self.relu(out) + out = self.flattenlayer(out) + out = self.denselayer(out) + return out + + +class PolicyNet(Cell): + def __init__(self, value_support_size, action_dim): + super().__init__() + self.hidden = Dense( + HIDDEN_OUT, + 128, + activation="relu", + weight_init="XavierUniform") + self.out_v = Dense( + 128, + value_support_size, + activation="softmax", + weight_init="XavierUniform") + self.out_p = Dense( + 128, + action_dim, + activation="softmax", + weight_init="XavierUniform") + + def construct(self, x): + hidden = self.hidden(x) + out_v = self.out_v(hidden) + out_p = self.out_p(hidden) + return out_p, out_v + + +class DynNet(Cell): + def __init__(self, action_dim, reward_support_size): + super().__init__() + self.hidden1 = Dense( + HIDDEN_OUT + action_dim, + 256, + activation="relu", + weight_init="XavierUniform") + self.hidden2 = Dense(256, 128, activation="relu", + weight_init="XavierUniform") + self.out_h = Dense( + 128, + HIDDEN_OUT, + activation="relu", + weight_init="XavierUniform") + self.out_r = Dense( + 128, + reward_support_size, + activation="softmax", + weight_init="XavierUniform") + + def construct(self, x): + hidden = self.hidden1(x) + hidden = self.hidden2(hidden) + out_h = self.out_h(hidden) + out_r = self.out_r(hidden) + return out_h, out_r diff --git a/xt/model/muzero/muzero_mlp_ms.py b/xt/model/muzero/muzero_mlp_ms.py new file mode 100644 index 0000000..aaa68aa --- /dev/null +++ b/xt/model/muzero/muzero_mlp_ms.py @@ -0,0 +1,99 @@ +from mindspore import nn +from mindspore.nn import Dense +from xt.model.muzero.muzero_model_ms import MuzeroModelMS +from xt.model.muzero.default_config import HIDDEN1_UNITS, HIDDEN2_UNITS +from zeus.common.util.common import import_config +from zeus.common.util.register import Registers + +# pylint: disable=W0201 + + +@Registers.model +class MuzeroMlpMS(MuzeroModelMS): + """Docstring for ActorNetwork.""" + + def __init__(self, model_info): + model_config = model_info.get('model_config', None) + import_config(globals(), model_config) + + super().__init__(model_info) + + def create_rep_network(self): + return RepNet(self.state_dim) + + def create_policy_network(self): + return PolicyNet(self.value_support_size, self.action_dim) + + def create_dyn_network(self): + return DynNet(self.action_dim, self.reward_support_size) + + +class RepNet(nn.Cell): + def __init__(self, state_dim): + super().__init__() + self.hidden = Dense(state_dim[-1], + HIDDEN1_UNITS, + activation="relu", + weight_init="XavierUniform") + self.out_rep = Dense( + HIDDEN1_UNITS, + HIDDEN2_UNITS, + activation="relu", + weight_init="XavierUniform") + + def construct(self, x): + out = self.hidden(x) + out = self.out_rep(out) + return out + + +class PolicyNet(nn.Cell): + def __init__(self, value_support_size, action_dim): + super().__init__() + self.hidden = Dense( + HIDDEN2_UNITS, + HIDDEN1_UNITS, + activation="relu", + weight_init="XavierUniform") + self.out_v = Dense( + HIDDEN1_UNITS, + value_support_size, + activation="softmax", + weight_init="XavierUniform") + self.out_p = Dense( + HIDDEN1_UNITS, + action_dim, + activation="softmax", + 
weight_init="XavierUniform") + + def construct(self, x): + hidden = self.hidden(x) + out_v = self.out_v(hidden) + out_p = self.out_p(hidden) + return out_p, out_v + + +class DynNet(nn.Cell): + def __init__(self, action_dim, reward_support_size): + super().__init__() + self.hidden = Dense( + HIDDEN2_UNITS + action_dim, + HIDDEN1_UNITS, + activation="relu", + weight_init="XavierUniform") + self.out_h = Dense( + HIDDEN1_UNITS, + HIDDEN2_UNITS, + activation="relu", + weight_init="XavierUniform") + self.out_r = Dense( + HIDDEN1_UNITS, + reward_support_size, + activation="softmax", + weight_init="XavierUniform") + + def construct(self, x): + hidden = self.hidden(x) + out_h = self.out_h(hidden) + out_r = self.out_r(hidden) + return out_h, out_r diff --git a/xt/model/muzero/muzero_model_ms.py b/xt/model/muzero/muzero_model_ms.py new file mode 100644 index 0000000..f40d988 --- /dev/null +++ b/xt/model/muzero/muzero_model_ms.py @@ -0,0 +1,370 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
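+# Editorial overview (hedged summary of the module below): MuzeroModelMS wires
+# three sub-networks together -- representation h (create_rep_network),
+# policy/value prediction f (create_policy_network) and dynamics g
+# (create_dyn_network) -- and exposes them through two compiled cells:
+#
+#     initial_inference(obs)              -> value, policy, hidden         (h, then f)
+#     recurrent_inference(hidden, action) -> value, reward, policy, hidden (g, then f)
+#
+# The agent-side MCTS is expected to call the first once at the search root
+# and the second at every simulated transition.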
+ +import os +import copy +import typing +import math +import numpy as np + +from collections import OrderedDict +from typing import List +from mindspore import nn, ops, ParameterTuple +from xt.model.ms_compat import ms, Tensor, Adam, Cell, TrainOneStepCell, FixedLossScaleUpdateCell +from xt.model.model_ms import XTModel_MS, check_keep_model +from xt.model.muzero.default_config import LR, td_step +from xt.model.muzero.muzero_utils_ms import value_compression_ms,\ + value_decompression_ms, cross_entropy_ms, scale_gradient_ms +from zeus.common.util.common import import_config +from zeus.common.util.register import Registers +from mindspore import set_context +from xt.model.dqn.dqn_cnn_ms import MyTrainOneStepCell +set_context(runtime_num_threads=3, mode=0,device_id=0) +from mindspore import amp +# pylint: disable=W0201 + +@Registers.model +class MuzeroModelMS(XTModel_MS): + """Docstring for ActorNetwork.""" + + class InitInferNet(Cell): + def __init__(self, representation_network, policy_network): + super(MuzeroModelMS.InitInferNet, self).__init__() + self.representation_network = representation_network + self.policy_network = policy_network + + def construct(self, obs): + hidden = self.representation_network(obs) + policy, value = self.policy_network(hidden) + return value, policy, hidden + + class RecurInferNet(Cell): + def __init__(self, dynamic_network, policy_network): + super(MuzeroModelMS.RecurInferNet, self).__init__() + self.dynamic_network = dynamic_network + self.policy_network = policy_network + + def construct(self, conditioned_hidden): + hidden, reward = self.dynamic_network(conditioned_hidden) + policy, value = self.policy_network(hidden) + return value, reward, policy, hidden + + def __init__(self, model_info): + model_config = model_info.get('model_config', None) + import_config(globals(), model_config) + self.state_dim = model_info['state_dim'] + self.action_dim = model_info['action_dim'] + self.reward_min = model_config.get('reward_min', -300) + self.reward_max = model_config.get('reward_max', 300) + self.reward_support_size = math.ceil( + value_compression_ms( + self.reward_max - self.reward_min)) + 1 + self.value_min = model_config.get('value_min', 0) + self.value_max = model_config.get('value_max', 60000) + self.value_support_size = math.ceil( + value_compression_ms( + self.value_max - self.value_min)) + 1 + self.obs_type = model_config.get('obs_type', 'float32') + self.td_step = td_step + self.weight_decay = 1e-4 + self.representation_network = self.create_rep_network() + self.policy_network = self.create_policy_network() + self.dynamic_network = self.create_dyn_network() + super().__init__(model_info) + self.trainable_parameter = self.model.trainable_params() + self.net_with_loss = NetWithLoss( + self.model, + self.model.rnet, + self.model.pnet, + self.model.dnet, + td_step, + self.action_dim, + self.weight_decay) + self.adam = Adam(params=self.trainable_parameter, learning_rate=LR) + self.init_infer_net = self.InitInferNet( + self.model.rnet, self.model.pnet) + self.recur_infer_net = self.RecurInferNet( + self.model.dnet, self.model.pnet) + device_target = ms.get_context("device_target") + if device_target == 'Ascend': + manager = FixedLossScaleUpdateCell(loss_scale_value=2**14) + self.net_with_loss = amp.auto_mixed_precision(self.net_with_loss, "O2") + self.train_net = MyTrainOneStepCell(self.net_with_loss, self.adam, manager) + elif device_target == "GPU" or device_target == "CPU" : + self.train_net = myTrainOneStepCell(self.net_with_loss, optimizer=self.adam) + 
else: + raise Exception("Target error, GPU or Ascend is supported.") + super(MuzeroModelMS, self).__init__(model_info) + self.recur_infer_net.compile(ms.Tensor(np.zeros((1, 260))).astype(ms.float32)) + self.init_infer_net.compile(ms.Tensor(np.zeros((1, 84, 84, 4))).astype(ms.float32)) + + def create_model(self, model_info): + self.full_model = MuzeroBaseMS(self.representation_network, + self.dynamic_network, + self.policy_network) + + return self.full_model + + def initial_inference(self, input_data): + obs = Tensor.from_numpy(input_data) + value, policy, hidden = self.init_infer_net(obs) + hidden = hidden.asnumpy() + policy = policy.asnumpy() + value = value.asnumpy() + value = self.value_transform( + value[0], + self.value_support_size, + self.value_min, + self.value_max) + return NetworkOutput(value, 0, policy[0], hidden[0]) + + """这里的变量使用还要考虑一下""" + + def recurrent_inference(self, hidden_state, action): + action = np.expand_dims(np.eye(self.action_dim)[action], 0) + hidden_state = np.expand_dims(hidden_state, 0) + conditioned_hidden = np.hstack((hidden_state, action)) + conditioned_hidden = Tensor(conditioned_hidden, ms.float32) + value, reward, policy, hidden = self.recur_infer_net( + conditioned_hidden) + + hidden = hidden.asnumpy() + reward = reward.asnumpy() + policy = policy.asnumpy() + value = value.asnumpy() + value = self.value_transform( + value[0], + self.value_support_size, + self.value_min, + self.value_max) + reward = self.value_transform( + reward[0], + self.reward_support_size, + self.reward_min, + self.reward_max) + return NetworkOutput(value, reward, policy[0], hidden[0]) + + def train(self, state, label): + target_value = self.conver_value( + label[0], + self.value_support_size, + self.value_min, + self.value_max) + target_reward = self.conver_value( + label[1], + self.reward_support_size, + self.reward_min, + self.reward_max) + obs = Tensor.from_numpy(state[0]) + action = Tensor.from_numpy(state[1]) + loss_weights = Tensor.from_numpy(state[2]).astype(ms.float32) + target_value = Tensor.from_numpy(target_value).astype(ms.float32) + target_reward = Tensor.from_numpy(target_reward).astype(ms.float32) + target_policy = Tensor.from_numpy(label[2]).astype(ms.float32) + loss = self.train_net( + obs, + action, + loss_weights, + target_value, + target_reward, + target_policy).asnumpy() + return np.mean(loss) + + def get_weights(self): + """return the weights of the model""" + _weights = OrderedDict([(par_name, par.data.asnumpy()) + for par_name, par in + self.model.parameters_and_names()]) + return _weights + + def set_weights(self, weights): + """set the new weights""" + for _, param in self.model.parameters_and_names(): + if param.name in weights: + new_param_data = Tensor.from_numpy(copy.deepcopy(weights[param.name])) + param.set_data(new_param_data, param.sliced) + + def save_model(self, file_name): + """save weights into .h5 file""" + # check max model file to keep + check_keep_model(os.path.dirname(file_name), self.max_to_keep) + _weights = OrderedDict([(par_name, par.data.asnumpy()) + for par_name, par in + self.model.parameters_and_names()]) + np.savez(file_name + ".h5", **_weights) + if self.model_format == 'pb': + pb_model(self.model, file_name) + return file_name + ".h5" + + def load_model(self, model_name, by_name=False): + np_file = np.load(model_name) + weights = OrderedDict(**np_file) + self.set_weights(weights) + + def conver_value(self, target_value, support_size, min, max): + # MSE in board games, cross entropy between categorical values in + # Atari. 
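+        # (Editorial comment) The scalar targets are squashed with
+        # value_compression_ms and then spread over the two nearest support
+        # atoms ("two-hot" encoding): a compressed value of 3.7 puts weight
+        # 0.3 on index 3 and 0.7 on index 4, so a categorical cross-entropy
+        # can regress a continuous target.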
+ targets = np.zeros(target_value.shape[0:2] + (support_size,)) + target_value = np.clip(target_value, min, max) - min + batch_size = target_value.shape[0] + td_size = target_value.shape[1] + + for i in range(batch_size): + value = value_compression_ms(target_value[i]) + floor_value = np.floor(value).astype(int) + rest = value - floor_value + + index = floor_value.astype(int) + targets[i, range(td_size), index] = 1 - rest + targets[i, range(td_size), index + 1] = rest + + return targets + + def value_transform(self, value_support, support_size, min, max): + """ + The value is obtained by first computing the expected value + from the discrete support. + Second, the inverse transform is then apply (the square function). + """ + value = np.dot(value_support, range(0, support_size)) + value = value_decompression_ms(value) + min + value = np.clip(value, min, max) + return np.asscalar(value) + + def value_inference(self, input_data): + obs = Tensor.from_numpy(input_data) + value, _, _ = self.init_infer_net(obs) + value = value.asnumpy() + value_list = [] + for value_data in value: + value_list.append( + self.value_transform( + value_data, + self.value_support_size, + self.value_min, + self.value_max)) + return np.asarray(value_list) + + +class myTrainOneStepCell(TrainOneStepCell): + def __init__(self, network, optimizer): + super(myTrainOneStepCell, self).__init__(network, optimizer) + self.depend = ops.Depend() + self.network = network + self.grad_fn = ops.value_and_grad( + self.network, grad_position=None, weights=self.weights) + + def construct(self, *inputs): + loss, grads = self.grad_fn(*inputs) + grads = self.grad_reducer(grads) + loss = self.depend(loss, self.optimizer(grads)) + return loss + + +class NetWithLoss(nn.Cell): + def __init__( + self, + full_model, + representation_network, + policy_network, + dynamic_network, + td_step, + action_dim, + weight_decay): + super(NetWithLoss, self).__init__(auto_prefix=False) + self.full_model = full_model + self.representation_network = representation_network + self.policy_network = policy_network + self.dynamic_network = dynamic_network + self.params = list(self.full_model.parameters_and_names()) + self.on_value, self.off_value = Tensor(1.0, ms.float32),\ + Tensor(0.0, ms.float32) + self.td_step = td_step + self.action_dim = action_dim + self.weight_decay = weight_decay + self.l2_loss = ops.L2Loss() + self.one_hot = ops.OneHot() + self.reshape = ops.Reshape() + self.concat = ops.Concat(-1) + + def construct( + self, + obs, + action, + loss_weights, + target_value, + target_reward, + target_policy): + hidden_state = self.representation_network(obs) + policy_logits, value = self.policy_network(hidden_state) + loss = cross_entropy_ms( + policy_logits, target_policy[:, 0], loss_weights) + loss += cross_entropy_ms(value, target_value[:, 0], loss_weights) + gradient_scale = 1.0 / self.td_step + for i in range(self.td_step): + action_change = self.one_hot( + action[:, i], self.action_dim, self.on_value, self.off_value) + action_change = self.reshape(action_change, (-1, self.action_dim,)) + conditioned_state = self.concat((hidden_state, action_change)) + hidden_state, reward = self.dynamic_network(conditioned_state) + policy_logits, value = self.policy_network(hidden_state) + hidden_state = scale_gradient_ms(hidden_state, 0.5) + l = cross_entropy_ms(reward, target_reward[:, i], loss_weights) + l += cross_entropy_ms(policy_logits, + target_policy[:, i + 1], loss_weights) + l += cross_entropy_ms(value, target_value[:, i + 1], loss_weights) + loss += 
scale_gradient_ms(l, gradient_scale) + + for _, param in self.params: + loss += self.weight_decay * self.l2_loss(param) + return loss + + +class NetworkOutput(typing.NamedTuple): + value: float + reward: float + policy: List[int] + hidden_state: List[float] + + +class MuzeroBaseMS(Cell): + """Model that combine the representation and prediction + (value+policy) network. + """ + + def __init__( + self, + representation_network: Cell, + dynamic_network: Cell, + policy_network: Cell): + super().__init__() + self.representation_network = representation_network + self.dynamic_network = dynamic_network + self.policy_network = policy_network + + @property + def rnet(self): + return self.representation_network + @property + def dnet(self): + return self.dynamic_network + @property + def pnet(self): + return self.policy_network diff --git a/xt/model/muzero/muzero_utils_ms.py b/xt/model/muzero/muzero_utils_ms.py new file mode 100644 index 0000000..aad48c9 --- /dev/null +++ b/xt/model/muzero/muzero_utils_ms.py @@ -0,0 +1,37 @@ +import numpy as np +from mindspore import ops +from xt.model.ms_compat import ReduceMax, ReduceMin, ReduceMean + + +def scale_gradient_ms(tensor, scale): + """Scales the gradient for the backward pass.""" + return tensor * scale + ops.stop_gradient(tensor) * (1 - scale) + + +def hidden_normlize_ms(hidden): + reduce_max = ReduceMax(keep_dims=True) + reduce_min = ReduceMin(keep_dims=True) + hidden_max = reduce_max(hidden, -1) + hidden_min = reduce_min(hidden, -1) + hidden_norm = (hidden - hidden_min) / (hidden_max - hidden_min + 1e-10) + return hidden_norm + + +def cross_entropy_ms(pred_p, target_p, loss_weights): + log = ops.Log() + reduce_mean = ReduceMean(keep_dims=True) + _cross_entropy = reduce_mean(-target_p * log(pred_p + 1e-10), -1) + return reduce_mean(_cross_entropy * 1.0) + + +def value_compression_ms(value): + return np.sign(value) * (np.sqrt(np.abs(value) + 1) - 1) + 0.001 * value + + +def value_decompression_ms(value): + return np.sign(value) * ( + ( + (np.sqrt(1 + 4 * 0.001 * (np.abs(value) + 1 + 0.001)) - 1) + / (2 * 0.001) + ) ** 2 - 1 + ) diff --git a/xt/model/ppo/ppo_cnn_ms.py b/xt/model/ppo/ppo_cnn_ms.py new file mode 100644 index 0000000..a3810d0 --- /dev/null +++ b/xt/model/ppo/ppo_cnn_ms.py @@ -0,0 +1,61 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
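+# Hypothetical model_info sketch (editorial; the keys are the ones read by
+# PpoCnnMS and its PPOMS base class, the values are only illustrative):
+#
+#     model_info = {
+#         'state_dim': [84, 84, 4],
+#         'action_dim': 4,
+#         'input_dtype': 'uint8',
+#         'model_config': {'action_type': 'Categorical',
+#                          'VF_SHARE_LAYERS': True,
+#                          'hidden_sizes': [512],
+#                          'activation': 'relu'},
+#     }
+#     model = PpoCnnMS(model_info)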
+ +from xt.model.model_utils_ms import ACTIVATION_MAP_MS, get_cnn_backbone_ms,\ + get_cnn_default_settings_ms, get_default_filters_ms +from xt.model.ppo.default_config import CNN_SHARE_LAYERS +from xt.model.ppo.ppo_ms import PPOMS +from zeus.common.util.register import Registers +from xt.model.ms_utils import MSVariables + + +@Registers.model +class PpoCnnMS(PPOMS): + """Build PPO CNN network.""" + + def __init__(self, model_info): + model_config = model_info.get('model_config') + + self.vf_share_layers = model_config.get( + 'VF_SHARE_LAYERS', CNN_SHARE_LAYERS) + self.hidden_sizes = model_config.get( + 'hidden_sizes', get_cnn_default_settings_ms('hidden_sizes')) + activation = model_config.get( + 'activation', get_cnn_default_settings_ms('activation')) + try: + self.activation = ACTIVATION_MAP_MS[activation] + except KeyError: + raise KeyError('activation {} not implemented.'.format(activation)) + + super().__init__(model_info) + + def create_model(self, model_info): + filter_arches = get_default_filters_ms(self.state_dim) + net = get_cnn_backbone_ms( + self.state_dim, + self.action_dim, + self.hidden_sizes, + self.activation, + filter_arches, + self.vf_share_layers, + self.verbose, + dtype=self.input_dtype) + self.actor_var = MSVariables(net) + return net diff --git a/xt/model/ppo/ppo_mlp_ms.py b/xt/model/ppo/ppo_mlp_ms.py new file mode 100644 index 0000000..5850908 --- /dev/null +++ b/xt/model/ppo/ppo_mlp_ms.py @@ -0,0 +1,59 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
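+# Hedged usage sketch (editorial): PpoMlpMS differs from the CNN variant only
+# in the backbone it creates; the same backbone can be built directly for a
+# quick shape check (the 4-dim observation / 2 actions below are illustrative):
+#
+#     from xt.model.model_utils_ms import get_mlp_backbone_ms
+#     net = get_mlp_backbone_ms([4], 2, hidden_sizes=[64, 64],
+#                               activation='tanh', vf_share_layers=False)
+#     # construct() on the returned cell yields [pi_latent, out_value]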
+ +from xt.model.model_utils_ms import ACTIVATION_MAP_MS,\ + get_mlp_backbone_ms, get_mlp_default_settings_ms +from xt.model.ppo.default_config import MLP_SHARE_LAYERS +from xt.model.ppo.ppo_ms import PPOMS +from zeus.common.util.register import Registers +from xt.model.ms_utils import MSVariables + + +@Registers.model +class PpoMlpMS(PPOMS): + """Build PPO MLP network.""" + + def __init__(self, model_info): + model_config = model_info.get('model_config') + + self.vf_share_layers = model_config.get( + 'VF_SHARE_LAYERS', MLP_SHARE_LAYERS) + self.hidden_sizes = model_config.get( + 'hidden_sizes', get_mlp_default_settings_ms('hidden_sizes')) + activation = model_config.get( + 'activation', get_mlp_default_settings_ms('activation')) + try: + self.activation = ACTIVATION_MAP_MS[activation] + except KeyError: + raise KeyError('activation {} not implemented.'.format(activation)) + + super().__init__(model_info) + + def create_model(self, model_info): + net = get_mlp_backbone_ms( + self.state_dim, + self.action_dim, + self.hidden_sizes, + self.activation, + self.vf_share_layers, + self.verbose, + dtype=self.input_dtype) + self.actor_var = MSVariables(net) + return net diff --git a/xt/model/ppo/ppo_ms.py b/xt/model/ppo/ppo_ms.py new file mode 100644 index 0000000..72a2690 --- /dev/null +++ b/xt/model/ppo/ppo_ms.py @@ -0,0 +1,181 @@ +# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
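+# Editorial note, derived from the WithLossCell defined at the bottom of this
+# file: the objective being minimized is the standard PPO loss
+#
+#     ratio = exp(logp - logp_old)
+#     L_pi  = -mean(min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A))
+#     L_v   = 0.5 * mean(max((v - v_t)^2, (clip(v, v_old - c, v_old + c) - v_t)^2))
+#     loss  = L_pi - ENTROPY_LOSS * H + CRITIC_LOSS_COEF * L_v
+#
+# with eps = LOSS_CLIPPING and c = VF_CLIP taken from model_config (or the
+# defaults in ppo/default_config.py).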
+ +import numpy as np +from xt.model.ppo.default_config import LR, BATCH_SIZE, CRITIC_LOSS_COEF,\ + ENTROPY_LOSS, LOSS_CLIPPING, MAX_GRAD_NORM, NUM_SGD_ITER, SUMMARY, VF_CLIP +from xt.model.ms_dist import make_dist +from zeus.common.util.common import import_config +from zeus.common.util.register import Registers +from xt.model.ms_compat import Cell, TrainOneStepCell, LossBase, ReduceMean, ReduceSum, Tensor, Adam +from xt.model.ms_compat import Depend, value_and_grad, clip_by_global_norm, Minimum, Maximum, Exp, Square, clip_by_value, DynamicLossScaleUpdateCell, FixedLossScaleUpdateCell +from xt.model.model_ms import XTModel_MS +from xt.model.ms_utils import MSVariables +import mindspore as ms +from xt.model.dqn.dqn_cnn_ms import MyTrainOneStepCell +from mindspore.train import Model + +from mindspore import amp +ms.set_context(runtime_num_threads=5,mode =0,device_id=0) + +@Registers.model +class PPOMS(XTModel_MS): + + class PPOPredictPolicy(Cell): + def __init__(self, net, dist): + super(PPOMS.PPOPredictPolicy, self).__init__(auto_prefix=False) + self.network = net + self.dist = dist + + def construct(self, state): + pi_latent, v_out = self.network(state) + action = self.dist.sample(pi_latent) + logp = self.dist.log_prob(action, pi_latent) + return action, logp, v_out + + def __init__(self, model_info): + model_config = model_info.get('model_config') + import_config(globals(), model_config) + # fixme: could read action_dim&obs_dim from env.info + self.state_dim = model_info['state_dim'] + self.action_dim = model_info['action_dim'] + self.input_dtype = model_info.get('input_dtype', 'float32') + self.action_type = model_config.get('action_type') + self._lr = model_config.get('LR', LR) + self._batch_size = model_config.get('BATCH_SIZE', BATCH_SIZE) + self.critic_loss_coef = model_config.get( + 'CRITIC_LOSS_COEF', CRITIC_LOSS_COEF) + self.ent_coef = Tensor(model_config.get('ENTROPY_LOSS', ENTROPY_LOSS)) + self.clip_ratio = Tensor(model_config.get( + 'LOSS_CLIPPING', LOSS_CLIPPING)) + self._max_grad_norm = model_config.get('MAX_GRAD_NORM', MAX_GRAD_NORM) + self.num_sgd_iter = model_config.get('NUM_SGD_ITER', NUM_SGD_ITER) + self.verbose = model_config.get('SUMMARY', SUMMARY) + self.vf_clip = Tensor(model_config.get('VF_CLIP', VF_CLIP)) + self.dist = make_dist(self.action_type, self.action_dim) + self.amsgrad = model_config.get('USE_AMSGRAD', False) + super().__init__(model_info) + self.predict_net = self.PPOPredictPolicy(self.model, self.dist) + adam = Adam(params=self.predict_net.trainable_params(), learning_rate=self._lr, use_amsgrad=True, use_locking=True) + loss_fn = WithLossCell(self.critic_loss_coef, self.clip_ratio, self.ent_coef, self.vf_clip) + forward_fn = NetWithLoss(self.model, loss_fn, self.dist) + device_target = ms.get_context("device_target") + if device_target == 'Ascend': + manager = FixedLossScaleUpdateCell(loss_scale_value=2**14) + forward_fn = amp.auto_mixed_precision(forward_fn, "O2") + self.train_net = MyTrainOneStepCell(forward_fn, adam, manager, grad_clip=True, clipnorm=self._max_grad_norm) + elif device_target == "GPU" or device_target == "CPU": + self.train_net = myTrainOneStepCell(forward_fn, optimizer=adam, max_grad_norm=self._max_grad_norm) + else: + raise Exception("Target error, GPU or Ascend is supported.") + self.predict_net.compile(ms.Tensor(np.zeros((1, 84, 84, 4))).astype(ms.float32)) + + def predict(self, state): + """Predict state.""" + state = Tensor.from_numpy(state) + action, logp, v_out = self.predict_net(state) + action = action.asnumpy() + logp = 
logp.asnumpy() + v_out = v_out.asnumpy() + return action, logp, v_out + + def train(self, state, label): + nbatch = state[0].shape[0] + inds = np.arange(nbatch) + loss_val = [] + for _ in range(self.num_sgd_iter): + np.random.shuffle(inds) + for start in range(0, nbatch, self._batch_size): + end = start + self._batch_size + mbinds = inds[start:end] + state_ph = Tensor.from_numpy(state[0][mbinds]) + behavior_action_ph = Tensor.from_numpy(label[0][mbinds]) + old_logp_ph = Tensor.from_numpy(label[1][mbinds]) + adv_ph = Tensor.from_numpy(label[2][mbinds]).astype(ms.float32) + old_v_ph = Tensor.from_numpy(label[3][mbinds]) + target_v_ph = Tensor.from_numpy(label[4][mbinds]).astype(ms.float32) + loss = self.train_net(state_ph, adv_ph, old_logp_ph, behavior_action_ph, target_v_ph, old_v_ph) + loss = loss.asnumpy() + loss_val.append(np.mean(loss)) + self.actor_var = MSVariables(self.model) + return np.mean(loss_val) + + +class myTrainOneStepCell(TrainOneStepCell): + def __init__(self, network, optimizer, max_grad_norm, sens=1.0): + super(myTrainOneStepCell, self).__init__(network, optimizer, sens) + self.sens = sens + self.depend = Depend() + self.max_grad_norm = max_grad_norm + self.grad_fn = value_and_grad(self.network, grad_position=None, weights=self.weights) + + def construct(self, *inputs): + loss, grads = self.grad_fn(*inputs) + grads = clip_by_global_norm(grads, self.max_grad_norm) + grads = self.grad_reducer(grads) + loss = self.depend(loss, self.optimizer(grads)) + return loss + + +class NetWithLoss(Cell): + def __init__(self, net, loss_fn, dist): + super(NetWithLoss, self).__init__(auto_prefix=False) + self.net = net + self._loss_fn = loss_fn + self.dist = dist + + def construct(self, state_ph, adv_ph, old_logp_ph, behavior_action, target_v, old_v_ph): + pi_latent, v_out = self.net(state_ph) + ent = self.dist.entropy(pi_latent) + action_log_prob = self.dist.log_prob(behavior_action, pi_latent) + loss = self._loss_fn(action_log_prob, ent, adv_ph, old_logp_ph, target_v, v_out, old_v_ph) + return loss + + +class WithLossCell(LossBase): + def __init__(self, critic_loss_coef, clip_ratio, ent_coef, val_clip): + super(WithLossCell, self).__init__() + self.reduce_mean = ReduceMean(keep_dims=True) + self.critic_loss_coef = critic_loss_coef + self.clip_ratio = clip_ratio + self.ent_coef = ent_coef + self.val_clip = val_clip + self.minimum = Minimum() + self.maximum = Maximum() + self.exp = Exp() + self.square = Square() + + def construct(self, action_log_prob, ent, adv, old_log_p, target_v, out_v, old_v): + ratio = self.exp(action_log_prob - old_log_p) + + surr_loss_1 = ratio * adv + surr_loss_2 = clip_by_value(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * adv + surr_loss = self.reduce_mean(self.minimum(surr_loss_1, surr_loss_2)) + ent = self.reduce_mean(ent) + + actor_loss = -surr_loss - self.ent_coef * ent + + vf_losses1 = self.square(out_v - target_v) + val_pred_clipped = old_v + clip_by_value(out_v - old_v, -self.val_clip, self.val_clip) + vf_losses2 = self.square(val_pred_clipped - target_v) + + critic_loss = 0.5 * self.reduce_mean(self.maximum(vf_losses1, vf_losses2)) + loss = actor_loss + self.critic_loss_coef * critic_loss + return loss
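+
+# ---------------------------------------------------------------------------
+# Reference sketch (editorial addition, not used by the training path): a
+# plain-NumPy restatement of the loss computed by WithLossCell above, which
+# can serve as an oracle when unit-testing the MindSpore cell on small arrays.
+# The function name and the default coefficients are illustrative assumptions,
+# not values shipped with the package; numpy is already imported as np above.
+def _ppo_loss_reference_np(logp, old_logp, adv, v_out, old_v, target_v, ent,
+                           clip_ratio=0.2, vf_clip=10.0,
+                           ent_coef=0.01, critic_coef=0.5):
+    """Clipped surrogate + clipped value loss + entropy bonus, in NumPy."""
+    ratio = np.exp(logp - old_logp)
+    surr = np.minimum(ratio * adv,
+                      np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv)
+    actor_loss = -surr.mean() - ent_coef * ent.mean()
+    v_clipped = old_v + np.clip(v_out - old_v, -vf_clip, vf_clip)
+    critic_loss = 0.5 * np.maximum((v_out - target_v) ** 2,
+                                   (v_clipped - target_v) ** 2).mean()
+    return actor_loss + critic_coef * critic_loss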