Merge branch 'main' into mgs28-char-rnn-update
svekars authored Aug 2, 2024
2 parents ecb97ef + f1c0b8a commit 8508a8b
Showing 24 changed files with 817 additions and 63 deletions.
10 changes: 5 additions & 5 deletions .ci/docker/requirements.txt
@@ -13,7 +13,7 @@ tqdm==4.66.1
numpy==1.24.4
matplotlib
librosa
torch==2.3
torch==2.4
torchvision
torchtext
torchdata
@@ -28,9 +28,9 @@ tensorboard
jinja2==3.1.3
pytorch-lightning
torchx
torchrl==0.3.0
tensordict==0.3.0
ax-platform
torchrl==0.5.0
tensordict==0.5.0
ax-platform>==0.4.0
nbformat>==5.9.2
datasets
transformers
@@ -68,4 +68,4 @@ pygame==2.1.2
pycocotools
semilearn==0.3.2
torchao==0.0.3
segment_anything==1.0
segment_anything==1.0
4 changes: 2 additions & 2 deletions .jenkins/build.sh
@@ -22,8 +22,8 @@ sudo apt-get install -y pandoc
#Install PyTorch Nightly for test.
# Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
# Install 2.4 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
pip uninstall -y torch torchvision torchaudio torchtext torchdata
pip3 install torch==2.4.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
# pip uninstall -y torch torchvision torchaudio torchtext torchdata
# pip3 install torch==2.4.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124

# Install two language tokenizers for Translation with TorchText tutorial
python -m spacy download en_core_web_sm
Binary file added _static/img/pinmem/pinmem.png
Binary file added _static/img/pinmem/trace_streamed0_pinned0.png
Binary file added _static/img/pinmem/trace_streamed0_pinned1.png
Binary file added _static/img/pinmem/trace_streamed1_pinned0.png
Binary file added _static/img/pinmem/trace_streamed1_pinned1.png
34 changes: 18 additions & 16 deletions advanced_source/coding_ddpg.py
@@ -182,7 +182,7 @@
# Later, we will see how the target parameters should be updated in TorchRL.
#

from tensordict.nn import TensorDictModule
from tensordict.nn import TensorDictModule, TensorDictSequential


def _init(
@@ -290,12 +290,11 @@ def _loss_actor(
) -> torch.Tensor:
td_copy = tensordict.select(*self.actor_in_keys)
# Get an action from the actor network: since we made it functional, we need to pass the params
td_copy = self.actor_network(td_copy, params=self.actor_network_params)
with self.actor_network_params.to_module(self.actor_network):
td_copy = self.actor_network(td_copy)
# get the value associated with that action
td_copy = self.value_network(
td_copy,
params=self.value_network_params.detach(),
)
with self.value_network_params.detach().to_module(self.value_network):
td_copy = self.value_network(td_copy)
return -td_copy.get("state_action_value")


@@ -317,7 +316,8 @@ def _loss_value(
td_copy = tensordict.clone()

# V(s, a)
self.value_network(td_copy, params=self.value_network_params)
with self.value_network_params.to_module(self.value_network):
self.value_network(td_copy)
pred_val = td_copy.get("state_action_value").squeeze(-1)

# we manually reconstruct the parameters of the actor-critic, where the first
@@ -332,9 +332,8 @@ def _loss_value(
batch_size=self.target_actor_network_params.batch_size,
device=self.target_actor_network_params.device,
)
target_value = self.value_estimator.value_estimate(
tensordict, target_params=target_params
).squeeze(-1)
with target_params.to_module(self.actor_critic):
target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)

# Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function`
loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function)
@@ -717,7 +716,7 @@ def get_env_stats():
ActorCriticWrapper,
DdpgMlpActor,
DdpgMlpQNet,
OrnsteinUhlenbeckProcessWrapper,
OrnsteinUhlenbeckProcessModule,
ProbabilisticActor,
TanhDelta,
ValueOperator,
@@ -776,15 +775,18 @@ def make_ddpg_actor(
# Exploration
# ~~~~~~~~~~~
#
# The policy is wrapped in a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessWrapper`
# The policy is passed into a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule`
# exploration module, as suggested in the original paper.
# Let's define the number of frames before OU noise reaches its minimum value
annealing_frames = 1_000_000

actor_model_explore = OrnsteinUhlenbeckProcessWrapper(
actor_model_explore = TensorDictSequential(
actor,
annealing_num_steps=annealing_frames,
).to(device)
OrnsteinUhlenbeckProcessModule(
spec=actor.spec.clone(),
annealing_num_steps=annealing_frames,
).to(device),
)
if device == torch.device("cpu"):
actor_model_explore.share_memory()

@@ -1168,7 +1170,7 @@ def ceil_div(x, y):
)

# update the exploration strategy
actor_model_explore.step(current_frames)
actor_model_explore[1].step(current_frames)

collector.shutdown()
del collector
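The functional-call pattern used in the updated loss methods above can be sketched in isolation. The snippet below is illustrative only (the "obs"/"value" keys and the linear layer are made up); it shows how parameters gathered into a TensorDict are temporarily swapped into a module with the `to_module` context manager, replacing the older `params=` keyword argument.

import torch
from torch import nn
from tensordict import TensorDict
from tensordict.nn import TensorDictModule

# Wrap a plain module so it reads/writes TensorDict keys (illustrative keys).
module = TensorDictModule(nn.Linear(3, 1), in_keys=["obs"], out_keys=["value"])
params = TensorDict.from_module(module)   # parameters gathered into a TensorDict
data = TensorDict({"obs": torch.randn(4, 3)}, batch_size=[4])

# Functional call: the parameters are swapped in for the duration of the block
# and restored on exit, mirroring `with self.value_network_params.to_module(...)` above.
with params.to_module(module):
    module(data)

print(data["value"].shape)  # torch.Size([4, 1])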
8 changes: 5 additions & 3 deletions beginner_source/introyt/tensors_deeper_tutorial.py
@@ -448,17 +448,19 @@
m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix

print('\nVectors & Matrices:')
print(torch.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1)
print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1)
print(m1)
m3 = torch.matmul(m1, m2)
m3 = torch.linalg.matmul(m1, m2)
print(m3) # 3 times m1
print(torch.svd(m3)) # singular value decomposition
print(torch.linalg.svd(m3)) # singular value decomposition


##################################################################################
# This is a small sample of operations. For more details and the full inventory of
# math functions, have a look at the
# `documentation <https://pytorch.org/docs/stable/torch.html#math-operations>`__.
# For more details and the full inventory of linear algebra operations, have a
# look at this `documentation <https://pytorch.org/docs/stable/linalg.html>`__.
#
# Altering Tensors in Place
# ~~~~~~~~~~~~~~~~~~~~~~~~~
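As an aside (not part of the diff): `torch.linalg.svd` returns the already-transposed factor `Vh`, unlike the deprecated `torch.svd`, which returned `V`. A minimal illustration:

import torch

m3 = torch.tensor([[3., 0.], [0., 3.]])
U, S, Vh = torch.linalg.svd(m3)   # note: Vh, not V as with the old torch.svd
print(U @ torch.diag(S) @ Vh)     # reconstructs m3 (up to floating-point error)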
2 changes: 1 addition & 1 deletion beginner_source/knowledge_distillation_tutorial.py
@@ -352,7 +352,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
# Cosine loss minimization run
# ----------------------------
# Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients.
# In neural networks, it is easy to include to include additional loss functions to the main objectives to achieve goals like better generalization.
# In neural networks, it is easy to include additional loss functions to the main objectives to achieve goals like better generalization.
# Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers.
# Our goal is to convey information from the teacher's representation to the student by including a naive loss function,
# whose minimization implies that the flattened vectors that are subsequently passed to the classifiers have become more *similar* as the loss decreases.
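The hidden-state objective described in the corrected sentence above can be sketched with `torch.nn.CosineEmbeddingLoss`. This fragment is illustrative, not the tutorial's code; the shapes are arbitrary stand-ins for flattened teacher/student hidden states.

import torch
import torch.nn as nn

cosine_loss = nn.CosineEmbeddingLoss()

teacher_hidden = torch.randn(8, 1024)                       # flattened teacher representation
student_hidden = torch.randn(8, 1024, requires_grad=True)   # flattened student representation
target = torch.ones(teacher_hidden.size(0))                 # target = 1: push cosine similarity up

loss = cosine_loss(student_hidden, teacher_hidden, target)
loss.backward()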
14 changes: 11 additions & 3 deletions en-wordlist.txt
@@ -1,3 +1,4 @@

ACL
ADI
AOT
@@ -50,6 +51,7 @@ DDP
DDPG
DDQN
DLRM
DMA
DNN
DQN
DataLoaders
@@ -68,6 +70,8 @@ Ecker
ExportDB
FC
FGSM
tensordict
DataLoader's
FLAVA
FSDP
FX
@@ -139,6 +143,7 @@ MKLDNN
MLP
MLPs
MNIST
MPS
MUC
MacBook
MacOS
@@ -222,6 +227,7 @@ STR
SVE
SciPy
Sequentials
Sharding
Sigmoid
SoTA
Sohn
@@ -257,6 +263,7 @@ VLDB
VQA
VS Code
ViT
Volterra
WMT
WSI
WSIs
@@ -339,11 +346,11 @@ dataset’s
deallocation
decompositions
decorrelated
devicemesh
deserialize
deserialized
desynchronization
deterministically
devicemesh
dimensionality
dir
discontiguous
@@ -388,6 +395,7 @@ hessian
hessians
histoencoder
histologically
homonymous
hotspot
hvp
hyperparameter
@@ -463,6 +471,7 @@ optimizer's
optimizers
otsu
overfitting
pageable
parallelizable
parallelization
parametrization
@@ -527,7 +536,6 @@ runtime
runtimes
scalable
sharded
Sharding
softmax
sparsified
sparsifier
@@ -616,4 +624,4 @@ warmstarting
warmup
webp
wsi
wsis
wsis
19 changes: 14 additions & 5 deletions index.rst
@@ -3,11 +3,12 @@ Welcome to PyTorch Tutorials

**What's new in PyTorch tutorials?**

* `Using User-Defined Triton Kernels with torch.compile <https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html>`__
* `Large Scale Transformer model training with Tensor Parallel (TP) <https://pytorch.org/tutorials/intermediate/TP_tutorial.html>`__
* `Accelerating BERT with semi-structured (2:4) sparsity <https://pytorch.org/tutorials/advanced/semi_structured_sparse.html>`__
* `torch.export Tutorial with torch.export.Dim <https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__
* `Extension points in nn.Module for load_state_dict and tensor subclasses <https://pytorch.org/tutorials/recipes/recipes/swap_tensors.html>`__
* `A guide on good usage of non_blocking and pin_memory() in PyTorch <https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html>`__
* `Introduction to Distributed Pipeline Parallelism <https://pytorch.org/tutorials/intermediate/pipelining_tutorial.html>`__
* `Introduction to Libuv TCPStore Backend <https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html>`__
* `Asynchronous Saving with Distributed Checkpoint (DCP) <https://pytorch.org/tutorials/recipes/distributed_async_checkpoint_recipe.html>`__
* `Python Custom Operators <https://pytorch.org/tutorials/advanced/python_custom_ops.html>`__
* Updated `Getting Started with DeviceMesh <https://pytorch.org/tutorials/recipes/distributed_device_mesh.html>`__

.. raw:: html

@@ -93,6 +94,13 @@ Welcome to PyTorch Tutorials
:link: intermediate/tensorboard_tutorial.html
:tags: Interpretability,Getting-Started,TensorBoard

.. customcarditem::
:header: Good usage of `non_blocking` and `pin_memory()` in PyTorch
:card_description: A guide on best practices to copy data from CPU to GPU.
:image: _static/img/pinmem.png
:link: intermediate/pinmem_nonblock.html
:tags: Getting-Started

.. Image/Video
.. customcarditem::
@@ -969,6 +977,7 @@ Additional Resources
beginner/pytorch_with_examples
beginner/nn_tutorial
intermediate/tensorboard_tutorial
intermediate/pinmem_nonblock

.. toctree::
:maxdepth: 2
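The new card above links to the pinned-memory tutorial; the pattern it covers boils down to the following sketch (illustrative sizes, and it assumes a CUDA device is available):

import torch

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, pin_memory=True)   # page-locked host allocation
    y = x.to("cuda", non_blocking=True)            # copy can overlap with host-side work
    torch.cuda.synchronize()                       # wait before relying on `y`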
2 changes: 1 addition & 1 deletion intermediate_source/FSDP_adavnced_tutorial.rst
@@ -502,7 +502,7 @@ layer class (holding MHSA and FFN).
model = FSDP(model,
fsdp_auto_wrap_policy=t5_auto_wrap_policy)
auto_wrap_policy=t5_auto_wrap_policy)
To see the wrapped model, you can easily print the model and visually inspect
the sharding and FSDP units as well.
14 changes: 7 additions & 7 deletions intermediate_source/FSDP_tutorial.rst
@@ -70,7 +70,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
1.2 Import necessary packages

.. note::
This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`.
This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy` and `fsdp_auto_wrap_policy` with `auto_wrap_policy`.

.. code-block:: python
@@ -308,7 +308,7 @@ We have recorded cuda events to measure the time of FSDP model specifics. The CU
CUDA event elapsed time on training loop 40.67462890625sec
Wrapping the model with FSDP, the model will look as follows, we can see the model has been wrapped in one FSDP unit.
Alternatively, we will look at adding the fsdp_auto_wrap_policy next and will discuss the differences.
Alternatively, we will look at adding the auto_wrap_policy next and will discuss the differences.

.. code-block:: bash
@@ -335,12 +335,12 @@ The following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarg

FSDP Peak Memory Usage

Applying *fsdp_auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency.
Applying *auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency.
The way it works is that, suppose your model contains 100 Linear layers. If you do FSDP(model), there will only be one FSDP unit which wraps the entire model.
In that case, the allgather would collect the full parameters for all 100 linear layers, and hence won't save CUDA memory for parameter sharding.
Also, there is only one blocking allgather call for the all 100 linear layers, there will not be communication and computation overlapping between layers.

To avoid that, you can pass in an fsdp_auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit).
To avoid that, you can pass in an auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit).
In that way you will have multiple FSDP units, and only one FSDP unit needs to collect full parameters at a time. E.g., suppose you have 5 FSDP units, and each wraps 20 linear layers.
Then, in the forward, the 1st FSDP unit will allgather parameters for the first 20 linear layers, do computation, discard the parameters and then move on to the next 20 linear layers. So, at any point in time, each rank only materializes parameters/grads for 20 linear layers instead of 100.

@@ -358,9 +358,9 @@ Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning
model = Net().to(rank)
model = FSDP(model,
fsdp_auto_wrap_policy=my_auto_wrap_policy)
auto_wrap_policy=my_auto_wrap_policy)
Applying the fsdp_auto_wrap_policy, the model would be as follows:
Applying the auto_wrap_policy, the model would be as follows:

.. code-block:: bash
@@ -411,7 +411,7 @@ In 2.4 we just add it to the FSDP wrapper
.. code-block:: python
model = FSDP(model,
fsdp_auto_wrap_policy=my_auto_wrap_policy,
auto_wrap_policy=my_auto_wrap_policy,
cpu_offload=CPUOffload(offload_params=True))
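For reference, the renamed argument is used as in this minimal sketch, which assumes torch.distributed has already been initialized (for example via torchrun) and uses an arbitrary size threshold:

import functools
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

my_auto_wrap_policy = functools.partial(
    size_based_auto_wrap_policy, min_num_params=100_000   # threshold chosen for illustration
)
model = nn.Sequential(*[nn.Linear(512, 512) for _ in range(10)])
model = FSDP(model, auto_wrap_policy=my_auto_wrap_policy)  # several FSDP units instead of one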
3 changes: 2 additions & 1 deletion intermediate_source/TCPStore_libuv_backend.rst
@@ -8,7 +8,8 @@ Introduction to Libuv TCPStore Backend
.. grid:: 2

.. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
:class-card: card-prerequisites
:class-card: card-prerequisites

* What is the new TCPStore backend
* Compare the new libuv backend against the legacy backend
* How to enable to use the legacy backend
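For orientation only (this snippet is not from the tutorial, and the `use_libuv` keyword is assumed from the tutorial's topic): a TCPStore can be constructed directly, with a flag selecting the libuv or legacy backend.

from datetime import timedelta
import torch.distributed as dist

store = dist.TCPStore(
    "127.0.0.1", 29500,
    world_size=1, is_master=True,
    timeout=timedelta(seconds=30),
    use_libuv=True,   # assumed keyword; set False to fall back to the legacy backend
)
store.set("key", "value")
print(store.get("key"))   # b'value'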