Merge branch 'main' into patch-1

pytorch · Sep 5, 2024 · 5e1833f · 5e1833f
2 parents ab10ae5 + 0e530ea
commit 5e1833f
Show file tree

Hide file tree

Showing 85 changed files with 2,379 additions and 191 deletions.
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
@@ -11,8 +11,9 @@ IMAGE_NAME="$1"
 shift
 
 export UBUNTU_VERSION="20.04"
+export CUDA_VERSION="12.4.1"
 
-export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}"
+export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
 echo "Building ${IMAGE_NAME} Docker image"
 
 docker build \

diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh
@@ -22,5 +22,5 @@ conda_run() {
 }
 
 pip_install() {
-  as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
+  as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip3 install --progress-bar off $*
 }
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
@@ -13,7 +13,7 @@ tqdm==4.66.1
 numpy==1.24.4
 matplotlib
 librosa
-torch==2.3
+torch==2.4
 torchvision
 torchtext
 torchdata
@@ -28,10 +28,10 @@ tensorboard
 jinja2==3.1.3
 pytorch-lightning
 torchx
-torchrl==0.3.0
-tensordict==0.3.0
-ax-platform
-nbformat>==5.9.2
+torchrl==0.5.0
+tensordict==0.5.0
+ax-platform>=0.4.0
+nbformat>=5.9.2
 datasets
 transformers
 torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
@@ -64,7 +64,7 @@ pyopengl
 gymnasium[mujoco]==0.27.0
 timm
 iopath
-pygame==2.1.2
+pygame==2.6.0
 pycocotools
 semilearn==0.3.2
 torchao==0.0.3

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
@@ -21,9 +21,9 @@ sudo apt-get install -y pandoc
 
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
-# Install 2.2 for testing - uncomment to install nightly binaries (update the version as needed).
+# Install 2.4 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 # pip uninstall -y torch torchvision torchaudio torchtext torchdata
-# pip3 install torch==2.3.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu121
+# pip3 install torch==2.4.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 
 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm

diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json
@@ -28,6 +28,9 @@
   "intermediate_source/model_parallel_tutorial.py": {
     "needs": "linux.16xlarge.nvidia.gpu"
   },
+  "recipes_source/torch_export_aoti_python.py": {
+    "needs": "linux.g5.4xlarge.nvidia.gpu"
+  }, 
   "advanced_source/pendulum.py": {
     "needs": "linux.g5.4xlarge.nvidia.gpu",
     "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run."

diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
@@ -29,7 +29,6 @@
     "intermediate_source/fx_conv_bn_fuser",
     "intermediate_source/_torch_export_nightly_tutorial",  # does not work on release
     "advanced_source/super_resolution_with_onnxruntime",
-    "advanced_source/python_custom_ops",  # https://github.com/pytorch/pytorch/issues/127443
     "advanced_source/usb_semisup_learn", # fails with CUDA OOM error, should try on a different worker
     "prototype_source/fx_graph_mode_ptq_dynamic",
     "prototype_source/vmap_recipe",
@@ -54,8 +53,6 @@
     "intermediate_source/flask_rest_api_tutorial",
     "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
-    "intermediate_source/inductor_debug_cpu", # reenable after 2942 
-    "beginner_source/onnx/onnx_registry_tutorial", # reenable after 2941 is fixed.
     "intermediate_source/torch_export_tutorial" # reenable after 2940 is fixed.
 ]
 

diff --git a/README.md b/README.md
@@ -22,6 +22,8 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github
 
 Here is how you can create a new tutorial (for a detailed description, see [CONTRIBUTING.md](./CONTRIBUTING.md)):
 
+NOTE: Before submitting a new tutorial, read [PyTorch Tutorial Submission Policy](./tutorial_submission_policy.md).
+
 1. Create a Python file. If you want it executed while inserted into documentation, save the file with the suffix `tutorial` so that the file name is `your_tutorial.py`.
 2. Put it in one of the `beginner_source`, `intermediate_source`, `advanced_source` directory based on the level of difficulty. If it is a recipe, add it to `recipes_source`. For tutorials demonstrating unstable prototype features, add to the `prototype_source`.
 3. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst).
@@ -31,7 +33,7 @@ If you are starting off with a Jupyter notebook, you can use [this script](https
 
 ## Building locally
 
-The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: 
+The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code:
 
 1. Install required dependencies by running: `pip install -r requirements.txt`.
 
@@ -40,8 +42,6 @@ The tutorial build is very large and requires a GPU. If your machine does not ha
 - If you have a GPU-powered laptop, you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This might take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step.
 - You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial.
 
-> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`.
-
 ## Building a single tutorial
 
 You can build a single tutorial by using the `GALLERY_PATTERN` environment variable. For example to run only `neural_style_transfer_tutorial.py`, run:
@@ -59,8 +59,8 @@ The `GALLERY_PATTERN` variable respects regular expressions.
 
 
 ## About contributing to PyTorch Documentation and Tutorials
-* You can find information about contributing to PyTorch documentation in the 
-PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. 
+* You can find information about contributing to PyTorch documentation in the
+PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file.
 * Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
 
 

diff --git a/_static/css/custom.css b/_static/css/custom.css
@@ -91,3 +91,24 @@
     transition: none;
     transform-origin: none;
 }
+
+.pytorch-left-menu-search input[type=text] {
+    background-image: none;
+}
+
+.gsc-control-cse {
+   padding-left: 0px !important;
+   padding-bottom: 0px !important;
+}
+
+.gsc-search-button .gsc-search-button-v2:focus {
+   border: transparent !important;
+   outline: none;
+   box-shadow: none;
+}
+.gsc-search-button-v2:active {
+   border: none !important;
+}
+.gsc-search-button-v2 {
+   border: none !important;
+}
diff --git a/_static/img/distributed/tcpstore_barrier_time.png b/_static/img/distributed/tcpstore_barrier_time.png
diff --git a/_static/img/distributed/tcpstore_init_time.png b/_static/img/distributed/tcpstore_init_time.png
diff --git a/_static/img/onnx/custom_aten_add_function.png b/_static/img/onnx/custom_aten_add_function.png
diff --git a/_static/img/onnx/custom_aten_gelu_function.png b/_static/img/onnx/custom_aten_gelu_function.png
diff --git a/_static/img/onnx/custom_aten_gelu_model.png b/_static/img/onnx/custom_aten_gelu_model.png
diff --git a/_static/img/pinmem/pinmem.png b/_static/img/pinmem/pinmem.png
diff --git a/_static/img/pinmem/trace_streamed0_pinned0.png b/_static/img/pinmem/trace_streamed0_pinned0.png
diff --git a/_static/img/pinmem/trace_streamed0_pinned1.png b/_static/img/pinmem/trace_streamed0_pinned1.png
diff --git a/_static/img/pinmem/trace_streamed1_pinned0.png b/_static/img/pinmem/trace_streamed1_pinned0.png
diff --git a/_static/img/pinmem/trace_streamed1_pinned1.png b/_static/img/pinmem/trace_streamed1_pinned1.png
diff --git a/_templates/layout.html b/_templates/layout.html
@@ -11,6 +11,23 @@
 </script>
 {%- endblock %}
 
+{% block sidebartitle %}
+    {% if theme_display_version %}
+      {%- set nav_version = version %}
+      {% if READTHEDOCS and current_version %}
+        {%- set nav_version = current_version %}
+      {% endif %}
+      {% if nav_version %}
+        <div class="version">
+            {{ nav_version }}
+        </div>
+      {% endif %}
+    {% endif %}
+    <div class="searchbox">
+        <script async src="https://cse.google.com/cse.js?cx=e65585f8c3ea1440e"></script>
+        <div class="gcse-search"></div>
+    </div>
+{% endblock %}
 
 {% block footer %}
 {{ super() }}

diff --git a/advanced_source/coding_ddpg.py b/advanced_source/coding_ddpg.py
@@ -182,7 +182,7 @@
 # Later, we will see how the target parameters should be updated in TorchRL.
 #
 
-from tensordict.nn import TensorDictModule
+from tensordict.nn import TensorDictModule, TensorDictSequential
 
 
 def _init(
@@ -290,12 +290,11 @@ def _loss_actor(
 ) -> torch.Tensor:
     td_copy = tensordict.select(*self.actor_in_keys)
     # Get an action from the actor network: since we made it functional, we need to pass the params
-    td_copy = self.actor_network(td_copy, params=self.actor_network_params)
+    with self.actor_network_params.to_module(self.actor_network):
+        td_copy = self.actor_network(td_copy)
     # get the value associated with that action
-    td_copy = self.value_network(
-        td_copy,
-        params=self.value_network_params.detach(),
-    )
+    with self.value_network_params.detach().to_module(self.value_network):
+        td_copy = self.value_network(td_copy)
     return -td_copy.get("state_action_value")
 
 
@@ -317,7 +316,8 @@ def _loss_value(
     td_copy = tensordict.clone()
 
     # V(s, a)
-    self.value_network(td_copy, params=self.value_network_params)
+    with self.value_network_params.to_module(self.value_network):
+        self.value_network(td_copy)
     pred_val = td_copy.get("state_action_value").squeeze(-1)
 
     # we manually reconstruct the parameters of the actor-critic, where the first
@@ -332,9 +332,8 @@ def _loss_value(
         batch_size=self.target_actor_network_params.batch_size,
         device=self.target_actor_network_params.device,
     )
-    target_value = self.value_estimator.value_estimate(
-        tensordict, target_params=target_params
-    ).squeeze(-1)
+    with target_params.to_module(self.actor_critic):
+        target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)
 
     # Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function`
     loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function)
@@ -717,7 +716,7 @@ def get_env_stats():
     ActorCriticWrapper,
     DdpgMlpActor,
     DdpgMlpQNet,
-    OrnsteinUhlenbeckProcessWrapper,
+    OrnsteinUhlenbeckProcessModule,
     ProbabilisticActor,
     TanhDelta,
     ValueOperator,
@@ -776,15 +775,18 @@ def make_ddpg_actor(
 # Exploration
 # ~~~~~~~~~~~
 #
-# The policy is wrapped in a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessWrapper`
+# The policy is passed into a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule`
 # exploration module, as suggested in the original paper.
 # Let's define the number of frames before OU noise reaches its minimum value
 annealing_frames = 1_000_000
 
-actor_model_explore = OrnsteinUhlenbeckProcessWrapper(
+actor_model_explore = TensorDictSequential(
     actor,
-    annealing_num_steps=annealing_frames,
-).to(device)
+    OrnsteinUhlenbeckProcessModule(
+        spec=actor.spec.clone(),
+        annealing_num_steps=annealing_frames,
+    ).to(device),
+)
 if device == torch.device("cpu"):
     actor_model_explore.share_memory()
 
@@ -1168,7 +1170,7 @@ def ceil_div(x, y):
         )
 
     # update the exploration strategy
-    actor_model_explore.step(current_frames)
+    actor_model_explore[1].step(current_frames)
 
 collector.shutdown()
 del collector

diff --git a/advanced_source/cpp_custom_ops.rst b/advanced_source/cpp_custom_ops.rst
@@ -174,6 +174,8 @@ To add ``torch.compile`` support for an operator, we must add a FakeTensor kerne
 known as a "meta kernel" or "abstract impl"). FakeTensors are Tensors that have
 metadata (such as shape, dtype, device) but no data: the FakeTensor kernel for an
 operator specifies how to compute the metadata of output tensors given the metadata of input tensors.
+The FakeTensor kernel should return dummy Tensors of your choice with
+the correct Tensor metadata (shape/strides/``dtype``/device).
 
 We recommend that this be done from Python via the `torch.library.register_fake` API,
 though it is possible to do this from C++ as well (see
@@ -417,4 +419,4 @@ Conclusion
 In this tutorial, we went over the recommended approach to integrating Custom C++
 and CUDA operators with PyTorch. The ``TORCH_LIBRARY/torch.library`` APIs are fairly
 low-level. For more information about how to use the API, see
-`The Custom Operators Manual <https://pytorch.org/docs/main/notes/custom_operators.html>`_.
+`The Custom Operators Manual <https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html#the-custom-operators-manual>`_.
diff --git a/advanced_source/cpp_extension.rst b/advanced_source/cpp_extension.rst
@@ -2,6 +2,10 @@ Custom C++ and CUDA Extensions
 ==============================
 **Author**: `Peter Goldsborough <https://www.goldsborough.me/>`_
 
+.. warning::
+
+    This tutorial is deprecated as of PyTorch 2.4. Please see :ref:`custom-ops-landing-page`
+    for the newest up-to-date guides on extending PyTorch with Custom C++/CUDA Extensions.
 
 PyTorch provides a plethora of operations related to neural networks, arbitrary
 tensor algebra, data wrangling and other purposes. However, you may still find
@@ -225,7 +229,7 @@ Instead of:
 Currently open issue for nvcc bug `here
 <https://github.com/pytorch/pytorch/issues/69460>`_.
 Complete workaround code example `here
-<https://github.com/facebookresearch/pytorch3d/commit/cb170ac024a949f1f9614ffe6af1c38d972f7d48>`_. 
+<https://github.com/facebookresearch/pytorch3d/commit/cb170ac024a949f1f9614ffe6af1c38d972f7d48>`_.
 
 Forward Pass
 ************

diff --git a/advanced_source/custom_ops_landing_page.rst b/advanced_source/custom_ops_landing_page.rst
@@ -1,7 +1,7 @@
 .. _custom-ops-landing-page:
 
-PyTorch Custom Operators Landing Page
-=====================================
+PyTorch Custom Operators
+===========================
 
 PyTorch offers a large library of operators that work on Tensors (e.g. ``torch.add``,
 ``torch.sum``, etc). However, you may wish to bring a new custom operation to PyTorch
@@ -10,26 +10,27 @@ In order to do so, you must register the custom operation with PyTorch via the P
 `torch.library docs <https://pytorch.org/docs/stable/library.html>`_ or C++ ``TORCH_LIBRARY``
 APIs.
 
-TL;DR
------
+
 
 Authoring a custom operator from Python
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Please see :ref:`python-custom-ops-tutorial`.
 
 You may wish to author a custom operator from Python (as opposed to C++) if:
+
 - you have a Python function you want PyTorch to treat as an opaque callable, especially with
-respect to ``torch.compile`` and ``torch.export``.
+  respect to ``torch.compile`` and ``torch.export``.
 - you have some Python bindings to C++/CUDA kernels and want those to compose with PyTorch
-subsystems (like ``torch.compile`` or ``torch.autograd``)
+  subsystems (like ``torch.compile`` or ``torch.autograd``)
 
 Integrating custom C++ and/or CUDA code with PyTorch
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Please see :ref:`cpp-custom-ops-tutorial`.
 
 You may wish to author a custom operator from C++ (as opposed to Python) if:
+
 - you have custom C++ and/or CUDA code.
 - you plan to use this code with ``AOTInductor`` to do Python-less inference.
 

diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst
@@ -1,6 +1,11 @@
 Registering a Dispatched Operator in C++
 ========================================
 
+.. warning::
+
+    This tutorial is deprecated as of PyTorch 2.4. Please see :ref:`custom-ops-landing-page`
+    for the newest up-to-date guides on extending PyTorch with Custom Operators.
+
 The dispatcher is an internal component of PyTorch which is responsible for
 figuring out what code should actually get run when you call a function like
 ``torch::add``.  This can be nontrivial, because PyTorch operations need

diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py
@@ -151,7 +151,8 @@ def tokenize(self, path):
 model.load_state_dict(
     torch.load(
         model_data_filepath + 'word_language_model_quantize.pth',
-        map_location=torch.device('cpu')
+        map_location=torch.device('cpu'),
+        weights_only=True
         )
     )