Merge branch 'main' into mgs28-char-rnn-update
svekars authored Aug 2, 2024
2 parents ecb97ef + f1c0b8a commit 8508a8b
Showing 24 changed files with 817 additions and 63 deletions.
10 changes: 5 additions & 5 deletions .ci/docker/requirements.txt
@@ -13,7 +13,7 @@ tqdm==4.66.1
numpy==1.24.4
matplotlib
librosa
torch==2.3
torch==2.4
torchvision
torchtext
torchdata
@@ -28,9 +28,9 @@ tensorboard
jinja2==3.1.3
pytorch-lightning
torchx
torchrl==0.3.0
tensordict==0.3.0
ax-platform
torchrl==0.5.0
tensordict==0.5.0
ax-platform>==0.4.0
nbformat>==5.9.2
datasets
transformers
@@ -68,4 +68,4 @@ pygame==2.1.2
pycocotools
semilearn==0.3.2
torchao==0.0.3
segment_anything==1.0
segment_anything==1.0
4 changes: 2 additions & 2 deletions .jenkins/build.sh
@@ -22,8 +22,8 @@ sudo apt-get install -y pandoc
#Install PyTorch Nightly for test.
# Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
# Install 2.4 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
pip uninstall -y torch torchvision torchaudio torchtext torchdata
pip3 install torch==2.4.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
# pip uninstall -y torch torchvision torchaudio torchtext torchdata
# pip3 install torch==2.4.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124

# Install two language tokenizers for Translation with TorchText tutorial
python -m spacy download en_core_web_sm
Binary file added _static/img/pinmem/pinmem.png
Binary file added _static/img/pinmem/trace_streamed0_pinned0.png
Binary file added _static/img/pinmem/trace_streamed0_pinned1.png
Binary file added _static/img/pinmem/trace_streamed1_pinned0.png
Binary file added _static/img/pinmem/trace_streamed1_pinned1.png
34 changes: 18 additions & 16 deletions advanced_source/coding_ddpg.py
@@ -182,7 +182,7 @@
# Later, we will see how the target parameters should be updated in TorchRL.
#

from tensordict.nn import TensorDictModule
from tensordict.nn import TensorDictModule, TensorDictSequential


def _init(
@@ -290,12 +290,11 @@ def _loss_actor(
) -> torch.Tensor:
td_copy = tensordict.select(*self.actor_in_keys)
# Get an action from the actor network: since we made it functional, we need to pass the params
td_copy = self.actor_network(td_copy, params=self.actor_network_params)
with self.actor_network_params.to_module(self.actor_network):
td_copy = self.actor_network(td_copy)
# get the value associated with that action
td_copy = self.value_network(
td_copy,
params=self.value_network_params.detach(),
)
with self.value_network_params.detach().to_module(self.value_network):
td_copy = self.value_network(td_copy)
return -td_copy.get("state_action_value")


@@ -317,7 +316,8 @@ def _loss_value(
td_copy = tensordict.clone()

# V(s, a)
self.value_network(td_copy, params=self.value_network_params)
with self.value_network_params.to_module(self.value_network):
self.value_network(td_copy)
pred_val = td_copy.get("state_action_value").squeeze(-1)

# we manually reconstruct the parameters of the actor-critic, where the first
@@ -332,9 +332,8 @@ def _loss_value(
batch_size=self.target_actor_network_params.batch_size,
device=self.target_actor_network_params.device,
)
target_value = self.value_estimator.value_estimate(
tensordict, target_params=target_params
).squeeze(-1)
with target_params.to_module(self.actor_critic):
target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)

# Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function`
loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function)
@@ -717,7 +716,7 @@ def get_env_stats():
ActorCriticWrapper,
DdpgMlpActor,
DdpgMlpQNet,
OrnsteinUhlenbeckProcessWrapper,
OrnsteinUhlenbeckProcessModule,
ProbabilisticActor,
TanhDelta,
ValueOperator,
@@ -776,15 +775,18 @@ def make_ddpg_actor(
# Exploration
# ~~~~~~~~~~~
#
# The policy is wrapped in a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessWrapper`
# The policy is passed into a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule`
# exploration module, as suggested in the original paper.
# Let's define the number of frames before OU noise reaches its minimum value
annealing_frames = 1_000_000

actor_model_explore = OrnsteinUhlenbeckProcessWrapper(
actor_model_explore = TensorDictSequential(
actor,
annealing_num_steps=annealing_frames,
).to(device)
OrnsteinUhlenbeckProcessModule(
spec=actor.spec.clone(),
annealing_num_steps=annealing_frames,
).to(device),
)
if device == torch.device("cpu"):
actor_model_explore.share_memory()

@@ -1168,7 +1170,7 @@ def ceil_div(x, y):
)

# update the exploration strategy
actor_model_explore.step(current_frames)
actor_model_explore[1].step(current_frames)

collector.shutdown()
del collector
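The functional-call pattern used in the updated loss methods above can be sketched in isolation. The snippet below is illustrative only (the "obs"/"value" keys and the linear layer are made up); it shows how parameters gathered into a TensorDict are temporarily swapped into a module with the `to_module` context manager, replacing the older `params=` keyword argument.

import torch
from torch import nn
from tensordict import TensorDict
from tensordict.nn import TensorDictModule

# Wrap a plain module so it reads/writes TensorDict keys (illustrative keys).
module = TensorDictModule(nn.Linear(3, 1), in_keys=["obs"], out_keys=["value"])
params = TensorDict.from_module(module)   # parameters gathered into a TensorDict
data = TensorDict({"obs": torch.randn(4, 3)}, batch_size=[4])

# Functional call: the parameters are swapped in for the duration of the block
# and restored on exit, mirroring `with self.value_network_params.to_module(...)` above.
with params.to_module(module):
    module(data)

print(data["value"].shape)  # torch.Size([4, 1])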
8 changes: 5 additions & 3 deletions beginner_source/introyt/tensors_deeper_tutorial.py
@@ -448,17 +448,19 @@
m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix

print('\nVectors & Matrices:')
print(torch.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1)
print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1)
print(m1)
m3 = torch.matmul(m1, m2)
m3 = torch.linalg.matmul(m1, m2)
print(m3) # 3 times m1
print(torch.svd(m3)) # singular value decomposition
print(torch.linalg.svd(m3)) # singular value decomposition


##################################################################################
# This is a small sample of operations. For more details and the full inventory of
# math functions, have a look at the
# `documentation <https://pytorch.org/docs/stable/torch.html#math-operations>`__.
# For more details and the full inventory of linear algebra operations, have a
# look at this `documentation <https://pytorch.org/docs/stable/linalg.html>`__.
#
# Altering Tensors in Place
# ~~~~~~~~~~~~~~~~~~~~~~~~~
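As an aside (not part of the diff): `torch.linalg.svd` returns the already-transposed factor `Vh`, unlike the deprecated `torch.svd`, which returned `V`. A minimal illustration:

import torch

m3 = torch.tensor([[3., 0.], [0., 3.]])
U, S, Vh = torch.linalg.svd(m3)   # note: Vh, not V as with the old torch.svd
print(U @ torch.diag(S) @ Vh)     # reconstructs m3 (up to floating-point error)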
2 changes: 1 addition & 1 deletion beginner_source/knowledge_distillation_tutorial.py
@@ -352,7 +352,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
# Cosine loss minimization run
# ----------------------------
# Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients.
# In neural networks, it is easy to include to include additional loss functions to the main objectives to achieve goals like better generalization.
# In neural networks, it is easy to include additional loss functions to the main objectives to achieve goals like better generalization.
# Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers.
# Our goal is to convey information from the teacher's representation to the student by including a naive loss function,
# whose minimization implies that the flattened vectors that are subsequently passed to the classifiers have become more *similar* as the loss decreases.
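The hidden-state objective described in the corrected sentence above can be sketched with `torch.nn.CosineEmbeddingLoss`. This fragment is illustrative, not the tutorial's code; the shapes are arbitrary stand-ins for flattened teacher/student hidden states.

import torch
import torch.nn as nn

cosine_loss = nn.CosineEmbeddingLoss()

teacher_hidden = torch.randn(8, 1024)                       # flattened teacher representation
student_hidden = torch.randn(8, 1024, requires_grad=True)   # flattened student representation
target = torch.ones(teacher_hidden.size(0))                 # target = 1: push cosine similarity up

loss = cosine_loss(student_hidden, teacher_hidden, target)
loss.backward()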
14 changes: 11 additions & 3 deletions en-wordlist.txt
@@ -1,3 +1,4 @@

ACL
ADI
AOT
@@ -50,6 +51,7 @@ DDP
DDPG
DDQN
DLRM
DMA
DNN
DQN
DataLoaders
@@ -68,6 +70,8 @@ Ecker
ExportDB
FC
FGSM
tensordict
DataLoader's
FLAVA
FSDP
FX
@@ -139,6 +143,7 @@ MKLDNN
MLP
MLPs
MNIST
MPS
MUC
MacBook
MacOS
@@ -222,6 +227,7 @@ STR
SVE
SciPy
Sequentials
Sharding
Sigmoid
SoTA
Sohn
@@ -257,6 +263,7 @@ VLDB
VQA
VS Code
ViT
Volterra
WMT
WSI
WSIs
@@ -339,11 +346,11 @@ dataset’s
deallocation
decompositions
decorrelated
devicemesh
deserialize
deserialized
desynchronization
deterministically
devicemesh
dimensionality
dir
discontiguous
@@ -388,6 +395,7 @@ hessian
hessians
histoencoder
histologically
homonymous
hotspot
hvp
hyperparameter
@@ -463,6 +471,7 @@ optimizer's
optimizers
otsu
overfitting
pageable
parallelizable
parallelization
parametrization
@@ -527,7 +536,6 @@ runtime
runtimes
scalable
sharded
Sharding
softmax
sparsified
sparsifier
@@ -616,4 +624,4 @@ warmstarting
warmup
webp
wsi
wsis
wsis
19 changes: 14 additions & 5 deletions index.rst
@@ -3,11 +3,12 @@ Welcome to PyTorch Tutorials

**What's new in PyTorch tutorials?**

* `Using User-Defined Triton Kernels with torch.compile <https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html>`__
* `Large Scale Transformer model training with Tensor Parallel (TP) <https://pytorch.org/tutorials/intermediate/TP_tutorial.html>`__
* `Accelerating BERT with semi-structured (2:4) sparsity <https://pytorch.org/tutorials/advanced/semi_structured_sparse.html>`__
* `torch.export Tutorial with torch.export.Dim <https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__
* `Extension points in nn.Module for load_state_dict and tensor subclasses <https://pytorch.org/tutorials/recipes/recipes/swap_tensors.html>`__
* `A guide on good usage of non_blocking and pin_memory() in PyTorch <https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html>`__
* `Introduction to Distributed Pipeline Parallelism <https://pytorch.org/tutorials/intermediate/pipelining_tutorial.html>`__
* `Introduction to Libuv TCPStore Backend <https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html>`__
* `Asynchronous Saving with Distributed Checkpoint (DCP) <https://pytorch.org/tutorials/recipes/distributed_async_checkpoint_recipe.html>`__
* `Python Custom Operators <https://pytorch.org/tutorials/advanced/python_custom_ops.html>`__
* Updated `Getting Started with DeviceMesh <https://pytorch.org/tutorials/recipes/distributed_device_mesh.html>`__

.. raw:: html

@@ -93,6 +94,13 @@ Welcome to PyTorch Tutorials
:link: intermediate/tensorboard_tutorial.html
:tags: Interpretability,Getting-Started,TensorBoard

.. customcarditem::
:header: Good usage of `non_blocking` and `pin_memory()` in PyTorch
:card_description: A guide on best practices to copy data from CPU to GPU.
:image: _static/img/pinmem.png
:link: intermediate/pinmem_nonblock.html
:tags: Getting-Started

.. Image/Video
.. customcarditem::
@@ -969,6 +977,7 @@ Additional Resources
beginner/pytorch_with_examples
beginner/nn_tutorial
intermediate/tensorboard_tutorial
intermediate/pinmem_nonblock

.. toctree::
:maxdepth: 2
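The new card above links to the pinned-memory tutorial; the pattern it covers boils down to the following sketch (illustrative sizes, and it assumes a CUDA device is available):

import torch

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, pin_memory=True)   # page-locked host allocation
    y = x.to("cuda", non_blocking=True)            # copy can overlap with host-side work
    torch.cuda.synchronize()                       # wait before relying on `y`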
2 changes: 1 addition & 1 deletion intermediate_source/FSDP_adavnced_tutorial.rst
@@ -502,7 +502,7 @@ layer class (holding MHSA and FFN).
model = FSDP(model,
fsdp_auto_wrap_policy=t5_auto_wrap_policy)
auto_wrap_policy=t5_auto_wrap_policy)
To see the wrapped model, you can easily print the model and visually inspect
the sharding and FSDP units as well.
14 changes: 7 additions & 7 deletions intermediate_source/FSDP_tutorial.rst
@@ -70,7 +70,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”.
1.2 Import necessary packages

.. note::
This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`.
This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy` and `fsdp_auto_wrap_policy` with `auto_wrap_policy`.

.. code-block:: python
@@ -308,7 +308,7 @@ We have recorded cuda events to measure the time of FSDP model specifics. The CU
CUDA event elapsed time on training loop 40.67462890625sec
Wrapping the model with FSDP, the model will look as follows, we can see the model has been wrapped in one FSDP unit.
Alternatively, we will look at adding the fsdp_auto_wrap_policy next and will discuss the differences.
Alternatively, we will look at adding the auto_wrap_policy next and will discuss the differences.

.. code-block:: bash
@@ -335,12 +335,12 @@ The following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarg

FSDP Peak Memory Usage

Applying *fsdp_auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency.
Applying *auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency.
The way it works is that, suppose your model contains 100 Linear layers. If you do FSDP(model), there will only be one FSDP unit which wraps the entire model.
In that case, the allgather would collect the full parameters for all 100 linear layers, and hence won't save CUDA memory for parameter sharding.
Also, there is only one blocking allgather call for the all 100 linear layers, there will not be communication and computation overlapping between layers.

To avoid that, you can pass in an fsdp_auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit).
To avoid that, you can pass in an auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit).
In that way you will have multiple FSDP units, and only one FSDP unit needs to collect full parameters at a time. E.g., suppose you have 5 FSDP units, and each wraps 20 linear layers.
Then, in the forward, the 1st FSDP unit will allgather parameters for the first 20 linear layers, do computation, discard the parameters and then move on to the next 20 linear layers. So, at any point in time, each rank only materializes parameters/grads for 20 linear layers instead of 100.

@@ -358,9 +358,9 @@ Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning
model = Net().to(rank)
model = FSDP(model,
fsdp_auto_wrap_policy=my_auto_wrap_policy)
auto_wrap_policy=my_auto_wrap_policy)
Applying the fsdp_auto_wrap_policy, the model would be as follows:
Applying the auto_wrap_policy, the model would be as follows:

.. code-block:: bash
@@ -411,7 +411,7 @@ In 2.4 we just add it to the FSDP wrapper
.. code-block:: python
model = FSDP(model,
fsdp_auto_wrap_policy=my_auto_wrap_policy,
auto_wrap_policy=my_auto_wrap_policy,
cpu_offload=CPUOffload(offload_params=True))
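For reference, the renamed argument is used as in this minimal sketch, which assumes torch.distributed has already been initialized (for example via torchrun) and uses an arbitrary size threshold:

import functools
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

my_auto_wrap_policy = functools.partial(
    size_based_auto_wrap_policy, min_num_params=100_000   # threshold chosen for illustration
)
model = nn.Sequential(*[nn.Linear(512, 512) for _ in range(10)])
model = FSDP(model, auto_wrap_policy=my_auto_wrap_policy)  # several FSDP units instead of one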
3 changes: 2 additions & 1 deletion intermediate_source/TCPStore_libuv_backend.rst
@@ -8,7 +8,8 @@ Introduction to Libuv TCPStore Backend
.. grid:: 2

.. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
:class-card: card-prerequisites
:class-card: card-prerequisites

* What is the new TCPStore backend
* Compare the new libuv backend against the legacy backend
* How to enable to use the legacy backend
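For orientation only (this snippet is not from the tutorial, and the `use_libuv` keyword is assumed from the tutorial's topic): a TCPStore can be constructed directly, with a flag selecting the libuv or legacy backend.

from datetime import timedelta
import torch.distributed as dist

store = dist.TCPStore(
    "127.0.0.1", 29500,
    world_size=1, is_master=True,
    timeout=timedelta(seconds=30),
    use_libuv=True,   # assumed keyword; set False to fall back to the legacy backend
)
store.set("key", "value")
print(store.get("key"))   # b'value'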