Add docker image for BERT e2e inference task #455

Merged
8 commits merged on Jul 16, 2024
.github/workflows/ci.yaml: 6 additions & 1 deletion
@@ -30,4 +30,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
+      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
+  build-bert-inference:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - run: docker build --file e2e2/test/images/bert-inference/Dockerfile.bert-inference e2e2/test/images/bert-inference
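For anyone who wants to exercise the new build step before CI does, a rough local equivalent is sketched below. It is not part of this PR; it assumes the Docker CLI is installed and that the command is run from the repository root.

```python
# Rough local equivalent of the new CI step (not part of this PR).
# Assumes the Docker CLI is available and the working directory is the repo root.
import subprocess

subprocess.run(
    [
        "docker", "build",
        "--file", "e2e2/test/images/bert-inference/Dockerfile.bert-inference",
        "e2e2/test/images/bert-inference",
    ],
    check=True,  # raise if the image fails to build, mirroring a red CI job
)
```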
e2e2/test/images/bert-inference/Dockerfile.bert-inference: 31 additions & 0 deletions (new file)
Contributor:
Thinking: just name this Dockerfile, following the current pattern of the neuron and nvidia images.

Contributor (Author):
I'm good with that. I added the prefix because I wasn't sure what the directory structure would look like at first.

Contributor (Author):
Done :)

@@ -0,0 +1,31 @@
# Use the NVIDIA CUDA runtime as a parent image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04

# Set environment variable to disable interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Install Python 3.11
RUN apt-get update && apt-get install -y \
    software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y \
    python3.11 \
    python3.11-dev \
    python3.11-distutils \
    python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Create a symbolic link to use python3.11 as python
RUN ln -sf /usr/bin/python3.11 /usr/bin/python

# Set the working directory in the container
WORKDIR /app

# Copy only the necessary files into the container at /app
COPY infer.py /app/
COPY requirements.txt /app/

# Install any needed packages specified in requirements.txt
RUN python -m pip install --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt
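The Dockerfile as shown does not set a CMD or ENTRYPOINT, so whatever runs the image is expected to invoke the script explicitly. A minimal in-container sanity check, sketched below and not part of this PR, can confirm the python symlink and GPU visibility; it assumes the container was started with GPU access (for example `--gpus all`).

```python
# Minimal sanity-check sketch to run inside the built image (not part of this PR).
import sys

import torch

print(sys.version)                # expected to report 3.11.x via the /usr/bin/python symlink
print(torch.__version__)          # should match the torch pin in requirements.txt
print(torch.cuda.is_available())  # True only when the container can actually see a GPU
```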
e2e2/test/images/bert-inference/infer.py: 112 additions & 0 deletions (new file)
@@ -0,0 +1,112 @@
import os
import time
import torch
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


def create_dummy_data(tokenizer, num_samples=100, max_length=128):
    # Create dummy input data
    sentences = [
        "This is a dummy sentence number {}".format(i) for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = tokenized_inputs.input_ids.detach().clone()

    # MLM task: randomly mask some tokens
    mlm_probability = 0.15
    input_ids, labels = mask_tokens(
        tokenized_inputs.input_ids, tokenizer, mlm_probability
    )

    # NSP task: create dummy pairs
    next_sentence_labels = torch.randint(0, 2, (num_samples,))

    return TensorDataset(
        input_ids, tokenized_inputs.attention_mask, next_sentence_labels
    )


def mask_tokens(inputs, tokenizer, mlm_probability):
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    return inputs, labels


def run_inference(model, tokenizer, batch_size, mode):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    dataset = create_dummy_data(tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    total_time = 0
    total_batches = len(dataloader)

    with torch.no_grad():
        for batch in dataloader:
            inputs, masks, next_sentence_labels = batch
            inputs, masks, next_sentence_labels = (
                inputs.to(device),
                masks.to(device),
                next_sentence_labels.to(device),
            )

            start_time = time.time()
            outputs = model(
                input_ids=inputs,
                attention_mask=masks,
                next_sentence_label=next_sentence_labels,
            )
            end_time = time.time()

            total_time += end_time - start_time

    avg_time_per_batch = total_time / total_batches
    throughput = (total_batches * batch_size) / total_time

    print(f"Inference Mode: {mode}")
    print(f"Average time per batch: {avg_time_per_batch:.4f} seconds")
    print(f"Throughput: {throughput:.2f} samples/second")


def main():
    # Verify GPU availability
    if not torch.cuda.is_available():
        raise RuntimeError("GPU is not available. Exiting")

    print("GPU is available")

    # Pre-download model and tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")

    mode = os.environ.get("INFERENCE_MODE", "throughput").lower()
    batch_size = 1 if mode == "latency" else 8

    print(f"Running inference in {mode} mode with batch size {batch_size}")
    run_inference(model, tokenizer, batch_size, mode)


if __name__ == "__main__":
    main()
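The only runtime knob infer.py exposes is the INFERENCE_MODE environment variable, which selects batch size 1 for "latency" and batch size 8 otherwise. A hypothetical wrapper that drives both modes is sketched below; it is not part of this PR and assumes the script lives at /app/infer.py as laid out by the Dockerfile.

```python
# Hypothetical driver for both inference modes (not part of this PR).
# Assumes infer.py is at /app/infer.py, as copied in the Dockerfile.
import os
import subprocess

for mode in ("latency", "throughput"):
    env = dict(os.environ, INFERENCE_MODE=mode)  # "latency" -> batch size 1, else 8
    subprocess.run(["python", "/app/infer.py"], env=env, check=True)
```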
e2e2/test/images/bert-inference/requirements.txt: 3 additions & 0 deletions (new file)
@@ -0,0 +1,3 @@
torch==2.3
transformers==4.29
numpy==1.23
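The pins above use two-component versions, which PEP 440 treats the same as their zero-padded forms (torch==2.3 should resolve to 2.3.0, and so on). A quick check of what actually landed in the image, sketched here and not part of this PR:

```python
# Version check to run inside the built image (not part of this PR).
import numpy
import torch
import transformers

print("torch", torch.__version__)                # should match the torch==2.3 pin
print("transformers", transformers.__version__)  # should match the transformers==4.29 pin
print("numpy", numpy.__version__)                # should match the numpy==1.23 pin
```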