Commit

Merge branch 'master' into secure-check
maaquib authored Oct 2, 2024
2 parents 187ed53 + 6bdb1ba commit 3f88489
Showing 20 changed files with 61 additions and 33 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -85,9 +85,9 @@ curl -X POST -d '{"prompt":"count from 1 to 9 in french ", "max_tokens": 100}' -

```bash
#export token=<HUGGINGFACE_HUB_TOKEN>
docker build --pull . -f docker/Dockerfile.llm -t ts/llm
docker build --pull . -f docker/Dockerfile.vllm -t ts/vllm

docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth

# Try it out
curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
26 changes: 26 additions & 0 deletions benchmarks/utils/system_under_test.py
@@ -113,6 +113,7 @@ def start(self):
execute("torchserve --stop", wait=True)
click.secho("*Setting up model store...", fg="green")
self._prepare_local_dependency()
self._clear_neuron_cache_if_exists()
click.secho("*Starting local Torchserve instance...", fg="green")

ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
if "Model server started" in str(line).strip():
break

    def _clear_neuron_cache_if_exists(self):
        cache_dir = "/var/tmp/neuron-compile-cache/"

        # Check if the directory exists
        if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
            click.secho(
                f"Directory {cache_dir} exists. Clearing contents...", fg="green"
            )

            # Remove the directory contents
            for filename in os.listdir(cache_dir):
                file_path = os.path.join(cache_dir, filename)
                try:
                    if os.path.isfile(file_path) or os.path.islink(file_path):
                        os.unlink(file_path)
                    elif os.path.isdir(file_path):
                        shutil.rmtree(file_path)
                except Exception as e:
                    click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
            click.secho(f"Cache cleared: {cache_dir}", fg="green")
        else:
            click.secho(
                f"Directory {cache_dir} does not exist. No action taken.", fg="green"
            )

    def stop(self):
        click.secho("*Terminating Torchserve instance...", fg="green")
        execute("torchserve --stop", wait=True)
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -73,7 +73,7 @@ COPY ./ serve
RUN \
if echo "$LOCAL_CHANGES" | grep -q "false"; then \
rm -rf serve;\
git clone --recursive $REPO_URL -b $BRANCH_NAME; \
git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
fi


@@ -238,7 +238,7 @@ COPY ./ serve
RUN \
if echo "$LOCAL_CHANGES" | grep -q "false"; then \
rm -rf serve;\
git clone --recursive $REPO_URL -b $BRANCH_NAME; \
git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
fi

COPY --from=compile-image /home/venv /home/venv
File renamed without changes.
4 changes: 2 additions & 2 deletions docs/llm_deployment.md
@@ -11,7 +11,7 @@ The launcher can either be used standalone or in combination with our provided T

To launch the docker we first need to build it:
```bash
docker build . -f docker/Dockerfile.llm -t ts/llm
docker build . -f docker/Dockerfile.vllm -t ts/vllm
```

Models are usually loaded from the HuggingFace hub and are cached in a [docker volume](https://docs.docker.com/storage/volumes/) for faster reload.
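As a quick illustration, the cache volume can be managed with standard Docker commands (assuming the volume is named `data`, matching the `-v data:/data` mount used in the run command below):

```bash
# Pre-create the cache volume (docker run would also create it on first use).
docker volume create data

# See where the cached HuggingFace downloads live on the host.
docker volume inspect data

# Drop the cache entirely to free disk space.
docker volume rm data
```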
@@ -22,7 +22,7 @@ export token=<HUGGINGFACE_HUB_TOKEN>

You can then go ahead and launch a TorchServe instance serving your selected model:
```bash
docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
```

To change the model you just need to exchange the identifier given to the `--model_id` parameter.
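For example, here is a sketch of serving a different model with the same image (the identifier below is only an example; any model your HuggingFace token can access works the same way):

```bash
# Identical launch command, only the --model_id value changes.
docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token \
  -p 8080:8080 -v data:/data ts/vllm \
  --model_id mistralai/Mistral-7B-Instruct-v0.2 --disable_token_auth
```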
2 changes: 1 addition & 1 deletion examples/large_models/utils/test_llm_streaming_response.py
@@ -196,7 +196,7 @@ def parse_args():
"--model-version",
type=str,
default="1.0",
help="Model vesion. Default: 1.0",
help="Model version. Default: 1.0",
)

return parser.parse_args()
2 changes: 1 addition & 1 deletion examples/large_models/vllm/llama3/Readme.md
@@ -9,7 +9,7 @@ To leverage the power of vLLM we fist need to install it using pip in out develo
```bash
python -m pip install -r ../requirements.txt
```
For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.llm).
For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.vllm).
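As a rough sketch of the first option (the model name, handler, and paths below are placeholders; the real values come from step 2 of this example and the linked model-archiver README):

```bash
# Placeholder invocation -- the important flag is --requirements-file,
# which bundles the vLLM dependency into the model archive.
torch-model-archiver --model-name my-llm --version 1.0 \
  --handler custom_handler.py \
  --requirements-file ../requirements.txt \
  --archive-format no-archive --export-path model_store
```

Note that per-model requirements are only installed when `install_py_dep_per_model=true` is set in the TorchServe config, as described in the linked model-archiver documentation.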

### Step 1: Download Model from HuggingFace

2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/Readme.md
@@ -9,7 +9,7 @@ To leverage the power of vLLM we fist need to install it using pip in out develo
```bash
python -m pip install -r ../requirements.txt
```
For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.llm).
For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.vllm).

### Step 1: Download Model from HuggingFace

2 changes: 1 addition & 1 deletion examples/large_models/vllm/mistral/Readme.md
@@ -9,7 +9,7 @@ To leverage the power of vLLM we fist need to install it using pip in out develo
```bash
python -m pip install -r ../requirements.txt
```
For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.llm).
For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.vllm).

### Step 1: Download Model from HuggingFace

@@ -11,6 +11,9 @@
public class ModelConfig {
private static final Logger logger = LoggerFactory.getLogger(ModelConfig.class);

public static final int defaultStartupTimeout = 120; // unit: sec
public static final int defaultResponseTimeout = 120; // unit: sec

/** the minimum number of workers of a model */
private int minWorkers;
/** the maximum number of workers of a model */
@@ -20,9 +23,9 @@ public class ModelConfig {
/** the maximum delay in msec of a batch of a model */
private int maxBatchDelay;
/** the timeout in sec of a specific model's response. */
private int responseTimeout = 120; // unit: sec
private int responseTimeout = defaultResponseTimeout;
/** the timeout in sec of a specific model's startup. */
private int startupTimeout = 120; // unit: sec
private int startupTimeout = defaultStartupTimeout;
/**
* the device type where the model is loaded. It can be gpu, cpu. The model is loaded on CPU if
* deviceType: "cpu" is set on a GPU host.
@@ -193,7 +193,8 @@ private void initModelStore() throws InvalidSnapshotException, IOException {
String fileName = file.getName();
if (file.isFile()
&& !fileName.endsWith(".mar")
&& !fileName.endsWith(".model")) {
&& !fileName.endsWith(".model")
&& !fileName.endsWith(".tar.gz")) {
continue;
}
try {
@@ -203,7 +203,7 @@ private static Operation getSetDefaultOperation() {
MediaType error = getErrorResponse();

operation.addResponse(
new Response("200", "Default vesion succsesfully updated for model", status));
new Response("200", "Default version successfully updated for model", status));
operation.addResponse(
new Response("404", "Model not found or Model version not found", error));
operation.addResponse(new Response("500", "Internal Server Error", error));
@@ -98,7 +98,7 @@ public static String setDefault(String modelName, String newModelVersion)
ModelManager modelManager = ModelManager.getInstance();
modelManager.setDefaultVersion(modelName, newModelVersion);
String msg =
"Default vesion succsesfully updated for model \""
"Default version successfully updated for model \""
+ modelName
+ "\" to \""
+ newModelVersion
12 changes: 10 additions & 2 deletions frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java
@@ -193,9 +193,17 @@ public void setModelState(JsonObject modelInfo) {
minWorkers = modelInfo.get(MIN_WORKERS).getAsInt();
maxWorkers = modelInfo.get(MAX_WORKERS).getAsInt();
maxBatchDelay = modelInfo.get(MAX_BATCH_DELAY).getAsInt();
responseTimeout = modelInfo.get(RESPONSE_TIMEOUT).getAsInt();
startupTimeout = modelInfo.get(STARTUP_TIMEOUT).getAsInt();
batchSize = modelInfo.get(BATCH_SIZE).getAsInt();
responseTimeout =
modelInfo.has(RESPONSE_TIMEOUT) && !modelInfo.get(RESPONSE_TIMEOUT).isJsonNull()
? modelInfo.get(RESPONSE_TIMEOUT).getAsInt()
: modelArchive.getModelConfig()
.defaultResponseTimeout; // default value for responseTimeout
startupTimeout =
modelInfo.has(STARTUP_TIMEOUT) && !modelInfo.get(STARTUP_TIMEOUT).isJsonNull()
? modelInfo.get(STARTUP_TIMEOUT).getAsInt()
: modelArchive.getModelConfig()
.defaultStartupTimeout; // default value for startupTimeout

JsonElement runtime = modelInfo.get(RUNTIME_TYPE);
String runtime_str = Manifest.RuntimeType.PYTHON.getValue();
@@ -429,7 +429,7 @@ public void testSetDefaultVersionNoop() throws InterruptedException {
StatusResponse resp = JsonUtils.GSON.fromJson(TestUtils.getResult(), StatusResponse.class);
Assert.assertEquals(
resp.getStatus(),
"Default vesion succsesfully updated for model \"noopversioned\" to \"1.2.1\"");
"Default version successfully updated for model \"noopversioned\" to \"1.2.1\"");
}

@Test(
@@ -1671,7 +1671,7 @@
],
"responses": {
"200": {
"description": "Default vesion succsesfully updated for model",
"description": "Default version successfully updated for model",
"content": {
"application/json": {
"schema": {
@@ -1216,7 +1216,7 @@
],
"responses": {
"200": {
"description": "Default vesion succsesfully updated for model",
"description": "Default version successfully updated for model",
"content": {
"application/json": {
"schema": {
2 changes: 1 addition & 1 deletion kubernetes/kserve/build_image.sh
@@ -66,5 +66,5 @@ cp -r ../../third_party .
if [ "${MULTI}" == "true" ]; then
DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE --platform "${ARCH}" -t "$DOCKER_TAG" --push .
else
DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --load .
DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --push .
fi
5 changes: 0 additions & 5 deletions kubernetes/kserve/build_upload_release.py
@@ -39,11 +39,6 @@
        dry_run,
    )

    for image in [
        f"{organization}/torchserve-kfs:{check_ts_version()}-gpu",
    ]:
        try_and_handle(f"docker push {image}", dry_run)

    # Cleanup built images
    if args.cleanup:
        try_and_handle(f"docker system prune --all --volumes -f", dry_run)
9 changes: 2 additions & 7 deletions kubernetes/kserve/docker_nightly.py
@@ -43,22 +43,17 @@
        dry_run,
    )

    # Push Nightly images to official PyTorch Dockerhub account
    try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)

    # Tag nightly images with latest
    try_and_handle(
        f"docker buildx imagetools create --tag {organization}/{project}:latest-cpu {organization}/{cpu_version}",
        dry_run,
    )

    try_and_handle(
        f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
        f"docker buildx imagetools create --tag {organization}/{project}:latest-gpu {organization}/{gpu_version}",
        dry_run,
    )

    # Push images with latest tag
    try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)

    # Cleanup built images
    if args.cleanup:
        try_and_handle(f"docker system prune --all --volumes -f", dry_run)
