
Commit

[FEAT][Llava]

Kye committed Apr 24, 2024
1 parent c0de2d8 commit 526da75

Showing 14 changed files with 161 additions and 61 deletions.
Binary file added scripts/.DS_Store
Binary file not shown.
102 changes: 102 additions & 0 deletions scripts/terraform_to_sky_serve/main.tf
@@ -0,0 +1,102 @@
provider "aws" {
region = "us-east-1"
}

# Create a new VPC
resource "aws_vpc" "swarms_vpc" {
cidr_block = "10.0.0.0/16"
enable_dns_support = true
enable_dns_hostnames = true

tags = {
Name = "SwarmsVPC"
}
}

# Create an Internet Gateway for the VPC
resource "aws_internet_gateway" "swarms_igw" {
vpc_id = aws_vpc.swarms_vpc.id

tags = {
Name = "SwarmsIGW"
}
}

# Create a subnet within the VPC
resource "aws_subnet" "swarms_subnet" {
vpc_id = aws_vpc.swarms_vpc.id
cidr_block = "10.0.1.0/24"
availability_zone = "us-east-1a"

map_public_ip_on_launch = true

tags = {
Name = "SwarmsSubnet"
}
}

# CloudWatch Log Group
resource "aws_cloudwatch_log_group" "swarms_log_group" {
name = "/ec2/swarms/logs"
retention_in_days = 14
}

# CloudWatch Log Stream
resource "aws_cloudwatch_log_stream" "swarms_log_stream" {
name = "InstanceLogStream"
log_group_name = aws_cloudwatch_log_group.swarms_log_group.name
}


# Create a security group in the VPC
resource "aws_security_group" "swarms_sg" {
name = "swarms-sg"
description = "Security Group for EC2 Instances in Swarms VPC"
vpc_id = aws_vpc.swarms_vpc.id

ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}

egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}

tags = {
Name = "swarms-sg"
}
}

# Create a Route 53 hosted zone
resource "aws_route53_zone" "swarms_zone" {
name = "api.swarms.world"
}

# EC2 Instance using the created resources
resource "aws_instance" "swarms_ec2" {
ami = "ami-0ac1f653c5b6af751" # Specify the correct AMI ID
instance_type = "m6i.xlarge"
subnet_id = aws_subnet.swarms_subnet.id
key_name = "your-key-name" # Update with your actual key name
vpc_security_group_ids = [aws_security_group.swarms_sg.id]
associate_public_ip_address = true

tags = {
Name = "sky-sky-serve-controller-3f665020-3f66-head"
}
}

# Route 53 Record to point to the EC2 instance
resource "aws_route53_record" "swarms_record" {
zone_id = aws_route53_zone.swarms_zone.zone_id
name = "api.swarms.world"
type = "A"
ttl = "300"
records = [aws_instance.swarms_ec2.public_ip]
}
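
Note: as written, this VPC has an Internet Gateway but no route table sending 0.0.0.0/0 through it, so SwarmsSubnet is not actually public despite map_public_ip_on_launch. A minimal sketch of the missing plumbing in boto3 terms (the resource IDs are placeholders; in Terraform this would be an aws_route_table plus an aws_route_table_association):

import boto3

# Placeholders: in practice, read these from `terraform output`.
vpc_id, igw_id, subnet_id = "vpc-0123", "igw-0123", "subnet-0123"

ec2 = boto3.client("ec2", region_name="us-east-1")

# Create a route table in the VPC, default all traffic to the IGW,
# and attach it to the subnet.
rt = ec2.create_route_table(VpcId=vpc_id)["RouteTable"]
ec2.create_route(
    RouteTableId=rt["RouteTableId"],
    DestinationCidrBlock="0.0.0.0/0",
    GatewayId=igw_id,
)
ec2.associate_route_table(RouteTableId=rt["RouteTableId"], SubnetId=subnet_id)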
1 change: 0 additions & 1 deletion servers/cogvlm/cogvlm.py
@@ -338,7 +338,6 @@ def generate_stream_cogvlm(
    query, history, image_list = process_history_and_images(messages)
    logger.debug(f"==== request ====\n{query}")


    input_by_model = model.build_conversation_input_ids(
        tokenizer, query=query, history=history, images=[image_list[-1]]
    )
4 changes: 2 additions & 2 deletions servers/cogvlm/openai_api_example.py
@@ -2,10 +2,10 @@

# Set the API key
client = openai.OpenAI(

    # clusters
    # base_url="https://44.203.144.168:30001/v1", api_key="sk-1g3Z3y2c1f2z3d4b5p6w7c8b9v0n1m2"
    base_url="http://40.87.83.103:30001/v1", api_key="sk-1g3Z3y2c1f2z3d4b5p6w7c8b9v0n1m2"
    base_url="http://40.87.83.103:30001/v1",
    api_key="sk-1g3Z3y2c1f2z3d4b5p6w7c8b9v0n1m2"
    # base_url="https://18.189.185.191:8000/v1", api_key="sk-1g3Z3y2c1f2z3d4b5p6w7c8b9v0n1m2"
)
# Create a client object to interact with the OpenAI API
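
The rest of the file is truncated above; a minimal sketch of the kind of request it sends with this client (the model name is a placeholder — query /v1/models on the cluster for the real one):

response = client.chat.completions.create(
    model="cogvlm-chat-17b",  # placeholder model name
    messages=[{"role": "user", "content": "Describe this service in one sentence."}],
)
print(response.choices[0].message.content)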
2 changes: 1 addition & 1 deletion servers/cogvlm/sky_serve.yaml
@@ -82,7 +82,7 @@ envs:

# Fields below describe each replica.
resources:
  accelerators: {L4:8, A10g:8, A100:4, A100:8, A100-80GB:2}
  accelerators: {L4:8, A10g:8, A100:4, A100:8, A100-80GB:2, V100:1}  # V100:4, V100:8, H100:1, T4:1, A10:1
  # cpus: 32+
  # memory: 512+
  # use_spot: True
8 changes: 5 additions & 3 deletions servers/llama3/sky_serve.yaml
@@ -1,5 +1,5 @@
# Serving Meta Llama-3 on your own infra.

# Serving Meta Llama-3 on your own infra.
#
# Usage:
#
# HF_TOKEN=xxx sky launch llama3.yaml -c llama3 --env HF_TOKEN
@@ -10,7 +10,6 @@
#
# # We need to manually specify the stop_token_ids to make sure the model finish
# # on <|eot_id|>.

# curl http://$ENDPOINT/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{
@@ -56,10 +55,12 @@
# ]
# }'


envs:
  # MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
  HF_TOKEN: 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  # Change to your own huggingface token, or use --env to pass.
  HF_HUB_ENABLE_HF_TRANSFER: True

service:
  replicas: 2
Expand Down Expand Up @@ -95,6 +96,7 @@ setup: |
  # Install Gradio for web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
  pip install hf_transfer

run: |
  # Serve VLM
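
The comments in this file stress that stop_token_ids must be set so generation halts on <|eot_id|>. A hedged Python equivalent of the curl example (128009 is Llama-3's <|eot_id|> token id — verify against your tokenizer; the extra_body passthrough assumes a vLLM-style server):

import openai

endpoint = "1.2.3.4:8000"  # placeholder; get the real value from `sky serve status`
client = openai.OpenAI(base_url=f"http://{endpoint}/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Who are you?"}],
    extra_body={"stop_token_ids": [128009]},  # stop on <|eot_id|>
)
print(resp.choices[0].message.content)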
15 changes: 12 additions & 3 deletions servers/llava/llava_request.py
@@ -1,13 +1,22 @@
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

openai_api_base = os.getenv("OPENAI_API_BASE", "http://34.201.45.48:30002/v1")
model = os.getenv("MODEL", "llava")

client = OpenAI(
    api_key=openai_api_key,
    api_key="sk-23232323",
    base_url=openai_api_base,
    timeout=500
)
# Note that this model expects the image to come before the main text
chat_response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    model=model,
    messages=[{
        "role": "user",
        "content": [
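
The rest of the request body is truncated above. A variant sketch for local images, sending a base64 data URL instead of a remote URL (this assumes the server accepts data URLs, as vLLM's OpenAI-compatible server does; client and model are the objects configured in this file):

import base64

with open("photo.jpg", "rb") as f:  # hypothetical local image
    b64 = base64.b64encode(f.read()).decode()

chat_response = client.chat.completions.create(
    model=model,
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }],
)
print(chat_response.choices[0].message.content)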
21 changes: 13 additions & 8 deletions servers/lm_deploy/api.py
@@ -1,22 +1,27 @@
from lmdeploy import pipeline, TurbomindEngineConfig

engine_config = TurbomindEngineConfig(model_format='awq')
engine_config = TurbomindEngineConfig(model_format="awq")
pipe = pipeline("./internlm2-chat-7b-4bit", backend_config=engine_config)
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)

# inference with models from lmdeploy space
from lmdeploy import pipeline, TurbomindEngineConfig
pipe = pipeline("lmdeploy/llama2-chat-70b-4bit",
backend_config=TurbomindEngineConfig(model_format='awq', tp=4))

pipe = pipeline(
"lmdeploy/llama2-chat-70b-4bit",
backend_config=TurbomindEngineConfig(model_format="awq", tp=4),
)
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)

# inference with models from thebloke space
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
pipe = pipeline("TheBloke/LLaMA2-13B-Tiefighter-AWQ",
backend_config=TurbomindEngineConfig(model_format='awq'),
chat_template_config=ChatTemplateConfig(model_name='llama2')
)

pipe = pipeline(
"TheBloke/LLaMA2-13B-Tiefighter-AWQ",
backend_config=TurbomindEngineConfig(model_format="awq"),
chat_template_config=ChatTemplateConfig(model_name="llama2"),
)
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
print(response)
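
None of the calls above set sampling parameters; lmdeploy exposes them through GenerationConfig. A sketch, assuming the same local 4-bit model path:

from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

pipe = pipeline("./internlm2-chat-7b-4bit",
                backend_config=TurbomindEngineConfig(model_format="awq"))
gen_config = GenerationConfig(max_new_tokens=256, top_p=0.8, temperature=0.7)
print(pipe(["Hi, pls intro yourself"], gen_config=gen_config))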
4 changes: 2 additions & 2 deletions servers/lm_deploy/openai_curl.sh
@@ -9,10 +9,10 @@ fi
MODEL=$MODEL

# First run curl on v1/models to get available models
curl http://$OPENAI_API_BASE:8080/v1/models
curl http://34.201.45.48:30002/v1/models

# Run curl on v1/chat/completions with the specified model
curl http://$OPENAI_API_BASE:8080/v1/chat/completions \
curl http://34.201.45.48:30002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'$MODEL'",
26 changes: 14 additions & 12 deletions servers/lm_deploy/openai_vision_request.py
@@ -13,17 +13,19 @@
# Note that this model expects the image to come before the main text
chat_response = client.chat.completions.create(
    model=model,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                },
            },
            {"type": "text", "text": "What's in this image?"},
        ],
    }],
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                    },
                },
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ],
)
print("Chat response:", chat_response)
print("Chat response:", chat_response)
10 changes: 6 additions & 4 deletions servers/text_to_video/sky_serve.yaml
@@ -22,7 +22,7 @@ service:

# Fields below describe each replica.
resources:
  accelerators: {L4:8, A10g:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}  # We can use cheaper accelerators for 8B model.
  # cpus: 32+
  # memory: 512+
  # use_spot: True
@@ -31,10 +31,12 @@ resources:
  ports: 8000  # Expose to internet traffic.
  # spot_recovery: none

# workdir: ~/swarms-cloud/servers/cogvlm

setup: |
  docker build -t ttv .
  git clone https://github.com/kyegomez/swarms-cloud.git && \
  cd swarms-cloud/servers/text_to_video && \
  python3 -m pip install -r requirements.txt

run: |
  docker run --gpus all ttv
  python3 text_to_video.py
2 changes: 1 addition & 1 deletion servers/text_to_video/text_to_video.py
@@ -105,7 +105,7 @@ def text_to_video(
        return None


@app.post("/v1/chat/completions", response_model=TextToVideoResponse)
@app.post("/v1/video/completions", response_model=TextToVideoResponse)
async def create_chat_completion(
    request: TextToVideoRequest,  # token: str = Depends(authenticate_user)
):
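
Renaming the route means clients still posting to /v1/chat/completions will now 404. A hypothetical call against the new path (the TextToVideoRequest schema is defined elsewhere in the file, so the payload field below is a guess):

import requests

resp = requests.post(
    "http://localhost:8000/v1/video/completions",
    json={"prompt": "a drone shot of a rocky coastline at sunset"},  # guessed schema
    timeout=600,
)
print(resp.json())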
22 changes: 0 additions & 22 deletions swarms_cloud/methods.py

This file was deleted.

5 changes: 3 additions & 2 deletions swarms_cloud/stripe_utils.py
@@ -2,8 +2,7 @@

import stripe
from pydantic import BaseModel

stripe.api_key = "your_stripe_api_key"
import os


class StripeInterface(BaseModel):
@@ -13,6 +12,8 @@ class StripeInterface(BaseModel):


def bill_customer(customer_id: str, amount: float, description: str):
    stripe.api_key = os.getenv("STRIPE_API_KEY")

    try:
        stripe.Charge.create(
            customer=customer_id,
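
With the hardcoded key gone, callers must export STRIPE_API_KEY before bill_customer runs. A minimal usage sketch (the customer id is a placeholder, and Stripe's Charge API takes amounts in the currency's smallest unit):

import os

os.environ.setdefault("STRIPE_API_KEY", "sk_test_...")  # placeholder; set in the shell in practice

from swarms_cloud.stripe_utils import bill_customer

bill_customer(customer_id="cus_123", amount=500, description="Swarms API usage")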
