Skip to content

Commit

Permalink
Merge branch 'main' into gpu-support
Browse files Browse the repository at this point in the history
  • Loading branch information
gaby authored Feb 24, 2024
2 parents 235d65c + e1f966a commit 9dc8f42
Show file tree
Hide file tree
Showing 9 changed files with 204 additions and 98 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,13 @@ Instructions for setting up Serge on Kubernetes can be found in the [wiki](https
| **Alfred** | 40B-1023 |
| **Code** | 13B, 33B |
| **CodeLLaMA** | 7B, 7B-Instruct, 7B-Python, 13B, 13B-Instruct, 13B-Python, 34B, 34B-Instruct, 34B-Python |
| **Gemma** | 2B, 7B |
| **Falcon** | 7B, 7B-Instruct, 40B, 40B-Instruct |
| **LLaMA 2** | 7B, 7B-Chat, 7B-Coder, 13B, 13B-Chat, 70B, 70B-Chat, 70B-OASST |
| **LLaMA Pro** | 8B, 8B-Instruct |
| **Med42** | 70B |
| **Medalpaca** | 13B |
| **Medicine-LLM** | 13B |
| **Medicine** | Chat, LLM, LLM-13B |
| **Meditron** | 7B, 7B-Chat, 70B |
| **Mistral** | 7B-V0.1, 7B-Instruct-v0.2, 7B-OpenOrca |
| **MistralLite** | 7B |
Expand All @@ -79,6 +81,7 @@ Instructions for setting up Serge on Kubernetes can be found in the [wiki](https
| **Python Code** | 13B, 33B |
| **PsyMedRP** | 13B-v1, 20B-v1 |
| **Starling LM** | 7B-Alpha |
| **TinyLlama** | 1.1B |
| **Vicuna** | 7B-v1.5, 13B-v1.5, 33B-v1.3, 33B-Coder |
| **WizardLM** | 7B-v1.0, 13B-v1.2, 70B-v1.0 |
| **Zephyr** | 3B, 7B-Alpha, 7B-Beta |
Expand Down
18 changes: 9 additions & 9 deletions api/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ pydantic = "^1.10.14"
sse-starlette = "^1.8.2"
starlette = "^0.26.1"
typing-extensions = "^4.9.0"
urllib3 = "^2.2.0"
urllib3 = "^2.2.1"
fastapi = "^0.95.1"
huggingface-hub = "^0.20.3"
requests = "^2.31.0"
langchain = "^0.0.180"
loguru = "^0.7.2"
redis = {extras = ["hiredis"], version = "^5.0.1"}
pytest = "^8.0.0"
pytest = "^8.0.1"
hypercorn = {extras = ["trio"], version = "^0.16.0"}

[tool.ruff]
Expand Down
98 changes: 95 additions & 3 deletions api/src/serge/data/models.json
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,33 @@
}
]
},
{
"name": "Gemma",
"models": [
{
"name": "Gemma-2B",
"repo": "MaziyarPanahi/gemma-2b-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "gemma-2b.Q4_K_M.gguf",
"disk_space": 1495245728.0
}
]
},
{
"name": "Gemma-7B",
"repo": "MaziyarPanahi/gemma-7b-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "gemma-7b.Q4_K_M.gguf",
"disk_space": 5127231648.0
}
]
}
]
},
{
"name": "LLaMA_2",
"models": [
Expand Down Expand Up @@ -287,6 +314,33 @@
]
}
]
},
{
"name": "LLaMA-Pro",
"models": [
{
"name": "Llama-Pro-8B",
"repo": "TheBloke/LLaMA-Pro-8B-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "llama-pro-8b.Q4_K_M.gguf",
"disk_space": 5055758336.0
}
]
},
{
"name": "Llama-Pro-8B-Instruct",
"repo": "TheBloke/LLaMA-Pro-8B-Instruct-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "llama-pro-8b-instruct.Q4_K_M.gguf",
"disk_space": 5055758688.0
}
]
}
]
},
{
"name": "Med42",
Expand Down Expand Up @@ -321,10 +375,32 @@
]
},
{
"name": "medicine-LLM",
"name": "Medicine",
"models": [
{
"name": "Medicine LLM 13B",
"name": "Medicine-Chat",
"repo": "TheBloke/medicine-chat-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "medicine-chat.Q4_K_M.gguf",
"disk_space": 4081010048.0
}
]
},
{
"name": "Medicine-LLM",
"repo": "TheBloke/medicine-LLM-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "medicine-llm.Q4_K_M.gguf",
"disk_space": 4081009920.0
}
]
},
{
"name": "Medicine-LLM-13B",
"repo": "TheBloke/medicine-LLM-13B-GGUF",
"files": [
{
Expand All @@ -335,7 +411,7 @@
]
}
]
},
},
{
"name": "Meditron",
"models": [
Expand Down Expand Up @@ -696,6 +772,22 @@
]
}
]
},
{
"name": "Tinyllama",
"models": [
{
"name": "Tinyllama-1.1B-Chat-v1.0",
"repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
"files": [
{
"name": "q4_K_M",
"filename": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
"disk_space": 668788096.0
}
]
}
]
},
{
"name": "Vicuna",
Expand Down
7 changes: 6 additions & 1 deletion scripts/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ if [ "$cpu_arch" = "aarch64" ]; then
else
# Use @smartappli provided wheels
cpu_feature=$(detect_cpu_features)

if [ "$SERGE_GPU_NVIDIA_SUPPORT" = true ]; then
pip_command="python -m pip install -v llama-cpp-python==$LLAMA_PYTHON_VERSION --only-binary=:all: --extra-index-url=https://smartappli.github.io/serge-wheels/$cpu_feature/cu122"
elif [ "$SERGE_GPU_AMD_SUPPORT" = true ]; then
Expand Down Expand Up @@ -56,7 +57,11 @@ redis_process=$!
# Start the API
cd /usr/src/app/api || exit 1
hypercorn_cmd="hypercorn src.serge.main:app --bind 0.0.0.0:8008"
[ "$SERGE_ENABLE_IPV6" = true ] && hypercorn_cmd+=" --bind [::]:8008"
if [ "$SERGE_ENABLE_IPV6" = true ] && [ "$SERGE_ENABLE_IPV4" != true ]; then
hypercorn_cmd="hypercorn src.serge.main:app --bind [::]:8008"
elif [ "$SERGE_ENABLE_IPV4" = true ] && [ "$SERGE_ENABLE_IPV6" = true ]; then
hypercorn_cmd="hypercorn src.serge.main:app --bind 0.0.0.0:8008 --bind [::]:8008"
fi

$hypercorn_cmd || {
echo 'Failed to start main app'
Expand Down
9 changes: 7 additions & 2 deletions scripts/dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ if [ "$cpu_arch" = "aarch64" ]; then
else
# Use @smartappli provided wheels
cpu_feature=$(detect_cpu_features)

if [ "$SERGE_GPU_NVIDIA_SUPPORT" = true ]; then
pip_command="python -m pip install -v llama-cpp-python==$LLAMA_PYTHON_VERSION --only-binary=:all: --extra-index-url=https://smartappli.github.io/serge-wheels/$cpu_feature/cu122"
elif [ "$SERGE_GPU_AMD_SUPPORT" = true ]; then
Expand Down Expand Up @@ -64,8 +65,12 @@ npm run dev -- --host 0.0.0.0 --port 8008 &

# Start the API
cd /usr/src/app/api || exit 1
hypercorn_cmd="hypercorn src.serge.main:api_app --reload --bind 0.0.0.0:9124"
[ "$SERGE_ENABLE_IPV6" = true ] && hypercorn_cmd+=" --bind [::]:9124"
hypercorn_cmd="hypercorn src.serge.main:api_app --bind 0.0.0.0:9124"
if [ "$SERGE_ENABLE_IPV6" = true ] && [ "$SERGE_ENABLE_IPV4" != true ]; then
hypercorn_cmd="hypercorn src.serge.main:api_app --bind [::]:9124"
elif [ "$SERGE_ENABLE_IPV4" = true ] && [ "$SERGE_ENABLE_IPV6" = true ]; then
hypercorn_cmd="hypercorn src.serge.main:api_app --bind 0.0.0.0:9124 --bind [::]:9124"
fi

$hypercorn_cmd || {
echo 'Failed to start main app'
Expand Down
3 changes: 2 additions & 1 deletion scripts/serge.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
SERGE_GPU_NVIDIA_SUPPORT=false
SERGE_GPU_AMD_SUPPORT=false
LLAMA_PYTHON_VERSION=0.2.44
LLAMA_PYTHON_VERSION=0.2.50
SERGE_ENABLE_IPV4=true
SERGE_ENABLE_IPV6=false
Loading

0 comments on commit 9dc8f42

Please sign in to comment.