diff --git a/.github/workflows/build-triton.yml b/.github/workflows/build-triton.yml
new file mode 100644
index 0000000..3c5360f
--- /dev/null
+++ b/.github/workflows/build-triton.yml
@@ -0,0 +1,70 @@
+name: Build Triton wheel
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.ref }}
+          repository: ${{ github.event.inputs.repository }}
+
+      - name: Resolve SOURCE_DATE_EPOCH
+        id: source-date-epoch
+        run: |
+          echo "SOURCE_DATE_EPOCH=$(git log -1 --format=%ct)" >> $GITHUB_OUTPUT
+
+      - name: Apply patches
+        run: |
+          set -euo pipefail
+
+          # List of patches
+          patches=(
+            "https://raw.githubusercontent.com/${{ github.repository }}/${{ github.sha }}/patches/triton/0000-add-support-for-conversion-fp16-to-fp32.patch"
+            "https://raw.githubusercontent.com/${{ github.repository }}/${{ github.sha }}/patches/triton/0000-fix-max-ptx-version.patch"
+          )
+
+          # Apply patches
+          for patch in "${patches[@]}"; do
+            echo "Applying patch $patch"
+            curl -fsSL "$patch" | patch -p1
+          done
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.18.0
+        with:
+          package-dir: python
+        env:
+          CIBW_BUILD: cp311-manylinux_x86_64
+          SOURCE_DATE_EPOCH: ${{ steps.source-date-epoch.outputs.SOURCE_DATE_EPOCH }}
+
+      - name: Create release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: wheelhouse/*.whl
+          tag_name: ${{ github.event.inputs.tag_name }}
+
+on:
+  workflow_dispatch:
+    inputs:
+      repository:
+        default: triton-lang/triton
+        description: Source repository
+        required: true
+        type: string
+
+      ref:
+        default: 3f8d91bb17f6e7bc33dc995ae0860db89d351c7b
+        description: Source ref
+        required: true
+        type: string
+
+      tag_name:
+        description: Target tag
+        required: true
+        type: string
+
+permissions:
+  contents: write
diff --git a/.github/workflows/build-vllm.yml b/.github/workflows/build-vllm.yml
new file mode 100644
index 0000000..8ace2b1
--- /dev/null
+++ b/.github/workflows/build-vllm.yml
@@ -0,0 +1,72 @@
+name: Build vLLM wheel
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.ref }}
+          repository: ${{ github.event.inputs.repository }}
+
+      - name: Resolve SOURCE_DATE_EPOCH
+        id: source-date-epoch
+        run: |
+          echo "SOURCE_DATE_EPOCH=$(git log -1 --format=%ct)" >> $GITHUB_OUTPUT
+
+      - name: Apply patches
+        run: |
+          set -euo pipefail
+
+          # List of patches
+          patches=(
+            "https://raw.githubusercontent.com/${{ github.repository }}/${{ github.sha }}/patches/vllm/0000-enable-support-for-pascal-gpus.patch"
+            "https://raw.githubusercontent.com/${{ github.repository }}/${{ github.sha }}/patches/vllm/1000-set-torch-cuda-arch-list.patch"
+            "https://raw.githubusercontent.com/${{ github.repository }}/${{ github.sha }}/patches/vllm/9000-add-vllm-command-that-launches-api-server.patch"
+          )
+
+          # Apply patches
+          for patch in "${patches[@]}"; do
+            echo "Applying patch $patch"
+            curl -fsSL "$patch" | patch -p1
+          done
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.18.0
+        env:
+          CIBW_BUILD: cp311-manylinux_x86_64
+          CIBW_MANYLINUX_PYPY_X86_64_IMAGE: ghcr.io/sasha0552/manylinux2014_x86_64-cuda
+          CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/sasha0552/manylinux2014_x86_64-cuda
+          CIBW_REPAIR_WHEEL_COMMAND: ~
+          SOURCE_DATE_EPOCH: ${{ steps.source-date-epoch.outputs.SOURCE_DATE_EPOCH }}
+
+      - name: Create release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: wheelhouse/*.whl
+          tag_name: ${{ github.event.inputs.tag_name }}
+
+on:
+  workflow_dispatch:
+    inputs:
+      repository:
+        default: vllm-project/vllm
+        description: Source repository
+        required: true
+        type: string
+
+      ref:
+        default: main
+        description: Source ref
+        required: true
+        type: string
+
+      tag_name:
+        description: Target tag
+        required: true
+        type: string
+
+permissions:
+  contents: write
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..455d5b4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 sasha0552
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c0d9fd7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# vllm-ci
+
+CI scripts designed to build a Pascal-compatible version of vLLM.
diff --git a/patches/triton/0000-add-support-for-conversion-fp16-to-fp32.patch b/patches/triton/0000-add-support-for-conversion-fp16-to-fp32.patch
new file mode 100644
index 0000000..ab1b1ec
--- /dev/null
+++ b/patches/triton/0000-add-support-for-conversion-fp16-to-fp32.patch
@@ -0,0 +1,18 @@
+--- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
++++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
+@@ -859,9 +859,12 @@ private:
+ 
+ static Value promoteOperand(OpBuilder &builder, Location loc, Value operand,
+                             Type promotedType) {
+-  Type tensorPromotedType =
+-      operand.getType().cast<RankedTensorType>().cloneWith(std::nullopt,
+-                                                           promotedType);
++  RankedTensorType tensor = operand.getType().cast<RankedTensorType>();
++  Type tensorElementType = tensor.getElementType();
++  Type tensorPromotedType = tensor.cloneWith(std::nullopt, promotedType);
++  if (tensorElementType.isF16() && promotedType.isF32()) {
++    return builder.create<arith::ExtFOp>(loc, tensorPromotedType, operand);
++  }
+   return builder.create<triton::FpToFpOp>(loc, tensorPromotedType, operand);
+ }
+ 
diff --git a/patches/triton/0000-fix-max-ptx-version.patch b/patches/triton/0000-fix-max-ptx-version.patch
new file mode 100644
index 0000000..ccc9c38
--- /dev/null
+++ b/patches/triton/0000-fix-max-ptx-version.patch
@@ -0,0 +1,21 @@
+--- a/lib/Target/PTX/PTXTranslation.cpp
++++ b/lib/Target/PTX/PTXTranslation.cpp
+@@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version,
+   // LLVM version in use may not officially support target hardware.
+   // Supported versions for LLVM 14 are here:
+   // https://github.com/llvm/llvm-project/blob/f28c006a5895fc0e329fe15fead81e37457cb1d1/clang/include/clang/Basic/BuiltinsNVPTX.def
+-  int maxPTX = std::min(82, version);
++  int maxPTX = std::min(80, version);
+   int maxCC = std::min(90, cc);
+   // options
+   auto options = llvm::cl::getRegisteredOptions();
+@@ -65,8 +65,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version,
+   std::string triple = "nvptx64-nvidia-cuda";
+   std::string proc = "sm_" + std::to_string(maxCC);
+   std::string layout = "";
+-  std::string features = "";
+-  // std::string features = "+ptx" + std::to_string(maxPTX);
++  std::string features = "+ptx" + std::to_string(maxPTX);
+   for (llvm::Function &f : module.functions()) {
+     if (!f.hasFnAttribute(llvm::Attribute::NoInline))
+       f.addFnAttr(llvm::Attribute::AlwaysInline);
diff --git a/patches/vllm/0000-enable-support-for-pascal-gpus.patch b/patches/vllm/0000-enable-support-for-pascal-gpus.patch
new file mode 100644
index 0000000..627360a
--- /dev/null
+++ b/patches/vllm/0000-enable-support-for-pascal-gpus.patch
@@ -0,0 +1,11 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -16,7 +16,7 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
+ 
+ # Supported NVIDIA architectures.
+-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
++set(CUDA_SUPPORTED_ARCHS "6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0")
+ 
+ # Supported AMD GPU architectures.
+ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
diff --git a/patches/vllm/1000-set-torch-cuda-arch-list.patch b/patches/vllm/1000-set-torch-cuda-arch-list.patch
new file mode 100644
index 0000000..32256f8
--- /dev/null
+++ b/patches/vllm/1000-set-torch-cuda-arch-list.patch
@@ -0,0 +1,10 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -17,6 +17,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
+ 
+ # Supported NVIDIA architectures.
+ set(CUDA_SUPPORTED_ARCHS "6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0")
++set(TORCH_CUDA_ARCH_LIST "${CUDA_SUPPORTED_ARCHS}")
+ 
+ # Supported AMD GPU architectures.
+ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
diff --git a/patches/vllm/9000-add-vllm-command-that-launches-api-server.patch b/patches/vllm/9000-add-vllm-command-that-launches-api-server.patch
new file mode 100644
index 0000000..28dd2ea
--- /dev/null
+++ b/patches/vllm/9000-add-vllm-command-that-launches-api-server.patch
@@ -0,0 +1,20 @@
+--- a/setup.py
++++ b/setup.py
+@@ -430,4 +430,9 @@ def _read_requirements(filename: str) -> List[str]:
+     },
+     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
+     package_data=package_data,
++    entry_points={
++        "console_scripts": [
++            "vllm=vllm.scripts:main",
++        ],
++    },
+ )
+--- /dev/null
++++ b/vllm/scripts.py
+@@ -0,0 +1,5 @@
++import subprocess
++import sys
++
++def main():
++    subprocess.run([sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + sys.argv[1:])
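
Both workflows are workflow_dispatch-only, so a build has to be triggered by hand. A minimal sketch of doing that with the GitHub CLI, assuming the workflows sit on the default branch of a repository that gh is authenticated against; the tag_name values are placeholders, not tags defined by this project:

    # Build a vLLM wheel from upstream main and publish it under a new release tag
    gh workflow run build-vllm.yml \
      -f repository=vllm-project/vllm \
      -f ref=main \
      -f tag_name=<release-tag>

    # Build the Triton wheel from the pinned revision
    gh workflow run build-triton.yml \
      -f repository=triton-lang/triton \
      -f ref=3f8d91bb17f6e7bc33dc995ae0860db89d351c7b \
      -f tag_name=<release-tag>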
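
The "Apply patches" step can also be reproduced locally to check that the patch set still applies before dispatching a build. A rough equivalent, assuming this repository is published as sasha0552/vllm-ci and substituting its main branch for the ${{ github.repository }}/${{ github.sha }} pair that the workflow resolves at run time:

    # Fetch a vLLM checkout and apply the three patches in order
    git clone https://github.com/vllm-project/vllm && cd vllm
    for p in 0000-enable-support-for-pascal-gpus \
             1000-set-torch-cuda-arch-list \
             9000-add-vllm-command-that-launches-api-server; do
      curl -fsSL "https://raw.githubusercontent.com/sasha0552/vllm-ci/main/patches/vllm/$p.patch" | patch -p1
    done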