From 8c0f9e84d0d268d984a283b76184e59a0cc616da Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 11 Oct 2023 22:35:54 +0800 Subject: [PATCH] v0.3.0 (#171) --- .github/workflows/codeql.yml | 10 +-- .github/workflows/integration-test-backup.yml | 15 +--- .github/workflows/lint.yml | 12 ++-- .github/workflows/ut-backup.yml | 19 +---- CMakeLists.txt | 2 +- README.md | 36 +++------- docker/base-cuda12.1.dockerfile | 11 ++- docker/dev-cuda11.8.dockerfile | 28 ++++++++ docker/dev-cuda12.1.dockerfile | 27 +++++++ include/mscclpp/core.hpp | 5 +- pyproject.toml | 2 +- python/CMakeLists.txt | 5 +- python/mscclpp/__init__.py | 3 + python/mscclpp/core_py.cpp | 2 + python/test/mscclpp_group.py | 9 +-- python/test/test_mscclpp.py | 72 ++++--------------- src/core.cc | 7 ++ src/include/communicator.hpp | 6 +- src/include/context.hpp | 6 +- src/include/endpoint.hpp | 6 +- test/mscclpp-test/check_perf_result.py | 12 +--- 21 files changed, 133 insertions(+), 162 deletions(-) create mode 100644 docker/dev-cuda11.8.dockerfile create mode 100644 docker/dev-cuda12.1.dockerfile diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 53395a785..b478dc5ae 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,7 +12,7 @@ jobs: name: Analyze runs-on: 'ubuntu-latest' container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }} + image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }} permissions: actions: read @@ -27,7 +27,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Check disk space run: | @@ -38,12 +38,6 @@ jobs: with: languages: ${{ matrix.language }} - - name: Install cmake - run: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp - sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake - - name: Dubious ownership exception run: | git config --global --add safe.directory /__w/mscclpp/mscclpp diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml index 13eb10f0f..24dacf9ec 100644 --- a/.github/workflows/integration-test-backup.yml +++ b/.github/workflows/integration-test-backup.yml @@ -4,7 +4,7 @@ on: workflow_dispatch jobs: IntegrationTest: - runs-on: self-hosted + runs-on: [ self-hosted, A100 ] defaults: run: shell: bash @@ -13,22 +13,17 @@ jobs: cuda: [ cuda11.8, cuda12.1 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: - name: Checkout uses: actions/checkout@v4 - - name: Install CMake - run: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp - - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. make -j - name: Lock GPU clock frequency @@ -41,7 +36,6 @@ jobs: - name: Run mscclpp AllGather test run: | set -e - export PATH=/usr/local/mpi/bin:$PATH mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl @@ -50,13 +44,11 @@ jobs: - name: Run mscclpp SendRecv test run: | set -e - export PATH=/usr/local/mpi/bin:$PATH mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - name: Run mscclpp AllReduce test run: | set -e - export PATH=/usr/local/mpi/bin:$PATH mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl @@ -68,7 +60,6 @@ jobs: - name: Run mscclpp AllToAll test run: | set -e - export PATH=/usr/local/mpi/bin:$PATH mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 9cdcf443d..aaffe9578 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install ClangFormat run: | @@ -28,25 +28,25 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3 - name: Install Python dependencies - run: python3.8 -m pip install black + run: python3 -m pip install black - name: Run black - run: python3.8 -m black --check --config pyproject.toml . + run: python3 -m black --check --config pyproject.toml . spelling: runs-on: ubuntu-20.04 steps: - name: Check out Git repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Download misspell run: | diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml index 9bcbf53b2..df8db2cbb 100644 --- a/.github/workflows/ut-backup.yml +++ b/.github/workflows/ut-backup.yml @@ -4,7 +4,7 @@ on: workflow_dispatch jobs: UnitTest: - runs-on: self-hosted + runs-on: [ self-hosted, A100 ] defaults: run: shell: bash @@ -14,7 +14,7 @@ jobs: cuda: [ cuda11.8, cuda12.1 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -23,10 +23,8 @@ jobs: - name: Build run: | - curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. make -j working-directory: ${{ github.workspace }} @@ -36,31 +34,20 @@ jobs: for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i done - working-directory: ${{ github.workspace }} - name: UnitTests run: | ./build/test/unit_tests - working-directory: ${{ github.workspace }} - name: MpUnitTests run: | set -e - export PATH=/usr/local/mpi/bin:$PATH mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests - working-directory: ${{ github.workspace }} - name: PyTests run: | set -e - export PATH=/usr/local/mpi/bin:$PATH cd build && make pylib-copy - if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then - python3 -m pip install -r ../python/test/requirements_cu11.txt - else - python3 -m pip install -r ../python/test/requirements_cu12.txt - fi mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x - working-directory: ${{ github.workspace }} diff --git a/CMakeLists.txt b/CMakeLists.txt index c20c4cef6..3b33a6e96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # Licensed under the MIT license. set(MSCCLPP_MAJOR "0") -set(MSCCLPP_MINOR "2") +set(MSCCLPP_MINOR "3") set(MSCCLPP_PATCH "0") set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) diff --git a/README.md b/README.md index 56a2fcf1e..7f0112ec1 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a * **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime. -## Key Features (v0.2) +## Key Features (v0.3) -MSCCL++ v0.2 supports the following features. +MSCCL++ v0.3 supports the following features. ### In-Kernel Communication Interfaces @@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases. -## Status & Roadmap +### New in MSCCL++ v0.3 (Latest Release) +* Updated interfaces +* Add Python bindings and interfaces +* Add Python unit tests +* Add more configurable parameters +* Add a new single-node AllReduce kernel +* Fix bugs -MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version. - -### MSCCL++ v0.4 (TBU) -* Automatic task scheduler -* Dynamic performance tuning - -### MSCCL++ v0.3 (TBU) -* Tile-based communication: efficient transport of 2D data patches (tiles) -* GPU computation interfaces - -### MSCCL++ v0.2 (Latest Release) -* Basic communication functionalities and new interfaces - - GPU-side communication interfaces - - Host-side helpers: bootstrap, communicator, and proxy - - Supports both NVLink and InfiniBand - - Supports both in-SM copy and DMA/RDMA -* Communication performance optimization - - Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll -* Development pipeline -* Documentation - -### MSCCL++ v0.1 -* Proof-of-concept, preliminary interfaces +See details from https://github.com/microsoft/mscclpp/issues/89. ## Contributing diff --git a/docker/base-cuda12.1.dockerfile b/docker/base-cuda12.1.dockerfile index b28a1995f..5c5bcd602 100644 --- a/docker/base-cuda12.1.dockerfile +++ b/docker/base-cuda12.1.dockerfile @@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && \ +RUN rm -rf /opt/nvidia + +RUN apt-get clean && \ + apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ @@ -47,8 +50,10 @@ RUN cd /tmp && \ cd .. && \ rm -rf /tmp/openmpi-${OPENMPI_VERSION}* -ENV PATH="${PATH}:/usr/local/mpi/bin" \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" +ENV PATH="/usr/local/mpi/bin:${PATH}" \ + LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}" RUN echo PATH="${PATH}" > /etc/environment && \ echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment + +ENTRYPOINT [] diff --git a/docker/dev-cuda11.8.dockerfile b/docker/dev-cuda11.8.dockerfile new file mode 100644 index 000000000..094772b06 --- /dev/null +++ b/docker/dev-cuda11.8.dockerfile @@ -0,0 +1,28 @@ +FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + +LABEL maintainer="MSCCL++" +LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp + +ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ + CMAKE_VERSION="3.26.4" + +ADD . ${MSCCLPP_SRC_DIR} +WORKDIR ${MSCCLPP_SRC_DIR} + +# Install cmake 3.26.4 +ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ + CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" +RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ + tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ + rm -rf ${CMAKE_HOME}.tar.gz +ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" + +# Install pytest & dependencies +RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt + +# Set PATH +RUN echo PATH="${PATH}" > /etc/environment + +# Cleanup +WORKDIR / +RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile new file mode 100644 index 000000000..70fe684c1 --- /dev/null +++ b/docker/dev-cuda12.1.dockerfile @@ -0,0 +1,27 @@ +FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + +LABEL maintainer="MSCCL++" +LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp + +ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ + CMAKE_VERSION="3.26.4" + +ADD . ${MSCCLPP_SRC_DIR} +WORKDIR ${MSCCLPP_SRC_DIR} + +# Install cmake 3.26.4 +ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ + CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" +RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ + tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local +ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" + +# Install pytest & dependencies +RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt + +# Set PATH +RUN echo PATH="${PATH}" > /etc/environment + +# Cleanup +WORKDIR / +RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index ea7b14602..306398fb0 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -5,7 +5,7 @@ #define MSCCLPP_CORE_HPP_ #define MSCCLPP_MAJOR 0 -#define MSCCLPP_MINOR 2 +#define MSCCLPP_MINOR 3 #define MSCCLPP_PATCH 0 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) @@ -24,6 +24,9 @@ namespace mscclpp { /// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process. using UniqueId = std::array; +/// Return a version string. +std::string version(); + /// Base class for bootstraps. class Bootstrap { public: diff --git a/pyproject.toml b/pyproject.toml index 698754bf6..5902c9464 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build" [project] name = "mscclpp" -version = "0.2.0" +version = "0.3.0" [tool.scikit-build] cmake.minimum-version = "3.25.0" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7776be62c..6bb8e2700 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -7,11 +7,10 @@ add_subdirectory(test) add_custom_target(pylib-copy) add_custom_command(TARGET pylib-copy POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so + ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so + ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries" ) - diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 89e889a22..5165e95cb 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -18,8 +18,11 @@ TcpBootstrap, Transport, TransportFlags, + version, ) +__version__ = version() + def get_include(): """Return the directory that contains the MSCCL++ headers.""" diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 1d1a064ce..60ceb96cc 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -29,6 +29,8 @@ void def_nonblocking_future(nb::handle& m, const std::string& typestr) { } void register_core(nb::module_& m) { + m.def("version", &version); + nb::class_(m, "Bootstrap") .def("get_rank", &Bootstrap::getRank) .def("get_n_ranks", &Bootstrap::getNranks) diff --git a/python/test/mscclpp_group.py b/python/test/mscclpp_group.py index 1b6138467..7a7c7b017 100644 --- a/python/test/mscclpp_group.py +++ b/python/test/mscclpp_group.py @@ -127,10 +127,7 @@ def make_sm_channels_with_packet( channels = {} for rank in connections: channels[rank] = SmChannel( - semaphores[rank], - registered_memories[rank], - tensor.data.ptr, - packetTensor.data.ptr, + semaphores[rank], registered_memories[rank], tensor.data.ptr, packetTensor.data.ptr ) return channels @@ -148,8 +145,6 @@ def make_proxy_channels_with_packet( channels = {} for rank in semaphores: channels[rank] = SimpleProxyChannel( - proxy_service.proxy_channel(semaphore_ids[rank]), - memory_ids[rank], - memory_ids[self.my_rank], + proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank] ) return channels diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 0be3b2126..6674f4ea0 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -9,14 +9,7 @@ import netifaces as ni import pytest -from mscclpp import ( - Fifo, - Host2DeviceSemaphore, - Host2HostSemaphore, - ProxyService, - SmDevice2DeviceSemaphore, - Transport, -) +from mscclpp import Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, Transport from ._cpp import _ext from .mscclpp_group import MscclppGroup from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group @@ -61,11 +54,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str): for rank in range(group.nranks): if rank == group.my_rank: continue - group.send( - memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))], - rank, - 0, - ) + group.send(memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))], rank, 0) for rank in range(group.nranks): if rank == group.my_rank: continue @@ -207,43 +196,31 @@ def __init__( ): if test_name == "h2d_semaphore": self._kernel = KernelBuilder( - file="h2d_semaphore_test.cu", - kernel_name="h2d_semaphore", + file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore" ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks elif test_name == "d2d_semaphore": self._kernel = KernelBuilder( - file="d2d_semaphore_test.cu", - kernel_name="d2d_semaphore", + file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore" ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks elif test_name == "sm_channel": - self._kernel = KernelBuilder( - file="sm_channel_test.cu", - kernel_name="sm_channel", - ).get_compiled_kernel() + self._kernel = KernelBuilder(file="sm_channel_test.cu", kernel_name="sm_channel").get_compiled_kernel() self.nblocks = nranks self.nthreads = 1024 elif test_name == "fifo": - self._kernel = KernelBuilder( - file="fifo_test.cu", - kernel_name="fifo", - ).get_compiled_kernel() + self._kernel = KernelBuilder(file="fifo_test.cu", kernel_name="fifo").get_compiled_kernel() self.nblocks = 1 self.nthreads = 1 elif test_name == "proxy": - self._kernel = KernelBuilder( - file="proxy_test.cu", - kernel_name="proxy", - ).get_compiled_kernel() + self._kernel = KernelBuilder(file="proxy_test.cu", kernel_name="proxy").get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks elif test_name == "simple_proxy_channel": self._kernel = KernelBuilder( - file="simple_proxy_channel_test.cu", - kernel_name="simple_proxy_channel", + file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel" ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1024 @@ -364,17 +341,10 @@ def test_fifo( @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -def test_proxy( - mpi_group: MpiGroup, - nelem: int, - transport: str, -): +def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): group, connections = create_and_connect(mpi_group, transport) - memory = cp.zeros( - nelem, - dtype=cp.int32, - ) + memory = cp.zeros(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks nelemPerRank * memory.itemsize memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1 @@ -401,23 +371,12 @@ def test_proxy( list_reg_mem.append(all_reg_memories[rank]) - proxy = _ext.MyProxyService( - group.my_rank, - group.nranks, - nelem * memory.itemsize, - list_conn, - list_reg_mem, - list_sem, - ) + proxy = _ext.MyProxyService(group.my_rank, group.nranks, nelem * memory.itemsize, list_conn, list_reg_mem, list_sem) fifo_device_handle = proxy.fifo_device_handle() kernel = MscclppKernel( - "proxy", - my_rank=group.my_rank, - nranks=group.nranks, - semaphore_or_channels=list_sem, - fifo=fifo_device_handle, + "proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle ) proxy.start() group.barrier() @@ -432,12 +391,7 @@ def test_proxy( @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) -def test_simple_proxy_channel( - mpi_group: MpiGroup, - nelem: int, - transport: str, - use_packet: bool, -): +def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): group, connections = create_and_connect(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) diff --git a/src/core.cc b/src/core.cc index 0282b2e9a..4d89250d0 100644 --- a/src/core.cc +++ b/src/core.cc @@ -2,11 +2,18 @@ // Licensed under the MIT license. #include +#include #include "api.h" namespace mscclpp { +MSCCLPP_API_CPP std::string version() { + std::stringstream ss; + ss << MSCCLPP_MAJOR << "." << MSCCLPP_MINOR << "." << MSCCLPP_PATCH; + return ss.str(); +} + MSCCLPP_API_CPP TransportFlags::TransportFlags(Transport transport) : detail::TransportFlagsBase(1 << static_cast(transport)) {} diff --git a/src/include/communicator.hpp b/src/include/communicator.hpp index 0f868b140..55b5d5724 100644 --- a/src/include/communicator.hpp +++ b/src/include/communicator.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCL_COMMUNICATOR_HPP_ -#define MSCCL_COMMUNICATOR_HPP_ +#ifndef MSCCLPP_COMMUNICATOR_HPP_ +#define MSCCLPP_COMMUNICATOR_HPP_ #include #include @@ -31,4 +31,4 @@ struct Communicator::Impl { } // namespace mscclpp -#endif // MSCCL_COMMUNICATOR_HPP_ +#endif // MSCCLPP_COMMUNICATOR_HPP_ diff --git a/src/include/context.hpp b/src/include/context.hpp index 11cc98d7d..6468b1d33 100644 --- a/src/include/context.hpp +++ b/src/include/context.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCL_CONTEXT_HPP_ -#define MSCCL_CONTEXT_HPP_ +#ifndef MSCCLPP_CONTEXT_HPP_ +#define MSCCLPP_CONTEXT_HPP_ #include #include @@ -25,4 +25,4 @@ struct Context::Impl { } // namespace mscclpp -#endif // MSCCL_CONTEXT_HPP_ +#endif // MSCCLPP_CONTEXT_HPP_ diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index f246012c5..311fa9982 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCL_ENDPOINT_HPP_ -#define MSCCL_ENDPOINT_HPP_ +#ifndef MSCCLPP_ENDPOINT_HPP_ +#define MSCCLPP_ENDPOINT_HPP_ #include #include @@ -26,4 +26,4 @@ struct Endpoint::Impl { } // namespace mscclpp -#endif // MSCCL_ENDPOINT_HPP_ +#endif // MSCCLPP_ENDPOINT_HPP_ diff --git a/test/mscclpp-test/check_perf_result.py b/test/mscclpp-test/check_perf_result.py index d5c5469a4..1430526ec 100644 --- a/test/mscclpp-test/check_perf_result.py +++ b/test/mscclpp-test/check_perf_result.py @@ -16,17 +16,9 @@ def load_perf_file(perf_fine: str) -> dict: "time": data["time"], } if "target" in data: - res[ - ( - data["name"], - data["kernel"], - data["ranks"], - data["ranksPerNode"], - data["size"], - ) - ][ + res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[ "target" - ] = data["target"] + ] return res