From 5c9623f8c155a3333750079058517f4d26a3b7cc Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Tue, 11 Jun 2024 12:19:06 -0700 Subject: [PATCH 1/8] Fix RelWithDebInfo build. Signed-off-by: Ilya Enkovich --- python/src/ir.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/src/ir.cc b/python/src/ir.cc index 097fb999edbd..7a20ce57170d 100644 --- a/python/src/ir.cc +++ b/python/src/ir.cc @@ -1615,7 +1615,10 @@ void init_triton_ir(py::module &&m) { }); ::llvm::DebugFlag = true; - ::llvm::setCurrentDebugTypes(debugTypes.data(), debugTypes.size()); + // For release build setCurrentDebugTypes is a macro, so avoid + // namespace prefix + using namespace llvm; + setCurrentDebugTypes(debugTypes.data(), debugTypes.size()); } if (failed(self.run(mod.getOperation()))) From 9b33fa94089f0d93b631f7c7fdf9570af9d58a40 Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Tue, 11 Jun 2024 12:21:28 -0700 Subject: [PATCH 2/8] Skip fp8 cast tests on CPU. Signed-off-by: Ilya Enkovich --- python/test/unit/language/test_core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 50b5248e36e9..f99e0a779f06 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -1601,6 +1601,9 @@ def test_cast(dtype_x, dtype_z, bitcast, size, num_ctas, device): if is_hip() and (dtype_z in ("bfloat16", "float8_e4m3fn") or dtype_x == "float8_e4m3fn"): pytest.skip(f'test_cast{(dtype_x, dtype_z)} cast to bfloat16 not supported on HIP.') + if is_cpu() and (dtype_x in torch_float8_dtypes or dtype_z in torch_float8_dtypes): + pytest.skip(f'test_cast{(dtype_x, dtype_z)} is not supported on CPU.') + # bf16 vector cast is broken in LLVM for large vectors: # https://github.com/llvm/llvm-project/issues/92471 # TODO: Remove the change after the bug is fixed. From 5b05baa65fac1e2bbd62b4de74f495ac16f4eece Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Tue, 11 Jun 2024 13:51:00 -0700 Subject: [PATCH 3/8] Fix segfault. 
Signed-off-by: Ilya Enkovich --- third_party/cpu/lib/TritonToTritonCPU/ReduceScanCommon.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/third_party/cpu/lib/TritonToTritonCPU/ReduceScanCommon.h b/third_party/cpu/lib/TritonToTritonCPU/ReduceScanCommon.h index b2edc5e98b36..ba2d64d8f5f0 100644 --- a/third_party/cpu/lib/TritonToTritonCPU/ReduceScanCommon.h +++ b/third_party/cpu/lib/TritonToTritonCPU/ReduceScanCommon.h @@ -225,10 +225,12 @@ struct ReduceScanOpConversionBase : public OpConversionPattern { createShuffleDummies(Location loc, ValueRange inputs, ConversionPatternRewriter &rewriter) const { if (shuffleDummies.empty()) { + SmallVector dummyShape({1}); for (auto val : inputs) { auto ty = cast(val.getType()); shuffleDummies.push_back(rewriter.create( - loc, rewriter.getZeroAttr(ty.cloneWith(1, ty.getElementType())))); + loc, rewriter.getZeroAttr( + ty.cloneWith(dummyShape, ty.getElementType())))); } } return shuffleDummies; From 98a656ef6c39d4ad6c0022c9b32fc72d04b469a3 Mon Sep 17 00:00:00 2001 From: Pablo Zimmermann Date: Mon, 3 Jun 2024 16:45:16 +0200 Subject: [PATCH 4/8] [BACKEND] Update LLVM version to https://github.com/llvm/llvm-project/commit/765206e050453018e861637a08a4520f29238074 (#4059) --- cmake/llvm-hash.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt index 4f602295e4ec..302a715455d8 100644 --- a/cmake/llvm-hash.txt +++ b/cmake/llvm-hash.txt @@ -1 +1 @@ -3a8316216807d64a586b971f51695e23883331f7 +765206e050453018e861637a08a4520f29238074 From 5169013eb2d1229fd576014b411e088b6fde835d Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Tue, 11 Jun 2024 15:39:02 -0700 Subject: [PATCH 5/8] Add -s option to pytest run. Signed-off-by: Ilya Enkovich --- .github/workflows/build-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index f9f87e6ba1f7..8c9bcca7cf28 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -70,4 +70,4 @@ jobs: - name: Run python unit tests run: | - python -m pytest -n 32 --device cpu python/test/unit/language/test_core.py -m cpu + python -m pytest -s -n 32 --device cpu python/test/unit/language/test_core.py -m cpu From e9c0801d322097931d6325165e241fa8d618c746 Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Wed, 12 Jun 2024 10:42:32 -0700 Subject: [PATCH 6/8] Add a workaround for LLVM bug causing test failure on Skylake CPU. Signed-off-by: Ilya Enkovich --- python/test/unit/language/test_core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index f99e0a779f06..0db353b573fc 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -1604,6 +1604,12 @@ def test_cast(dtype_x, dtype_z, bitcast, size, num_ctas, device): if is_cpu() and (dtype_x in torch_float8_dtypes or dtype_z in torch_float8_dtypes): pytest.skip(f'test_cast{(dtype_x, dtype_z)} is not supported on CPU.') + # fptrunc fp32->fp16 is broken in LLVM for large vectors: + # https://github.com/llvm/llvm-project/issues/95274 + # TODO: remove the change after the bug is fixed. + if is_cpu() and dtype_x == "float32" and dtype_z == "float16": + size = 512 + # bf16 vector cast is broken in LLVM for large vectors: # https://github.com/llvm/llvm-project/issues/92471 # TODO: Remove the change after the bug is fixed. 
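The Skylake workarounds in this patch and the next one cap the test size whenever a wide fp32<->fp16 conversion would be vectorized on CPU, because current LLVM miscompiles large-vector fptrunc/fpext (llvm/llvm-project issues 95274 and 95278, cited in the hunks above). As a rough illustration of the code path being avoided, the sketch below performs the fp32 -> fp16 cast that lowers to an LLVM fptrunc; the kernel itself and the block size of 1024 are illustrative assumptions and are not part of these patches.

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def cast_f32_to_f16(x_ptr, z_ptr, BLOCK: tl.constexpr):
        # Load a contiguous block of fp32 values and store them as fp16.
        # On the CPU backend this cast lowers to an LLVM fptrunc over a
        # wide vector, which is the operation the workaround sidesteps.
        offs = tl.arange(0, BLOCK)
        x = tl.load(x_ptr + offs)
        tl.store(z_ptr + offs, x.to(tl.float16))

    # Hypothetical reproducer size; the tests above cap it at 512 instead.
    x = torch.randn(1024, dtype=torch.float32)
    z = torch.empty(1024, dtype=torch.float16)
    cast_f32_to_f16[(1,)](x, z, BLOCK=1024)
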
From 02db84c3af9edb24ab96a2347a86ba85988f85c6 Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Wed, 12 Jun 2024 10:58:14 -0700 Subject: [PATCH 7/8] Add a workaround for LLVM fpext bug causing test failure on Skylake CPU. Signed-off-by: Ilya Enkovich --- python/test/unit/language/test_core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 0db353b573fc..6cc4863e08b4 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -2147,6 +2147,12 @@ def kernel(X, Z, BLOCK: tl.constexpr): def test_reduce(op, dtype_str, shape, axis, keep_dims, num_ctas, device): check_type_supported(dtype_str, device) # bfloat16 on cc < 80 will not be tested + # fpext fp16->fp32 is broken in LLVM for large vectors: + # https://github.com/llvm/llvm-project/issues/95278 + # TODO: remove the change after the bug is fixed. + if is_cpu() and dtype_str == "float16": + shape = (min(shape[0], 512), min(shape[1], 512)) + @triton.jit def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, IS_3D: tl.constexpr, AXIS: tl.constexpr, KEEP_DIMS: tl.constexpr): From b0689c501e6e6a4f4dd7d93ebea7f0ca5f371a36 Mon Sep 17 00:00:00 2001 From: Ilya Enkovich Date: Mon, 17 Jun 2024 15:57:08 -0700 Subject: [PATCH 8/8] Fix formatting. Signed-off-by: Ilya Enkovich --- .../tutorials/03-matrix-multiplication-cpu.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/python/tutorials/03-matrix-multiplication-cpu.py b/python/tutorials/03-matrix-multiplication-cpu.py index c20a36aab10e..e96d04614661 100644 --- a/python/tutorials/03-matrix-multiplication-cpu.py +++ b/python/tutorials/03-matrix-multiplication-cpu.py @@ -154,13 +154,13 @@ import triton import triton.language as tl - BLOCK_SIZE_M = 32 BLOCK_SIZE_N = 32 BLOCK_SIZE_K = 32 GROUP_SIZE_M = 8 USE_GPU = True + @triton.jit def matmul_kernel( # Pointers to matrices @@ -227,7 +227,7 @@ def matmul_kernel( # Advance the ptrs to the next K block. a_ptrs += BLOCK_SIZE_K * stride_ak b_ptrs += BLOCK_SIZE_K * stride_bk - + # Convert the accumulator to the output matrix C's type if needed. c = accumulator @@ -236,14 +236,13 @@ def matmul_kernel( offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - + #TODO: Currently masked load is not supported yet. #c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) #tl.store(c_ptrs, c, mask=c_mask) tl.store(c_ptrs, c) - # %% # We can now create a convenience wrapper function that only takes two input tensors, # and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel. @@ -256,9 +255,10 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor): M, K = a.shape K, N = b.shape #TODO: Currently masked load is not supported yet. - assert (M % BLOCK_SIZE_M == 0) and (N % BLOCK_SIZE_N == 0) and (K % BLOCK_SIZE_K == 0), "Masking currently not supported, Matrix dimensions must be multiples of block size" + assert (M % BLOCK_SIZE_M == 0) and (N % BLOCK_SIZE_N == 0) and ( + K % BLOCK_SIZE_K == 0), "Masking currently not supported, Matrix dimensions must be multiples of block size" if c is None: - # Allocates output. + # Allocates output. 
c = torch.empty((M, N), device=a.device, dtype=a.dtype) else: assert c.shape == (M, N), "Incompatible dimensions" @@ -270,9 +270,7 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor): a.stride(0), a.stride(1), # b.stride(0), b.stride(1), # c.stride(0), c.stride(1), # - BLOCK_SIZE_M=BLOCK_SIZE_M, - BLOCK_SIZE_N=BLOCK_SIZE_N, - BLOCK_SIZE_K=BLOCK_SIZE_K, # + BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K, # GROUP_SIZE_M=GROUP_SIZE_M, # ) return c @@ -298,7 +296,8 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor): if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=rtol): print("✅ TritonCPU and TorchCPU match") else: - print("❌ TritonCPU and TorchCPU differ, the maximum difference is "f'{torch.max(torch.abs(triton_output - torch_output))}') + print("❌ TritonCPU and TorchCPU differ, the maximum difference is " + f'{torch.max(torch.abs(triton_output - torch_output))}') # %% # Benchmark @@ -326,13 +325,13 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor): if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=rtol): print("✅ TritonGPU and TorchGPU match") else: - print("❌ TritonGPU and TorchGPU differ, the maximum difference is "f'{torch.max(torch.abs(triton_output - torch_output))}') + print("❌ TritonGPU and TorchGPU differ, the maximum difference is " + f'{torch.max(torch.abs(triton_output - torch_output))}') LINE_VALS += ['triton-gpu', 'torch-gpu'] LINE_NAMES += ['TritonGPU', 'TorchGPU'] LINE_STYLES += [('yellow', '-'), ('red', '-')] - # %% # Seems like we're good to go! @@ -359,7 +358,6 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor): f'matmul-performance-fp32 (BLOCK_SIZE_M={BLOCK_SIZE_M}, BLOCK_SIZE_N={BLOCK_SIZE_N}, BLOCK_SIZE_K={BLOCK_SIZE_K}, GROUP_SIZE_M={GROUP_SIZE_M})', args={}, # Values for function arguments not in `x_names` and `y_name`. )) - def benchmark(M, N, K, provider): import os @@ -383,7 +381,8 @@ def benchmark(M, N, K, provider): ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b, None), quantiles=quantiles) elif provider == 'torch-cpu': c = torch.empty((M, N), device=a.device, dtype=a.dtype) - ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b, out=c), quantiles=quantiles, is_cpu=True) + ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b, out=c), quantiles=quantiles, + is_cpu=True) elif provider == 'triton-cpu-single': c = torch.empty((M, N), device=a.device, dtype=a.dtype) ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b, c), quantiles=quantiles, is_cpu=True)
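
The final patch only reflows the CPU matmul tutorial to satisfy the project formatter; none of the benchmark logic changes. Since the benchmark function above is decorated with triton.testing.perf_report, it is normally driven through the decorator's run method. The call below is a usage sketch under that assumption and is not something introduced by the series.

    # Usage sketch: run the perf_report-decorated benchmark defined above.
    # print_data prints the collected measurements; show_plots additionally
    # renders the line plot (requires matplotlib).
    benchmark.run(print_data=True, show_plots=False)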