arrayfire · syurkevi · Sep 17, 2025 · Sep 17, 2025
diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md
@@ -3,10 +3,18 @@ Benchmarks
 
 ## Setting up environment
 
+Create a Python environment and install pytest and the compute libraries:
 ```sh
     python -m pip install -r requirements.txt
 ```
 
+If running `dpnp` with NVIDIA or AMD devices, you must install the oneAPI toolkit along with the corresponding oneAPI plugin:
+
+```sh
+    # install oneapi toolkit and plugins
+    source /opt/intel/oneapi/setvars.sh
+```
+
 ## Benchmark parameters
 
 The benchmark packages, rounds, array sizes, and numeric type may be specified on the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py).
@@ -20,16 +28,18 @@ These are the steps to run the benchmarks, and produce the graphs
 
 Run the benchmarks and store the results in `results.json`
 ```sh
-    pytest .\pytest_benchmark --benchmark-json=results.json
+    pytest ./pytest_benchmark --benchmark-json=results.json
 ```
 
 To create graphs and store the timing results after creating the `results.json`, run:
 ```sh
+    mkdir img
     python graphs.py
 ```
 
 To modify the tests being shown, modify the `TESTS` list at the top of the `graphs.py` file.
-To modify the labels shown, modify `PKG_LABELS`
+To modify the legend of the package labels shown, modify `PKG_LABELS`.
+To modify the name of the tests shown, modify `TESTS_GRAPH_NAME`.
 To modify the hardware display, modify `HARDWARE` 
 
 ## Notes

diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py
@@ -7,7 +7,7 @@
 BENCHMARKS_JSON = "results.json"
 
 # Hardware details shown in title
-HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)"
+HARDWARE = "Intel Xeon Gold 5315Y (8 Processors) @ 3.201GHz 63032 MB\noneAPI 2025.2.1 NVIDIA RTX A4000, 16222 MB, CUDA 12.8 Compute 8.6"
 
 # Show speedup in graph
 SHOW_NUMBERS = True
@@ -16,12 +16,13 @@
 ROUND_NUMBERS = 1
 
 # package list in graph order; arrayfire packages are added later
-PKG_NAMES = ["numpy", "dpnp", "cupy"]
+PKG_NAMES = ["numpy", "dpnp", "cupy", "cupynumeric"]
 
 # color used in graphs
 PKG_COLOR = {
     "numpy": "tab:blue",
     "cupy": "tab:green",
+    "cupynumeric": "green",
     "dpnp": "tab:red",
     "afcpu": "tab:orange",
     "afopencl": "tab:orange",
@@ -32,8 +33,9 @@
 # labels displayed in the graph
 PKG_LABELS = {
     "numpy": "numpy[cpu]",
-    "dpnp": "dpnp[level_zero:gpu]",
+    "dpnp": "dpnp[cuda:gpu]",
     "cupy": "cupy",
+    "cupynumeric": "cupynumeric",
     "afcpu": "afcpu",
     "afcuda": "afcuda",
     "afopencl": "afopencl[opencl:gpu]",
@@ -44,16 +46,16 @@
 
 # Tests to be shown in graphs
 TESTS = [
-    "qr",
+    "group_elementwise",
     "neural_network",
-    "gemm",
+    "black_scholes",
     "mandelbrot",
     "nbody",
     "pi",
-    "black_scholes",
-    "fft",
     "normal",
-    "group_elementwise",
+    "gemm",
+    "fft",
+    "qr",
     # Other tests
     # 'svd
     # 'cholesky',
@@ -63,6 +65,25 @@
     # 'inv'
 ]
 
+# Reverse list so it appears in order on graph
+TESTS.reverse()
+
+TESTS_GRAPH_NAME = {
+    "group_elementwise": "Group_elementwise (JIT)",
+    "neural_network": "Neural Network (JIT)",
+    "black_scholes": "Black Scholes (JIT)",
+    "mandelbrot": "Mandelbrot (JIT)",
+    "nbody": "Nbody (JIT)",
+    "pi": "Montecarlo Pi (JIT)",
+    "normal": "Normal Distribution",
+    "gemm": "General Matrix Multiplication",
+    "fft": "2D FFT",
+    "qr": "QR Decomposition",
+}
+
+for name in TESTS:
+    if name not in TESTS_GRAPH_NAME:
+        TESTS_GRAPH_NAME[name] = name
 
 def get_benchmark_data():
     results = {}
@@ -189,7 +210,7 @@ def generate_group_graph(test_list=None, show_numbers=False, filename="compariso
 
     xlabels = []
     for test in tests:
-        xlabels.append(test + "\n" + descriptions[test])
+        xlabels.append(TESTS_GRAPH_NAME[test] + "\n" + descriptions[test])
 
     ax.set_xlabel("Speedup")
     ax.set_xscale("log")

diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py
@@ -29,6 +29,7 @@
 import math
 
 import cupy
+import cupynumeric
 import dpctl
 import dpnp
 import numpy as np
@@ -38,19 +39,19 @@
 
 # modify parameters for most benchmarks
 ROUNDS = 30
-NSIZE = 2**13
+NSIZE = 2**11
 NNSIZE = NSIZE**2
 DTYPE = "float32"
 
 # comment a line to remove that package from testing
 PKGDICT = {
-    "dpnp": dpnp,
     "numpy": np,
     "cupy": cupy,
     # "afcpu": af,
     "afopencl": af,
-    "afcuda": af,
     "afoneapi": af,
+    "dpnp": dpnp,
+    "cupynumeric": cupynumeric,
 }
 
 PKGS = []
@@ -66,11 +67,13 @@ def initialize_package(PKG_ID):
     pkg = PKGDICT[PKG_ID]
 
     try:
+        # Free all unused memory
+        gc.collect()
         af.device_gc()
         mempool = cupy.get_default_memory_pool()
         mempool.free_all_blocks()
-    except:
-        pass
+    except Exception as e:
+        print(e)
 
     if PKG_ID == "afcpu":
         af.set_backend(af.BackendType.cpu)
@@ -98,8 +101,7 @@ def initialize_package(PKG_ID):
         print(cupy.cuda.Device())
         mempool = cupy.get_default_memory_pool()
         mempool.free_all_blocks()
+    elif PKG_ID == "cupynumeric":
+        pass
     else:
-        raise NotImplementedError()
-
-    # Free all unused memory
-    gc.collect()
+        raise NotImplementedError()
diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py
@@ -92,6 +92,29 @@ def cnd(x):
 
     return (C, P)
 
+def black_scholes_cupynumeric(S, X, R, V, T):
+    # Evaluates the Black-Scholes expressions below with cupynumeric and returns (C, P).
+    # S = Underlying stock price, X = Strike Price
+    # R = Risk free rate of interest
+    # V = Volatility, T = Time to maturity
+    # `sqrt2` (used inside cnd) is not defined here; it must come from the enclosing module.
+    def cnd(x):
+        temp = x > 0
+        erf = lambda arr: cupynumeric.exp(-arr * arr)  # NOTE(review): computes exp(-x*x), not the error function despite the name - confirm intended
+        return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2)
+
+    d1 = cupynumeric.log(S / X)
+    d1 = d1 + (R + (V * V) * 0.5) * T
+    d1 = d1 / (V * cupynumeric.sqrt(T))
+
+    d2 = d1 - (V * cupynumeric.sqrt(T))
+    cnd_d1 = cnd(d1)
+    cnd_d2 = cnd(d2)
+
+    C = S * cnd_d1 - (X * cupynumeric.exp((-R) * T) * cnd_d2)
+    P = X * cupynumeric.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1))
+
+    return (C, P)
 
 def black_scholes_arrayfire(S, X, R, V, T):
     def cnd(x):
@@ -137,6 +160,9 @@ def generate_arrays(pkgid, count):
     elif "numpy" == pkg:
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
 
@@ -146,4 +172,5 @@ def generate_arrays(pkgid, count):
     "numpy": black_scholes_numpy,
     "cupy": black_scholes_cupy,
     "arrayfire": black_scholes_arrayfire,
+    "cupynumeric": black_scholes_cupynumeric
 }
diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py
@@ -52,7 +52,7 @@ def func_cupy(arr):
             cupy.cuda.runtime.deviceSynchronize()
             return x
 
-        GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func}
+        GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func, "cupynumeric": func}
 
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         result = benchmark.pedantic(
@@ -312,5 +312,8 @@ def generate_arrays(pkgid, count):
     elif "numpy" == pkg:
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py
@@ -50,6 +50,9 @@ def generate_arrays(pkgid, count):
     elif "numpy" == pkg:
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
 
@@ -86,5 +89,7 @@ def fft_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return res
 
+def fft_cupynumeric(arr):  # returns cupynumeric.fft.fft applied to arr
+    return cupynumeric.fft.fft(arr)  # NOTE(review): no explicit device sync here, unlike fft_cupy above - confirm timings are comparable
 
-FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af}
+FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af, "cupynumeric": fft_cupynumeric}
diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py
@@ -81,6 +81,9 @@ def generate_arrays(pkgid, count):
         np.random.rand(1)
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
 
@@ -117,5 +120,7 @@ def gemm_cupy(A, B, C):
     cupy.cuda.runtime.deviceSynchronize()
     return C
 
+def gemm_cupynumeric(A, B, C):  # alpha and beta are free names defined outside this function
+    return alpha * cupynumeric.matmul(A, B) + beta * C  # NOTE(review): no explicit sync, unlike gemm_cupy above - confirm
 
-FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp}
+FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp, "cupynumeric": gemm_cupynumeric}
diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py
@@ -12,7 +12,8 @@ class TestKmeans:
     def test_kmeans(self, benchmark, pkgid):
         initialize_package(pkgid)
         pkg = PKGDICT[pkgid]
-        kmean_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af}
+        kmean_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af,
+        "cupynumeric": kmeans_cupynumeric}
         obj = kmean_class[pkg.__name__]()
 
         benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers"
@@ -189,6 +190,94 @@ def kmeans(self):
         return centroids, cluster_assignments
 
 
+
+class kmeans_cupynumeric:  # K-Means benchmark state and steps built on cupynumeric arrays
+    def __init__(self):
+        self.data = cupynumeric.random.random((NSAMPLES, NFEATURES))
+        self.centroid_indices = cupynumeric.random.choice(self.data.shape[0], K, replace=False)
+
+    def initialize_centroids(self):
+        """
+        Return the rows of self.data selected by self.centroid_indices.
+
+        The indices are drawn once in __init__ (K values chosen without
+        replacement from the row count of self.data), so this method does no
+        sampling itself and takes no arguments besides self.
+
+        Returns:
+            Initial centroids, one row of self.data per chosen index.
+        """
+
+        return self.data[self.centroid_indices, :]
+
+    def assign_to_clusters(self, centroids):
+        """
+        Assign each row of self.data to its nearest centroid (Euclidean distance).
+
+        Args:
+            centroids: Current centroids, one per row; broadcast against
+                self.data through the inserted newaxis dimensions.
+
+        Returns:
+            Array of argmin indices over the centroid axis, one per data row.
+        """
+        distances = cupynumeric.sqrt(((self.data[:, cupynumeric.newaxis, :] - centroids[cupynumeric.newaxis, :, :]) ** 2).sum(axis=2))
+        cluster_assignments = cupynumeric.argmin(distances, axis=1)
+        return cluster_assignments
+
+    def update_centroids(self, cluster_assignments):
+        """
+        Recompute each centroid as the mean of the rows assigned to it.
+
+        A cluster that receives no rows keeps its zero-initialized centroid.
+
+        Args:
+            cluster_assignments: Per-row cluster index, compared with each i in range(K).
+
+        Returns:
+            New centroids, shape (K, self.data.shape[1]).
+        """
+        new_centroids = cupynumeric.zeros((K, self.data.shape[1]))
+        for i in range(K):
+            points_in_cluster = self.data[cluster_assignments == i]
+            if len(points_in_cluster) > 0:
+                new_centroids[i] = cupynumeric.mean(points_in_cluster, axis=0)
+        return new_centroids
+
+    def kmeans(self):
+        """
+        Run K-Means on self.data for at most ITERATIONS rounds.
+
+        Takes no arguments: the data and initial indices come from __init__, and
+        K, ITERATIONS and TOLERANCE are names defined outside this class.
+        Each round assigns rows to centroids, recomputes the centroids, and stops
+        early once the norm of the centroid change drops below TOLERANCE.
+        If ITERATIONS is 0 the loop never runs and cluster_assignments stays None.
+
+        Returns:
+            tuple: A tuple containing:
+                - Final centroids (the initial ones if the loop never ran).
+                - Final cluster assignment for each data row (or None, see above).
+        """
+        centroids = self.initialize_centroids()
+        cluster_assignments = None
+
+        for i in range(ITERATIONS):
+            old_centroids = cupynumeric.copy(centroids)
+
+            # E-step: Assign points to clusters
+            cluster_assignments = self.assign_to_clusters(centroids)
+
+            # M-step: Update centroids
+            centroids = self.update_centroids(cluster_assignments)
+
+            # Check for convergence
+            if cupynumeric.linalg.norm(centroids - old_centroids) < TOLERANCE:
+                break
+
+        return centroids, cluster_assignments
+
+
 class kmeans_af:
     def __init__(self):
         self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES))