xarray-contrib · brendancol · Jun 5, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
@@ -22,6 +22,7 @@ geotiff,2026-05-20,SAFE,IO-bound,0,2212,"Pass 13 (2026-05-20): 1 MEDIUM found an
 glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk."
 hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings."
 hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE."
+interpolate_spline,2026-06-04,SAFE,compute-bound,0,,"scope=spline-only. Audited _spline.py + _validation.py only (not _idw/_kriging). 1 MEDIUM (Cat3 GPU transfer): _spline_dask_cupy/_spline_cupy re-uploaded invariant x_pts/y_pts/weights host->device once per chunk. Fixed in PR #2929: added _tps_evaluate_gpu taking on-device point/weight arrays + only per-chunk grid slices; dask+cupy uploads invariants once at graph build (verified 48->3 on 16 chunks, scales with chunk count). numpy/cupy/dask+cupy parity ~1e-14. Added cupy+dask+cupy parity tests and an upload-count regression test (red without fix: 48!=3). _tps_cuda_kernel 30 regs/thread, 6 scalar locals -- no register pressure. CPU/dask+numpy eval @ngjit, row-major, no materialization. Dask graph probe 2560x2560/256 chunks = 200 tasks (2/chunk), no fan-in. Memory guard _check_spline_memory bounds N^2 solve. No issue filed -- gh issue create denied by auto-mode classifier; finding surfaced directly by sweep. GitHub issue field left empty."
 interpolate-kriging,2026-06-04,SAFE,graph-bound,0,2923,"MEDIUM: memory guard used full-grid k0 term on dask templates -> spurious MemoryError (issue #2923, fixed). LOW: _experimental_variogram nlags python loop vectorizable via bincount (~1.2x, pair-array materialization dominates) - doc only. Dask graph clean (2 tasks/chunk); cupy returns device arrays; no .values/.compute/.data.get materialization."
 kde,2026-04-14T12:00:00Z,SAFE,compute-bound,0,,Graph construction serialized per-tile. _filter_points_to_tile scans all points per tile. No HIGH findings.
 mahalanobis,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,False positive. Numpy path materializes by design. Dask path uses lazy reductions + map_blocks.

diff --git a/xrspatial/interpolate/_spline.py b/xrspatial/interpolate/_spline.py
@@ -162,12 +162,14 @@ def _tps_cuda_kernel(x_pts, y_pts, weights, n_pts, x_grid, y_grid, out):
 # CuPy backend (CPU solve + GPU evaluate)
 # ---------------------------------------------------------------------------
 
-def _spline_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
-                 smoothing, weights, template_data):
-    n = len(x_pts)
-    x_gpu = cupy.asarray(x_pts)
-    y_gpu = cupy.asarray(y_pts)
-    w_gpu = cupy.asarray(weights)
+def _tps_evaluate_gpu(x_gpu, y_gpu, w_gpu, n, x_grid, y_grid):
+    """Evaluate the TPS surface on the GPU.
+
+    ``x_gpu``, ``y_gpu`` and ``w_gpu`` are the point coordinates and the
+    solved weight vector, already resident on the device.  ``x_grid`` and
+    ``y_grid`` are host coordinate slices for the current output tile and
+    are the only arrays uploaded here.
+    """
     xg_gpu = cupy.asarray(x_grid)
     yg_gpu = cupy.asarray(y_grid)
 
@@ -181,6 +183,15 @@ def _spline_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
     return out
 
 
+def _spline_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
+                 smoothing, weights, template_data):
+    n = len(x_pts)
+    x_gpu = cupy.asarray(x_pts)
+    y_gpu = cupy.asarray(y_pts)
+    w_gpu = cupy.asarray(weights)
+    return _tps_evaluate_gpu(x_gpu, y_gpu, w_gpu, n, x_grid, y_grid)
+
+
 # ---------------------------------------------------------------------------
 # Dask + numpy backend
 # ---------------------------------------------------------------------------
@@ -207,14 +218,24 @@ def _chunk(block, block_info=None):
 def _spline_dask_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
                       smoothing, weights, template_data):
 
+    # The point coordinates and weight vector are the same for every
+    # chunk, so upload them to the device once instead of re-uploading
+    # inside each per-chunk call.  Under the threaded/synchronous
+    # scheduler the per-chunk closure shares these device buffers by
+    # reference; a distributed scheduler would re-serialise them per
+    # task, which is no worse than the previous per-chunk upload.
+    n = len(x_pts)
+    x_gpu = cupy.asarray(x_pts)
+    y_gpu = cupy.asarray(y_pts)
+    w_gpu = cupy.asarray(weights)
+
     def _chunk(block, block_info=None):
         if block_info is None:
             return block
         loc = block_info[0]['array-location']
         y_sl = y_grid[loc[0][0]:loc[0][1]]
         x_sl = x_grid[loc[1][0]:loc[1][1]]
-        return _spline_cupy(x_pts, y_pts, z_pts, x_sl, y_sl,
-                            smoothing, weights, None)
+        return _tps_evaluate_gpu(x_gpu, y_gpu, w_gpu, n, x_sl, y_sl)
 
     return da.map_blocks(
         _chunk, template_data, dtype=np.float64,

diff --git a/xrspatial/tests/test_interpolation.py b/xrspatial/tests/test_interpolation.py
@@ -329,6 +329,50 @@ def test_dask_cupy_matches_numpy(self):
         np.testing.assert_allclose(
             np_result.values, _to_numpy(dc_result), rtol=1e-10)
 
+    @cuda_and_cupy_available
+    @dask_array_available
+    def test_dask_cupy_uploads_points_once(self, monkeypatch):
+        """The dask+cupy path uploads the point/weight arrays once.
+
+        These arrays are the same for every chunk, so the number of
+        host-to-device transfers of them must not scale with chunk
+        count.  Regression guard against re-uploading inside the
+        per-chunk closure.
+        """
+        import cupy
+
+        from xrspatial.interpolate import _spline as spline_mod
+
+        n_points = 8
+        rng = np.random.RandomState(0)
+        x = rng.uniform(0, 2, n_points)
+        y = rng.uniform(0, 2, n_points)
+        z = rng.uniform(0, 2, n_points)
+
+        coords = np.linspace(0.0, 2.0, 8)
+
+        orig_asarray = cupy.asarray
+        invariant_uploads = {'n': 0}
+
+        def counting_asarray(a, *args, **kwargs):
+            # Point coordinate vectors (len n_points) and the weight
+            # vector (len n_points + 3) are the chunk-invariant uploads.
+            if isinstance(a, np.ndarray) and a.size in (
+                    n_points, n_points + 3):
+                invariant_uploads['n'] += 1
+            return orig_asarray(a, *args, **kwargs)
+
+        monkeypatch.setattr(spline_mod.cupy, 'asarray', counting_asarray)
+
+        template = _make_template(coords, coords,
+                                  backend='dask_cupy', chunks=(2, 2))
+        result = spline(x, y, z, template)
+        result.data.compute()
+
+        # x_pts, y_pts and weights -> exactly three uploads, regardless
+        # of how many chunks the grid was split into.
+        assert invariant_uploads['n'] == 3
+
 
 # ===================================================================
 # Kriging tests