Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .claude/sweep-performance-state.csv
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ geotiff,2026-05-20,SAFE,IO-bound,0,2212,"Pass 13 (2026-05-20): 1 MEDIUM found an
glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk."
hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings."
hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE."
interpolate_spline,2026-06-04,SAFE,compute-bound,0,,"scope=spline-only. Audited _spline.py + _validation.py only (not _idw/_kriging). 1 MEDIUM (Cat3 GPU transfer): _spline_dask_cupy/_spline_cupy re-uploaded invariant x_pts/y_pts/weights host->device once per chunk. Fixed in PR #2929: added _tps_evaluate_gpu taking on-device point/weight arrays + only per-chunk grid slices; dask+cupy uploads invariants once at graph build (verified 48->3 on 16 chunks, scales with chunk count). numpy/cupy/dask+cupy parity ~1e-14. Added cupy+dask+cupy parity tests and an upload-count regression test (red without fix: 48!=3). _tps_cuda_kernel 30 regs/thread, 6 scalar locals -- no register pressure. CPU/dask+numpy eval @ngjit, row-major, no materialization. Dask graph probe 2560x2560/256 chunks = 200 tasks (2/chunk), no fan-in. Memory guard _check_spline_memory bounds N^2 solve. No issue filed -- gh issue create denied by auto-mode classifier; finding surfaced directly by sweep. GitHub issue field left empty."
interpolate-kriging,2026-06-04,SAFE,graph-bound,0,2923,"MEDIUM: memory guard used full-grid k0 term on dask templates -> spurious MemoryError (issue #2923, fixed). LOW: _experimental_variogram nlags python loop vectorizable via bincount (~1.2x, pair-array materialization dominates) - doc only. Dask graph clean (2 tasks/chunk); cupy returns device arrays; no .values/.compute/.data.get materialization."
kde,2026-04-14T12:00:00Z,SAFE,compute-bound,0,,Graph construction serialized per-tile. _filter_points_to_tile scans all points per tile. No HIGH findings.
mahalanobis,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,False positive. Numpy path materializes by design. Dask path uses lazy reductions + map_blocks.
Expand Down
37 changes: 29 additions & 8 deletions xrspatial/interpolate/_spline.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,14 @@ def _tps_cuda_kernel(x_pts, y_pts, weights, n_pts, x_grid, y_grid, out):
# CuPy backend (CPU solve + GPU evaluate)
# ---------------------------------------------------------------------------

def _spline_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
smoothing, weights, template_data):
n = len(x_pts)
x_gpu = cupy.asarray(x_pts)
y_gpu = cupy.asarray(y_pts)
w_gpu = cupy.asarray(weights)
def _tps_evaluate_gpu(x_gpu, y_gpu, w_gpu, n, x_grid, y_grid):
"""Evaluate the TPS surface on the GPU.

``x_gpu``, ``y_gpu`` and ``w_gpu`` are the point coordinates and the
solved weight vector, already resident on the device. ``x_grid`` and
``y_grid`` are host coordinate slices for the current output tile and
are the only arrays uploaded here.
"""
xg_gpu = cupy.asarray(x_grid)
yg_gpu = cupy.asarray(y_grid)

Expand All @@ -181,6 +183,15 @@ def _spline_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
return out


def _spline_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
smoothing, weights, template_data):
n = len(x_pts)
x_gpu = cupy.asarray(x_pts)
y_gpu = cupy.asarray(y_pts)
w_gpu = cupy.asarray(weights)
return _tps_evaluate_gpu(x_gpu, y_gpu, w_gpu, n, x_grid, y_grid)


# ---------------------------------------------------------------------------
# Dask + numpy backend
# ---------------------------------------------------------------------------
Expand All @@ -207,14 +218,24 @@ def _chunk(block, block_info=None):
def _spline_dask_cupy(x_pts, y_pts, z_pts, x_grid, y_grid,
smoothing, weights, template_data):

# The point coordinates and weight vector are the same for every
# chunk, so upload them to the device once instead of re-uploading
# inside each per-chunk call. Under the threaded/synchronous
# scheduler the per-chunk closure shares these device buffers by
# reference; a distributed scheduler would re-serialise them per
# task, which is no worse than the previous per-chunk upload.
n = len(x_pts)
x_gpu = cupy.asarray(x_pts)
y_gpu = cupy.asarray(y_pts)
w_gpu = cupy.asarray(weights)

def _chunk(block, block_info=None):
if block_info is None:
return block
loc = block_info[0]['array-location']
y_sl = y_grid[loc[0][0]:loc[0][1]]
x_sl = x_grid[loc[1][0]:loc[1][1]]
return _spline_cupy(x_pts, y_pts, z_pts, x_sl, y_sl,
smoothing, weights, None)
return _tps_evaluate_gpu(x_gpu, y_gpu, w_gpu, n, x_sl, y_sl)

return da.map_blocks(
_chunk, template_data, dtype=np.float64,
Expand Down
44 changes: 44 additions & 0 deletions xrspatial/tests/test_interpolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,50 @@ def test_dask_cupy_matches_numpy(self):
np.testing.assert_allclose(
np_result.values, _to_numpy(dc_result), rtol=1e-10)

@cuda_and_cupy_available
@dask_array_available
def test_dask_cupy_uploads_points_once(self, monkeypatch):
"""The dask+cupy path uploads the point/weight arrays once.

These arrays are the same for every chunk, so the number of
host-to-device transfers of them must not scale with chunk
count. Regression guard against re-uploading inside the
per-chunk closure.
"""
import cupy

from xrspatial.interpolate import _spline as spline_mod

n_points = 8
rng = np.random.RandomState(0)
x = rng.uniform(0, 2, n_points)
y = rng.uniform(0, 2, n_points)
z = rng.uniform(0, 2, n_points)

coords = np.linspace(0.0, 2.0, 8)

orig_asarray = cupy.asarray
invariant_uploads = {'n': 0}

def counting_asarray(a, *args, **kwargs):
# Point coordinate vectors (len n_points) and the weight
# vector (len n_points + 3) are the chunk-invariant uploads.
if isinstance(a, np.ndarray) and a.size in (
n_points, n_points + 3):
invariant_uploads['n'] += 1
return orig_asarray(a, *args, **kwargs)

monkeypatch.setattr(spline_mod.cupy, 'asarray', counting_asarray)

template = _make_template(coords, coords,
backend='dask_cupy', chunks=(2, 2))
result = spline(x, y, z, template)
result.data.compute()

# x_pts, y_pts and weights -> exactly three uploads, regardless
# of how many chunks the grid was split into.
assert invariant_uploads['n'] == 3


# ===================================================================
# Kriging tests
Expand Down
Loading