Merged
20 commits
d74c91c
Implement RowMaximum Pivoting Strategy for Distributed LU Factorization
AkhilAkkapelli Jul 8, 2025
c671828
Add linear algebra functions for matrix inversion 'inv' and triangula…
AkhilAkkapelli Aug 10, 2025
6d5a8af
DArray: Add AutoBlocks support to view
jpsamaroo Dec 9, 2025
b1e27c7
DArray: Overload LinearAlgebra._zeros
jpsamaroo Dec 10, 2025
ec05aca
DArray: Add LAPACK.chkfinite method
jpsamaroo Dec 12, 2025
cd5b999
DArray/matvecmul: Properly repartition B
jpsamaroo Dec 12, 2025
0fb9f28
DArray: Force allowscalar during printing
jpsamaroo Dec 13, 2025
144593a
DArray: Add LinAlg 1.12 dispatches
jpsamaroo Dec 14, 2025
25f7dc7
LU: Don't generate empty views
jpsamaroo Dec 16, 2025
78a1e74
tests: Add linear solve tests
jpsamaroo Feb 4, 2026
5a7e34a
DArray/LU: Fix inaccuracies by avoiding ChunkView
jpsamaroo Feb 10, 2026
eb5c2b7
DArray/solver: Use Cholesky factors instead of L/U
jpsamaroo Feb 10, 2026
18860aa
tests/LU: Don't compare exact factors with BLAS
jpsamaroo Feb 10, 2026
be97b63
DArray: Support arbitrary block sizes for Cholesky/ishermitian/issymm…
jpsamaroo Feb 10, 2026
9447445
DArray/LU: generic_lufact doesn't take allowsingular
jpsamaroo Feb 11, 2026
514bf08
CI: Bump limit to 2 hours
jpsamaroo Feb 11, 2026
c8ff89f
tests/LU: Use higher tolerance for FP32
jpsamaroo Feb 11, 2026
0003cc4
DArray/LU: Fixes for 1.9/1.10
jpsamaroo Feb 22, 2026
c98e867
docs/DArray: Add ldiv and inv to supported linalg ops
jpsamaroo Feb 23, 2026
1b8ee85
docs: Fix non-unique Options references
jpsamaroo Feb 23, 2026
12 changes: 6 additions & 6 deletions .buildkite/pipeline.yml
@@ -21,7 +21,7 @@

steps:
- label: Julia 1.9
timeout_in_minutes: 90
timeout_in_minutes: 120
<<: *test
plugins:
- JuliaCI/julia#v1:
@@ -32,7 +32,7 @@ steps:
codecov: true

- label: Julia 1.10
timeout_in_minutes: 90
timeout_in_minutes: 120
<<: *test
plugins:
- JuliaCI/julia#v1:
@@ -43,7 +43,7 @@ steps:
codecov: true

- label: Julia 1.11
timeout_in_minutes: 90
timeout_in_minutes: 120
<<: *test
plugins:
- JuliaCI/julia#v1:
@@ -54,7 +54,7 @@ steps:
codecov: true

- label: Julia 1
timeout_in_minutes: 90
timeout_in_minutes: 120
<<: *test
plugins:
- JuliaCI/julia#v1:
@@ -65,7 +65,7 @@ steps:
codecov: true

- label: Julia nightly
timeout_in_minutes: 90
timeout_in_minutes: 120
<<: *test
plugins:
- JuliaCI/julia#v1:
@@ -76,7 +76,7 @@ steps:
codecov: true

- label: Julia 1 (macOS)
timeout_in_minutes: 90
timeout_in_minutes: 120
<<: *test
agents:
queue: "juliaecosystem"
4 changes: 3 additions & 1 deletion docs/src/darray.md
@@ -704,7 +704,9 @@ From `LinearAlgebra`:
- `*` (Out-of-place Matrix-(Matrix/Vector) multiply)
- `mul!` (In-place Matrix-Matrix and Matrix-Vector multiply)
- `cholesky`/`cholesky!` (In-place/Out-of-place Cholesky factorization)
- `lu`/`lu!` (In-place/Out-of-place LU factorization (`NoPivot` only))
- `lu`/`lu!` (In-place/Out-of-place LU factorization (`NoPivot` and `RowMaximum`))
- `\`/`ldiv!` (In-place/Out-of-place Linear solving with LU and Cholesky factorizations)
- `inv` (Out-of-place matrix inversion)

From `AbstractFFTs`:
- `fft`/`fft!`
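The newly documented operations can be exercised end-to-end. A hedged sketch (not part of this diff; the block sizes and the SPD construction are illustrative assumptions):

```julia
using Dagger, LinearAlgebra

n = 256
M = rand(n, n)
M = M * M' + n * I                    # symmetric positive definite, so Cholesky applies
A = distribute(M, Blocks(64, 64))     # DMatrix with 64x64 blocks
b = distribute(rand(n), Blocks(64))   # DVector

F = cholesky(A)   # distributed Cholesky factorization
x = F \ b         # linear solve against the Cholesky factors

G = lu(A)         # LU with RowMaximum pivoting (now supported)
y = G \ b         # linear solve against the LU factors

Ainv = inv(A)     # out-of-place matrix inversion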
10 changes: 5 additions & 5 deletions docs/src/task-spawning.md
@@ -29,15 +29,15 @@ it'll be passed as-is to the function `f` (with some exceptions).

!!! note "Task / thread occupancy"
By default, `Dagger` assumes that tasks saturate the thread they are running on and does not try to schedule other tasks on the thread.
This default can be controlled by specifying [`Options`](@ref) (more details can be found under [Task and Scheduler options](@ref)).
This default can be controlled by specifying [`Options`](@ref Dagger.Options) (more details can be found under [Task and Scheduler options](@ref)).
The section [Changing the thread occupancy](@ref) shows a runnable example of how to achieve this.

## Options

The [`Options`](@ref Dagger.Options) struct in the second argument position is
optional; if provided, it is passed to the scheduler to control its
behavior. [`Options`](@ref Dagger.Options) contains option
key-value pairs, which can be any field in [`Options`](@ref)
key-value pairs, which can be any field in [`Options`](@ref Dagger.Options)
(see [Task and Scheduler options](@ref)).

## Simple example
@@ -125,7 +125,7 @@ The [`Options`](@ref Dagger.Options) struct in the second argument position is
optional; if provided, it is passed to the scheduler to control its
behavior. [`Options`](@ref Dagger.Options) contains a `NamedTuple` of option
key-value pairs, which can be any of:
- Any field in [`Options`](@ref) (see [Task and Scheduler options](@ref))
- Any field in [`Options`](@ref Dagger.Options) (see [Task and Scheduler options](@ref))
- `meta::Bool` -- Pass the input [`Chunk`](@ref) objects themselves to `f` and
not the value contained in them.

@@ -228,7 +228,7 @@ Note that, as a legacy API, usage of the lazy API is generally discouraged for m

While Dagger generally "just works", sometimes one needs to exert some more
fine-grained control over how the scheduler allocates work. There are two
parallel mechanisms to achieve this: Task options (from [`Options`](@ref)) and
parallel mechanisms to achieve this: Task options (from [`Options`](@ref Dagger.Options)) and
Scheduler options (from [`Sch.SchedulerOptions`](@ref)). Scheduler
options operate globally across an entire DAG, and Task options operate on a
task-by-task basis.
@@ -258,7 +258,7 @@ delayed(+; single=1)(1, 2)

## Changing the thread occupancy

One of the supported [`Options`](@ref) is the `occupancy` keyword.
One of the supported [`Options`](@ref Dagger.Options) is the `occupancy` keyword.
This keyword can be used to communicate that a task is not expected to fully
saturate a CPU core (e.g. due to being IO-bound).
The basic usage looks like this:
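The `occupancy` keyword described above can be sketched as follows (the value is illustrative; `Dagger.ThreadProc` is the processor type named in the Dagger docs):

```julia
using Dagger

io_bound_work() = (sleep(1); 42)   # stands in for an IO-bound operation

# Declare 25% occupancy so the scheduler may co-locate other tasks
# on the same thread while this one waits:
t = Dagger.@spawn occupancy=Dict(Dagger.ThreadProc => 0.25) io_bound_work()
fetch(t)
```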
5 changes: 3 additions & 2 deletions src/Dagger.jl
@@ -7,10 +7,10 @@ import SparseArrays: sprand, SparseMatrixCSC
import MemPool
import MemPool: DRef, FileRef, poolget, poolset

import Base: collect, reduce
import Base: collect, reduce, view

import LinearAlgebra
import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric
import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LU, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, Cholesky, diagind, ishermitian, issymmetric, I
import Random
import Random: AbstractRNG

@@ -125,6 +125,7 @@ include("array/sort.jl")
include("array/linalg.jl")
include("array/mul.jl")
include("array/cholesky.jl")
include("array/trsm.jl")
include("array/lu.jl")

# GPU
14 changes: 14 additions & 0 deletions src/array/alloc.jl
@@ -184,6 +184,18 @@ function Base.zero(x::DArray{T,N}) where {T,N}
return _to_darray(a)
end

# Weird LinearAlgebra dispatch in `\` needs this
function LinearAlgebra._zeros(::Type{T}, B::DVector, n::Integer) where T
m = max(size(B, 1), n)
sz = (m,)
return zeros(auto_blocks(sz), T, sz)
end
function LinearAlgebra._zeros(::Type{T}, B::DMatrix, n::Integer) where T
m = max(size(B, 1), n)
sz = (m, size(B, 2))
return zeros(auto_blocks(sz), T, sz)
end

function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N}
d = ArrayDomain(Base.index_shape(A))
dc = partition(p, d)
@@ -192,3 +192,5 @@ function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N}
chunks = [tochunk(view(A, x.indexes...)) for x in dc]
return DArray(T, d, dc, chunks, p)
end
Base.view(A::AbstractArray, ::AutoBlocks) =
view(A, auto_blocks(size(A)))
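The two additions above can be exercised like this (a sketch; `view(A, p)` wraps an in-memory array as a `DArray` without copying, and the new `AutoBlocks` method defers block-size selection to `auto_blocks`):

```julia
using Dagger

A = rand(128, 128)
DA = view(A, Blocks(32, 32))   # explicit 32x32 partitioning
DB = view(A, AutoBlocks())     # partitioning chosen automatically

# The LinearAlgebra._zeros overloads are internal: they ensure that `\`
# allocates its result as a block-partitioned DArray rather than a dense
# Array, and are not meant to be called directly.
```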
68 changes: 38 additions & 30 deletions src/array/cholesky.jl
@@ -15,27 +15,31 @@ function LinearAlgebra._chol!(A::DArray{T,2}, ::Type{UpperTriangular}) where T
rzone = one(real(T))
rmzone = -one(real(T))
uplo = 'U'
Ac = A.chunks
mt, nt = size(Ac)
iscomplex = T <: Complex
trans = iscomplex ? 'C' : 'T'

mb, nb = A.partitioning.blocksize
min_bs = min(mb, nb)
info = [convert(LinearAlgebra.BlasInt, 0)]
try
Dagger.spawn_datadeps() do
for k in range(1, mt)
Dagger.@spawn potrf_checked!(uplo, InOut(Ac[k, k]), Out(info))
for n in range(k+1, nt)
Dagger.@spawn BLAS.trsm!('L', uplo, trans, 'N', zone, In(Ac[k, k]), InOut(Ac[k, n]))
end
for m in range(k+1, mt)
if iscomplex
Dagger.@spawn BLAS.herk!(uplo, 'C', rmzone, In(Ac[k, m]), rzone, InOut(Ac[m, m]))
else
Dagger.@spawn BLAS.syrk!(uplo, 'T', rmzone, In(Ac[k, m]), rzone, InOut(Ac[m, m]))
maybe_copy_buffered(A => Blocks(min_bs, min_bs)) do A
Ac = A.chunks
mt, nt = size(Ac)
Dagger.spawn_datadeps() do
for k in range(1, mt)
Dagger.@spawn potrf_checked!(uplo, InOut(Ac[k, k]), Out(info))
for n in range(k+1, nt)
Dagger.@spawn BLAS.trsm!('L', uplo, trans, 'N', zone, In(Ac[k, k]), InOut(Ac[k, n]))
end
for n in range(m+1, nt)
Dagger.@spawn BLAS.gemm!(trans, 'N', mzone, In(Ac[k, m]), In(Ac[k, n]), zone, InOut(Ac[m, n]))
for m in range(k+1, mt)
if iscomplex
Dagger.@spawn BLAS.herk!(uplo, 'C', rmzone, In(Ac[k, m]), rzone, InOut(Ac[m, m]))
else
Dagger.@spawn BLAS.syrk!(uplo, 'T', rmzone, In(Ac[k, m]), rzone, InOut(Ac[m, m]))
end
for n in range(m+1, nt)
Dagger.@spawn BLAS.gemm!(trans, 'N', mzone, In(Ac[k, m]), In(Ac[k, n]), zone, InOut(Ac[m, n]))
end
end
end
end
@@ -56,27 +60,31 @@ function LinearAlgebra._chol!(A::DArray{T,2}, ::Type{LowerTriangular}) where T
rzone = one(real(T))
rmzone = -one(real(T))
uplo = 'L'
Ac = A.chunks
mt, nt = size(Ac)
iscomplex = T <: Complex
trans = iscomplex ? 'C' : 'T'

mb, nb = A.partitioning.blocksize
min_bs = min(mb, nb)
info = [convert(LinearAlgebra.BlasInt, 0)]
try
Dagger.spawn_datadeps() do
for k in range(1, mt)
Dagger.@spawn potrf_checked!(uplo, InOut(Ac[k, k]), Out(info))
for m in range(k+1, mt)
Dagger.@spawn BLAS.trsm!('R', uplo, trans, 'N', zone, In(Ac[k, k]), InOut(Ac[m, k]))
end
for n in range(k+1, nt)
if iscomplex
Dagger.@spawn BLAS.herk!(uplo, 'N', rmzone, In(Ac[n, k]), rzone, InOut(Ac[n, n]))
else
Dagger.@spawn BLAS.syrk!(uplo, 'N', rmzone, In(Ac[n, k]), rzone, InOut(Ac[n, n]))
maybe_copy_buffered(A => Blocks(min_bs, min_bs)) do A
Ac = A.chunks
mt, nt = size(Ac)
Dagger.spawn_datadeps() do
for k in range(1, mt)
Dagger.@spawn potrf_checked!(uplo, InOut(Ac[k, k]), Out(info))
for m in range(k+1, mt)
Dagger.@spawn BLAS.trsm!('R', uplo, trans, 'N', zone, In(Ac[k, k]), InOut(Ac[m, k]))
end
for m in range(n+1, mt)
Dagger.@spawn BLAS.gemm!('N', trans, mzone, In(Ac[m, k]), In(Ac[n, k]), zone, InOut(Ac[m, n]))
for n in range(k+1, nt)
if iscomplex
Dagger.@spawn BLAS.herk!(uplo, 'N', rmzone, In(Ac[n, k]), rzone, InOut(Ac[n, n]))
else
Dagger.@spawn BLAS.syrk!(uplo, 'N', rmzone, In(Ac[n, k]), rzone, InOut(Ac[n, n]))
end
for m in range(n+1, mt)
Dagger.@spawn BLAS.gemm!('N', trans, mzone, In(Ac[m, k]), In(Ac[n, k]), zone, InOut(Ac[m, n]))
end
end
end
end
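Both variants now wrap the task graph in `maybe_copy_buffered`, repartitioning `A` to square blocks (the smaller of the two blocksize dimensions) so that arbitrary input blockings are supported. The spawned kernels form a standard right-looking blocked Cholesky; a serial analogue for the lower-triangular, real-valued case is sketched below (illustrative only, with a hypothetical helper name, mirroring the k/m/n loop nest above):

```julia
using LinearAlgebra

# Serial sketch of the blocked right-looking Cholesky (lower variant).
function blocked_chol_lower!(A::Matrix{Float64}, bs::Int)
    n = size(A, 1)
    nb = cld(n, bs)
    blk(i) = (1 + (i - 1) * bs):min(i * bs, n)
    for k in 1:nb
        Akk = view(A, blk(k), blk(k))
        LAPACK.potrf!('L', Akk)       # factorize diagonal block (potrf_checked! above)
        for m in k+1:nb               # panel solve: A[m,k] := A[m,k] * L[k,k]^-T
            BLAS.trsm!('R', 'L', 'T', 'N', 1.0, Akk, view(A, blk(m), blk(k)))
        end
        for j in k+1:nb               # trailing-matrix update
            BLAS.syrk!('L', 'N', -1.0, view(A, blk(j), blk(k)), 1.0, view(A, blk(j), blk(j)))
            for m in j+1:nb
                BLAS.gemm!('N', 'T', -1.0, view(A, blk(m), blk(k)),
                           view(A, blk(j), blk(k)), 1.0, view(A, blk(m), blk(j)))
            end
        end
    end
    return A
end
```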
7 changes: 5 additions & 2 deletions src/array/darray.jl
@@ -1,6 +1,6 @@
import Base: ==, fetch

export DArray, DVector, DMatrix, Blocks, AutoBlocks
export DArray, DVector, DMatrix, DVecOrMat, Blocks, AutoBlocks
export distribute


@@ -148,6 +148,7 @@ const WrappedDMatrix{T} = WrappedDArray{T,2}
const WrappedDVector{T} = WrappedDArray{T,1}
const DMatrix{T} = DArray{T,2}
const DVector{T} = DArray{T,1}
const DVecOrMat{T} = Union{DVector{T}, DMatrix{T}}

# mainly for backwards-compatibility
DArray{T, N}(domain, subdomains, chunks, partitioning, concat=cat) where {T,N} =
@@ -252,7 +253,9 @@ function Base.getindex(A::ColorArray{T,N}, idxs::NTuple{N,Int}) where {T,N}
if !haskey(A.seen_values, idxs)
chunk = A.A.chunks[sd_idx]
if chunk isa Chunk || isready(chunk)
value = A.seen_values[idxs] = Some(getindex(A.A, idxs))
value = A.seen_values[idxs] = allowscalar() do
Some(getindex(A.A, idxs))
end
else
# Show a placeholder instead
value = A.seen_values[idxs] = nothing
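Two small quality-of-life changes land in this file: the `DVecOrMat` alias and scalar-indexing-safe printing. A hedged illustration (the `nchunks` helper is hypothetical, not part of the PR):

```julia
using Dagger

# DVecOrMat lets one method signature accept either a DVector or a DMatrix:
nchunks(x::DVecOrMat) = length(x.chunks)

# Printing samples individual elements; wrapping getindex in Dagger's
# internal allowscalar helper permits this even for chunk types (e.g.
# GPU arrays) that normally forbid scalar indexing.
```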