QuantumKitHub · kshyatt · May 13, 2026
diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -162,3 +162,20 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
+
+# lu dispatches to the LAPACK wrapper, which is not what we need
+function Base.inv(t::ROCTensorMap)
+    cod = codomain(t)
+    dom = domain(t)
+    cod ≅ dom ||
+        throw(SpaceMismatch("codomain $cod and domain $dom are not isomorphic: no inverse"))
+    T = float(scalartype(t))
+    tinv = similar(t, T, dom ← cod)
+    for (c, b) in blocks(t)
+        binv = one!(block(tinv, c))
+        lpt = rocSOLVER.getrf!(copy(b))
+        lub = LinearAlgebra.LU(lpt[1], lpt[2], Int(lpt[3]))
+        ldiv!(lub, binv)
+    end
+    return tinv
+end
diff --git a/test/Project.toml b/test/Project.toml
@@ -17,6 +17,7 @@ MatrixAlgebraKit = "6c742aac-3347-4629-af66-fc926824e5e4"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 TensorKit = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec"
 TensorKitSectors = "13a9c161-d5da-41f0-bcbd-e1a08ae0647f"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
@@ -38,6 +39,7 @@ Combinatorics = "1"
 GPUArrays = "11.3.1"
 JET = "0.9, 0.10, 0.11"
 ParallelTestRunner = "2"
+StableRNGs = "1"
 Test = "1"
 TestExtras = "0.2,0.3"
 Zygote = "0.7"
diff --git a/test/amd/tensors.jl b/test/amd/tensors.jl
@@ -49,28 +49,6 @@ for V in spacelist
                 @test domain(t) == one(W)
                 @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, ROCVector{Float64, AMDGPU.Mem.HIPBuffer}}
             end
-            for T in (Int, Float32, Float64, ComplexF32, ComplexF64)
-                t = @constinferred AMDGPU.zeros(T, W)
-                AMDGPU.@allowscalar begin
-                    @test @constinferred(hash(t)) == hash(deepcopy(t))
-                end
-                @test scalartype(t) == T
-                @test norm(t) == 0
-                @test codomain(t) == W
-                @test space(t) == (W ← one(W))
-                @test domain(t) == one(W)
-                @test typeof(t) == TensorMap{T, spacetype(t), 5, 0, ROCVector{T, AMDGPU.Mem.HIPBuffer}}
-                # blocks
-                bs = @constinferred blocks(t)
-                (c, b1), state = @constinferred Nothing iterate(bs)
-                @test c == first(blocksectors(W))
-                next = @constinferred Nothing iterate(bs, state)
-                b2 = @constinferred block(t, first(blocksectors(t)))
-                @test b1 == b2
-                @test eltype(bs) === Pair{typeof(c), typeof(b1)}
-                @test typeof(b1) === TensorKit.blocktype(t)
-                @test typeof(c) === sectortype(t)
-            end
         end
         @timedtestset "Conversion to/from host" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
@@ -92,62 +70,18 @@ for V in spacelist
                 @test domain(t2) == one(W)
             end
         end
-        @timedtestset "Tensor Dict conversion" begin
-            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+        @timedtestset "Adapt" begin
+            W = V1 ⊗ V2 ⊗ V3 ← (V4 ⊗ V5)'
             for T in (Int, Float32, ComplexF64)
-                t = @constinferred AMDGPU.rand(T, W)
-                d = convert(Dict, t)
-                @test TensorKit.to_cpu(t) == convert(TensorMap, d)
-            end
-        end
-        symmetricbraiding && @timedtestset "Basic linear algebra" begin
-            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
-            for T in (Float32, ComplexF64)
-                t = @constinferred AMDGPU.rand(T, W)
-                @test scalartype(t) == T
-                @test space(t) == W
-                @test space(t') == W'
-                @test dim(t) == dim(space(t))
-                @test codomain(t) == codomain(W)
-                @test domain(t) == domain(W)
-                # blocks for adjoint
-                bs = @constinferred blocks(t')
-                (c, b1), state = @constinferred Nothing iterate(bs)
-                @test c == first(blocksectors(W'))
-                next = @constinferred Nothing iterate(bs, state)
-                b2 = @constinferred block(t', first(blocksectors(t')))
-                @test b1 == b2
-                @test eltype(bs) === Pair{typeof(c), typeof(b1)}
-                @test typeof(b1) === TensorKit.blocktype(t')
-                @test typeof(c) === sectortype(t)
-                # linear algebra
-                @test isa(@constinferred(norm(t)), real(T))
-                @test norm(t)^2 ≈ dot(t, t)
-                α = rand(T)
-                @test norm(α * t) ≈ abs(α) * norm(t)
-                @test norm(t + t, 2) ≈ 2 * norm(t, 2)
-                @test norm(t + t, 1) ≈ 2 * norm(t, 1)
-                @test norm(t + t, Inf) ≈ 2 * norm(t, Inf)
-                p = 3 * rand(Float64)
-                @test norm(t + t, p) ≈ 2 * norm(t, p)
-                @test norm(t) ≈ norm(t')
-
-                t2 = @constinferred rand!(similar(t))
-                β = rand(T)
-                #@test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2)) # broken for Irrep[CU₁]
-                @test dot(β * t2, α * t) ≈ conj(β) * α * conj(dot(t, t2))
-                @test dot(t2, t) ≈ conj(dot(t, t2))
-                @test dot(t2, t) ≈ conj(dot(t2', t'))
-                @test dot(t2, t) ≈ dot(t', t2')
+                t = rand(T, W)
+                t_gpu = @constinferred adapt(ROCArray, t)
+                @test storagetype(t_gpu) <: ROCArray{T}
+                @test scalartype(t_gpu) === scalartype(t)
+                @test collect(t_gpu.data) == t.data
 
-                i1 = @constinferred(isomorphism(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1 ⊗ V2, V2 ⊗ V1))
-                i2 = @constinferred(isomorphism(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V2 ⊗ V1, V1 ⊗ V2))
-                @test i1 * i2 == @constinferred(id(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1 ⊗ V2))
-                @test i2 * i1 == @constinferred(id(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V2 ⊗ V1))
-                w = @constinferred(isometry(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1))
-                @test dim(w) == 2 * dim(V1 ← V1)
-                @test w' * w == id(ROCVector{T, AMDGPU.Mem.HIPBuffer}, V1)
-                @test w * w' == (w * w')^2
+                t_cpu = @constinferred adapt(Array, t_gpu)
+                @test t_cpu == t
+                @test storagetype(t_cpu) <: Array{T}
             end
         end
         @timedtestset "Trivial space insertion and removal" begin
@@ -182,45 +116,6 @@ for V in spacelist
                 @test @constinferred(removeunit(t5, 4)) == t
             end
         end
-        if hasfusiontensor(I)
-            @timedtestset "Basic linear algebra: test via CPU" begin
-                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
-                for T in (Float32, ComplexF64)
-                    t = AMDGPU.rand(T, W)
-                    t2 = @constinferred AMDGPU.rand!(similar(t))
-                    α = rand(T)
-                    @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
-                    @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
-                    @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t)
-                    @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t)
-                end
-            end
-            @timedtestset "Real and imaginary parts" begin
-                W = V1 ⊗ V2
-                for T in (Float64, ComplexF64, ComplexF32)
-                    t = @constinferred AMDGPU.randn(T, W, W)
-
-                    tr = @constinferred real(t)
-                    @test scalartype(tr) <: Real
-                    @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr)
-                    @test storagetype(tr) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer}
-
-                    ti = @constinferred imag(t)
-                    @test scalartype(ti) <: Real
-                    @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti)
-                    @test storagetype(ti) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer}
-
-                    tc = @inferred complex(t)
-                    @test scalartype(tc) <: Complex
-                    @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc)
-                    @test storagetype(tc) == ROCVector{complex(T), AMDGPU.Mem.HIPBuffer}
-
-                    tc2 = @inferred complex(tr, ti)
-                    @test tc2 ≈ tc
-                    @test storagetype(tc2) == ROCVector{complex(T), AMDGPU.Mem.HIPBuffer}
-                end
-            end
-        end
         @timedtestset "Tensor conversion" begin # TODO adjoint conversion methods don't work yet
             W = V1 ⊗ V2
             t = @constinferred AMDGPU.randn(W ← W)
@@ -232,15 +127,6 @@ for V in spacelist
             @test Base.promote_typeof(t, tc) == typeof(tc)
             @test Base.promote_typeof(tc, t) == typeof(tc + t)
         end
-        #=@timedtestset "diag/diagm" begin
-            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
-            t = AMDGPU.randn(ComplexF64, W)
-            d = LinearAlgebra.diag(t)
-            # TODO find a way to use AMDGPU here
-            D = LinearAlgebra.diagm(codomain(t), domain(t), d)
-            @test LinearAlgebra.isdiag(D)
-            @test LinearAlgebra.diag(D) == d
-        end=#
         symmetricbraiding && @timedtestset "Permutations: test via inner product invariance" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
             t = AMDGPU.rand(ComplexF64, W)
@@ -383,134 +269,6 @@ for V in spacelist
                 @test flip(ta, (1, 2)) ≈ tb
             end
         end=# # TODO =# # None of this works due to lack of HIPTensor support
-        @timedtestset "Multiplication of isometries: test properties" begin
-            W2 = V4 ⊗ V5
-            W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1))
-            for T in (Float64, ComplexF64)
-                t1 = randisometry(ROCMatrix{T, AMDGPU.Mem.HIPBuffer}, W1, W2)
-                t2 = randisometry(ROCMatrix{T, AMDGPU.Mem.HIPBuffer}, W2 ← W2)
-                @test isisometric(t1)
-                @test isunitary(t2)
-                P = t1 * t1'
-                @test P * P ≈ P
-            end
-        end
-        @timedtestset "Multiplication and inverse: test compatibility" begin
-            W1 = V1 ⊗ V2 ⊗ V3
-            W2 = V4 ⊗ V5
-            for T in (Float64, ComplexF64)
-                t1 = AMDGPU.rand(T, W1, W1)
-                t2 = AMDGPU.rand(T, W2, W2)
-                t = AMDGPU.rand(T, W1, W2)
-                #=@test t1 * (t1 \ t) ≈ t
-                @test (t / t2) * t2 ≈ t
-                AMDGPU.@allowscalar begin
-                    @test t1 \ one(t1) ≈ inv(t1)
-                    @test one(t1) / t1 ≈ pinv(t1)
-                    tp = pinv(t) * t
-                    @test tp ≈ tp * tp
-                end=#
-                @test_throws SpaceMismatch inv(t)
-                @test_throws SpaceMismatch t2 \ t
-                @test_throws SpaceMismatch t / t1
-            end
-        end
-        @timedtestset "Multiplication and inverse: test via CPU" begin
-            W1 = V1 ⊗ V2 ⊗ V3
-            W2 = V4 ⊗ V5
-            for T in (Float32, Float64, ComplexF32, ComplexF64)
-                t1 = AMDGPU.rand(T, W1, W1)
-                t2 = AMDGPU.rand(T, W2, W2)
-                t = AMDGPU.rand(T, W1, W2)
-                ht1 = TensorKit.to_cpu(t1)
-                ht2 = TensorKit.to_cpu(t2)
-                ht = TensorKit.to_cpu(t)
-                @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht
-                @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht
-                @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht'
-                @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht'
-
-                #=AMDGPU.@allowscalar begin
-                    @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1)
-                    @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht)
-
-                    if T == Float32 || T == ComplexF32
-                        continue
-                    end
-
-                    @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht
-                    @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht
-                    @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht'
-                    @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht'
-
-                    @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht
-                    @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht
-                    @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht'
-                    @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht'
-                end=#
-            end
-        end
-        symmetricbraiding && @timedtestset "Tensor functions" begin
-            W = V1 ⊗ V2
-            for T in (Float64, ComplexF64)
-                #=t = project_hermitian!(AMDGPU.randn(T, W, W))
-                s = dim(W)
-                @test (@constinferred sqrt(t))^2 ≈ t
-                @test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t))
-                expt = @constinferred exp(t)
-                @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t))
-                @test exp(@constinferred log(project_hermitian!(expt))) ≈ expt
-                @test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt))
-
-                @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
-                      id(storagetype(t), W)
-                @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
-                @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
-                @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
-                      id(storagetype(t), W)
-                @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
-                @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)=# # TODO in AMDGPU
-
-                #=t1 = sin(t)
-                @test sin(@constinferred asin(t1)) ≈ t1
-                t2 = cos(t)
-                @test cos(@constinferred acos(t2)) ≈ t2
-                t3 = sinh(t)
-                @test sinh(@constinferred asinh(t3)) ≈ t3
-                t4 = cosh(t)
-                @test cosh(@constinferred acosh(t4)) ≈ t4
-                t5 = tan(t)
-                @test tan(@constinferred atan(t5)) ≈ t5
-                t6 = cot(t)
-                @test cot(@constinferred acot(t6)) ≈ t6
-                t7 = tanh(t)
-                @test tanh(@constinferred atanh(t7)) ≈ t7
-                t8 = coth(t)
-                @test coth(@constinferred acoth(t8)) ≈ t8=#
-                # TODO in AMDGPU
-            end
-        end
-        # Sylvester not defined for AMDGPU
-        # @timedtestset "Sylvester equation" begin
-        #     for T in (Float32, ComplexF64)
-        #         tA = AMDGPU.rand(T, V1 ⊗ V3, V1 ⊗ V3)
-        #         tB = AMDGPU.rand(T, V2 ⊗ V4, V2 ⊗ V4)
-        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
-        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
-        #         tC = AMDGPU.rand(T, V1 ⊗ V3, V2 ⊗ V4)
-        #         t = @constinferred sylvester(tA, tB, tC)
-        #         @test codomain(t) == V1 ⊗ V3
-        #         @test domain(t) == V2 ⊗ V4
-        #         @test norm(tA * t + t * tB + tC) <
-        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
-        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
-        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
-        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
-        #         end
-        #     end
-        # end
-        #
-        # TODO
         #=@timedtestset "Tensor product: test via norm preservation" begin
             for T in (Float32, ComplexF64)
                 t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)