diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index 555996217b..fe6c78d7e6 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -115,7 +115,7 @@ for (fname, fname_64, elty) in ((:cublasDscal_v2, :cublasDscal_v2_64, :Float64),
                                 (:cublasCscal_v2, :cublasCscal_v2_64, :ComplexF32))
     @eval begin
         function scal!(n::Integer,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        x::StridedCuVecOrDenseMat{$elty})
             if CUBLAS.version() >= v"12.0"
                 $fname_64(handle(), n, alpha, x, stride(x, 1))
@@ -127,8 +127,7 @@ for (fname, fname_64, elty) in ((:cublasDscal_v2, :cublasDscal_v2_64, :Float64),
     end
 end
 function scal!(n::Integer, alpha::Number, x::StridedCuVecOrDenseMat{T}) where {T}
-    α = convert(T, alpha)
-    gpu_α = CuRef{T}(α)
+    gpu_α = CuRef{T}(alpha)
     scal!(n, gpu_α, x)
     synchronize()
     return x
@@ -138,7 +137,7 @@ function scal!(n::Integer, alpha::CuRefArray{Float32, CuVector{Float32, DeviceMe
     return x
 end
 function scal!(n::Integer, alpha::Number, x::StridedCuVecOrDenseMat{Float16})
-    α = CuRef{Float32}(convert(Float32, alpha))
+    α = CuRef{Float32}(alpha)
     x = scal!(n, α, x)
     synchronize()
     return x
@@ -160,15 +159,13 @@ for (fname, fname_64, elty, celty) in ((:cublasCsscal_v2, :cublasCsscal_v2_64, :
         end
     end
 end
 function scal!(n::Integer, alpha::Real, x::StridedCuVecOrDenseMat{T}) where {T <: Complex}
-    α = convert(real(T), alpha)
-    gpu_α = CuRef{real(T)}(α)
+    gpu_α = CuRef{real(T)}(alpha)
     scal!(n, gpu_α, x)
-    synchronize()
     return x
 end
 function scal!(n::Integer, alpha::Real, x::StridedCuVecOrDenseMat{ComplexF16})
     wide_x = widen.(x)
-    gpu_α = CuRef{Float32}(convert(Float32, alpha))
+    gpu_α = CuRef{Float32}(alpha)
     scal!(n, gpu_α, wide_x)
     thin_x = convert(typeof(x), wide_x)
     copyto!(x, thin_x)
@@ -176,7 +173,7 @@ function scal!(n::Integer, alpha::Real, x::StridedCuVecOrDenseMat{ComplexF16})
 end
 function scal!(n::Integer, alpha::Complex, x::StridedCuVecOrDenseMat{ComplexF16})
     wide_x = widen.(x)
-    gpu_α = CuRef{ComplexF32}(convert(ComplexF32, alpha))
+    gpu_α = CuRef{ComplexF32}(alpha)
     scal!(n, gpu_α, wide_x)
     thin_x = convert(typeof(x), wide_x)
     copyto!(x, thin_x)
@@ -193,8 +190,8 @@ for (jname, fname, fname_64, elty) in ((:dot, :cublasDdot_v2, :cublasDdot_v2_64,
     @eval begin
         function $jname(n::Integer,
                         x::StridedCuVecOrDenseMat{$elty},
-                        y::StridedCuVecOrDenseMat{$elty},
-                        result::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                        y::StridedCuVecOrDenseMat{$elty},
+                        result::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        )
             if CUBLAS.version() >= v"12.0"
                 $fname_64(handle(), n, x, stride(x, 1), y, stride(y, 1), result)
@@ -213,7 +210,6 @@ function dot(
     ) where {T <: Union{Float32, Float64}}
     gpu_result = CuRef{T}(zero(T))
     gpu_result = dot(n, x, y, gpu_result)
-    synchronize()
     result = Array(gpu_result.x)
     return result[]
 end
@@ -225,7 +221,6 @@ function dotc(
     ) where {T <: Union{ComplexF32, ComplexF64}}
     gpu_result = CuRef{T}(zero(T))
     dotc(n, x, y, gpu_result)
-    synchronize()
     result = Array(gpu_result.x)
     return result[]
 end
@@ -237,7 +232,6 @@ function dotu(
     ) where {T <: Union{ComplexF32, ComplexF64}}
     gpu_result = CuRef{T}(zero(T))
     dotu(n, x, y, gpu_result)
-    synchronize()
     result = Array(gpu_result.x)
     return result[]
 end
@@ -250,7 +244,6 @@ end
 function dot(n::Integer, x::StridedCuVecOrDenseMat{Float16}, y::StridedCuVecOrDenseMat{Float16})
     gpu_result = CuRef{Float16}(zero(Float16))
     gpu_result = dot(n, x, y, gpu_result)
-    synchronize()
     result = Array{Float16}(gpu_result.x)
     return result[]
 end
@@ -284,7 +277,6 @@ for (fname, fname_64, elty, ret_type) in ((:cublasDnrm2_v2, :cublasDnrm2_v2_64, 
        )
         gpu_result = CuRef{$ret_type}(zero($ret_type))
         nrm2(n, X, gpu_result)
-        synchronize()
         result = Array(gpu_result.x)
         return result[]
     end
@@ -301,7 +293,6 @@ end
 function nrm2(n::Integer, x::StridedCuVecOrDenseMat{Float16})
     gpu_result = CuRef{Float16}(zero(Float16))
     nrm2(n, x, gpu_result)
-    synchronize()
     result = Array(gpu_result.x)
     return result[]
 end
@@ -309,7 +300,6 @@ function nrm2(n::Integer, x::StridedCuVecOrDenseMat{ComplexF16})
     wide_x = widen.(x)
     wide_result = CuRef{Float32}(zero(Float32))
     nrm2(n, wide_x, wide_result)
-    synchronize()
     return convert(Float16, only(Array{Float32}(wide_result.x)))
 end
 
@@ -349,7 +339,7 @@ for (fname, fname_64, elty) in ((:cublasDaxpy_v2, :cublasDaxpy_v2_64, :Float64),
                                 (:cublasCaxpy_v2, :cublasCaxpy_v2_64, :ComplexF32))
     @eval begin
         function axpy!(n::Integer,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        dx::StridedCuVecOrDenseMat{$elty},
                        dy::StridedCuVecOrDenseMat{$elty})
             if CUBLAS.version() >= v"12.0"
@@ -390,9 +380,8 @@ function axpy!(n::Integer, alpha::Number, dx::StridedCuVecOrDenseMat{ComplexF16}
     return dy
 end
 function axpy!(n::Integer, alpha::Number, dx::StridedCuVecOrDenseMat{T}, dy::StridedCuVecOrDenseMat{T}) where {T}
-    gpu_alpha = CuRef{T}(convert(T, alpha))
+    gpu_alpha = CuRef{T}(alpha)
     dy = axpy!(n, gpu_alpha, dx, dy)
-    synchronize()
     return dy
 end
 
@@ -427,10 +416,9 @@ for (fname, fname_64, elty, cty, sty) in (
             c::$cty,
             s::$sty
         )
-        gpu_c = CuRef{$cty}(convert($cty, c))
-        gpu_s = CuRef{$sty}(convert($sty, s))
+        gpu_c = CuRef{$cty}(c)
+        gpu_s = CuRef{$sty}(s)
         x, y = rot!(n, x, y, gpu_c, gpu_s)
-        synchronize()
         return x, y
     end
 end
@@ -484,9 +472,9 @@ for (fname, fname_64, elty) in ((:cublasIdamax_v2, :cublasIdamax_v2_64, :Float64
                                 (:cublasIcamax_v2, :cublasIcamax_v2_64, :ComplexF32))
     @eval begin
         function iamax(n::Integer,
-                       dx::StridedCuVecOrDenseMat{$elty},
-                       result::CuRefArray{Ti, CuVector{Ti, DeviceMemory}},
-                      ) where {Ti <: Integer}
+                       dx::StridedCuVecOrDenseMat{$elty},
+                       result::CuRefArray{Ti, CuVector{Ti, DeviceMemory}},
+                      ) where {Ti <: Integer}
             if CUBLAS.version() >= v"12.0"
                 $fname_64(handle(), n, dx, stride(dx, 1), result)
             else
@@ -505,9 +493,9 @@ for (fname, fname_64, elty) in ((:cublasIdamin_v2, :cublasIdamin_v2_64, :Float64
                                 (:cublasIcamin_v2, :cublasIcamin_v2_64, :ComplexF32))
     @eval begin
         function iamin(n::Integer,
-                       dx::StridedCuVecOrDenseMat{$elty},
-                       result::CuRefArray{Ti, CuVector{Ti, DeviceMemory}},
-                      ) where {Ti <: Integer}
+                       dx::StridedCuVecOrDenseMat{$elty},
+                       result::CuRefArray{Ti, CuVector{Ti, DeviceMemory}},
+                      ) where {Ti <: Integer}
             if CUBLAS.version() >= v"12.0"
                 $fname_64(handle(), n, dx, stride(dx, 1), result)
             else
@@ -520,14 +508,10 @@ end
 
 for fname in (:iamax, :iamin)
     @eval begin
-        function $fname(
-                n::Integer,
-                dx::StridedCuVecOrDenseMat
-            )
+        function $fname(n::Integer, dx::StridedCuVecOrDenseMat)
            result_type = CUBLAS.version() >= v"12.0" ? Int64 : Cint
            gpu_result = CuRef{result_type}(zero(result_type))
            gpu_result = $fname(n, dx, gpu_result)
-           synchronize()
            result = Array{result_type}(gpu_result.x)
            return only(result)
        end
@@ -545,10 +529,10 @@ for (fname, fname_64, elty) in ((:cublasDgemv_v2, :cublasDgemv_v2_64, :Float64),
                                 (:cublasCgemv_v2, :cublasCgemv_v2_64, :ComplexF32))
     @eval begin
         function gemv!(trans::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             # handle trans
             m,n = size(A)
@@ -568,20 +552,16 @@ for (fname, fname_64, elty) in ((:cublasDgemv_v2, :cublasDgemv_v2_64, :Float64),
     end
 end
 function gemv!(trans::Char, alpha::Number, A::StridedCuMatrix{T}, x::StridedCuVector{T}, beta::Number, y::StridedCuVector{T}) where {T}
-    gpu_α = CuRef(convert(T, alpha))
-    gpu_β = CuRef(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     y = gemv!(trans, gpu_α, A, x, gpu_β, y)
     synchronize()
     return y
 end
-function gemv(
-        trans::Char, alpha::CuRefArray{T, CuVector{T, DeviceMemory}},
-        A::StridedCuMatrix{T}, x::StridedCuVector{T}
-    ) where {T}
+function gemv(trans::Char, alpha::CuRefArray{T, CuVector{T, DeviceMemory}}, A::StridedCuMatrix{T}, x::StridedCuVector{T}) where {T}
     return gemv!(trans, alpha, A, x, CuRef{T}(zero(T)), similar(x, size(A, (trans == 'N' ? 1 : 2))))
 end
-function gemv(trans::Char, alpha::Number,
-              A::StridedCuMatrix{T}, x::StridedCuVector{T}) where T
+function gemv(trans::Char, alpha::Number, A::StridedCuMatrix{T}, x::StridedCuVector{T}) where T
     gemv!(trans, alpha, A, x, zero(T), similar(x, size(A, (trans == 'N' ? 1 : 2))))
 end
 # should this be async?
@@ -599,12 +579,12 @@ for (fname, fname_64, eltyin, eltyout, eltyconst) in (
     )
     @eval begin
         function gemv_batched!(trans::Char,
-                               alpha::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
-                               A::Vector{<:StridedCuMatrix{$eltyin}},
-                               x::Vector{<:StridedCuVector{$eltyin}},
-                               beta::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
-                               y::Vector{<:StridedCuVector{$eltyout}}
-                              )
+                               alpha::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
+                               A::Vector{<:StridedCuMatrix{$eltyin}},
+                               x::Vector{<:StridedCuVector{$eltyin}},
+                               beta::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
+                               y::Vector{<:StridedCuVector{$eltyout}}
+                              )
             if length(A) != length(x) || length(A) != length(y)
                 throw(DimensionMismatch("Lengths of inputs must be the same"))
             end
@@ -635,15 +615,15 @@ for (fname, fname_64, eltyin, eltyout, eltyconst) in (
             y
         end
         function gemv_batched!(
-                trans::Char,
-                alpha::Number,
-                A::Vector{<:StridedCuMatrix{$eltyin}},
-                x::Vector{<:StridedCuVector{$eltyin}},
-                beta::Number,
-                y::Vector{<:StridedCuVector{$eltyout}}
-            )
-            gpu_α = CuRef{$eltyconst}(convert($eltyconst, alpha))
-            gpu_β = CuRef{$eltyconst}(convert($eltyconst, beta))
+                trans::Char,
+                alpha::Number,
+                A::Vector{<:StridedCuMatrix{$eltyin}},
+                x::Vector{<:StridedCuVector{$eltyin}},
+                beta::Number,
+                y::Vector{<:StridedCuVector{$eltyout}}
+            )
+            gpu_α = CuRef{$eltyconst}(alpha)
+            gpu_β = CuRef{$eltyconst}(beta)
             y = gemv_batched!(trans, gpu_α, A, x, gpu_β, y)
             synchronize()
             return y
@@ -661,12 +641,12 @@ for (fname, fname_64, eltyin, eltyout, eltyconst) in (
     )
     @eval begin
         function gemv_strided_batched!(trans::Char,
-                                       alpha::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
-                                       A::AbstractArray{$eltyin, 3},
-                                       x::AbstractArray{$eltyin, 2},
-                                       beta::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
-                                       y::AbstractArray{$eltyout, 2}
-                                      )
+                                       alpha::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
+                                       A::AbstractArray{$eltyin, 3},
+                                       x::AbstractArray{$eltyin, 2},
+                                       beta::CuRefArray{$eltyconst, CuVector{$eltyconst, DeviceMemory}},
+                                       y::AbstractArray{$eltyout, 2}
+                                      )
             if size(A, 3) != size(x, 2) || size(A, 3) != size(y, 2)
                 throw(DimensionMismatch("Batch sizes must be equal for all inputs"))
             end
@@ -691,15 +671,15 @@ for (fname, fname_64, eltyin, eltyout, eltyconst) in (
             y
         end
         function gemv_strided_batched!(
-                trans::Char,
-                alpha::Number,
-                A::AbstractArray{$eltyin, 3},
-                x::AbstractArray{$eltyin, 2},
-                beta::Number,
-                y::AbstractArray{$eltyout, 2}
-            )
-            gpu_α = CuRef{$eltyconst}(convert($eltyconst, alpha))
-            gpu_β = CuRef{$eltyconst}(convert($eltyconst, beta))
+                trans::Char,
+                alpha::Number,
+                A::AbstractArray{$eltyin, 3},
+                x::AbstractArray{$eltyin, 2},
+                beta::Number,
+                y::AbstractArray{$eltyout, 2}
+            )
+            gpu_α = CuRef{$eltyconst}(alpha)
+            gpu_β = CuRef{$eltyconst}(beta)
            y = gemv_strided_batched!(trans, gpu_α, A, x, gpu_β, y)
            synchronize()
            return y
@@ -717,10 +697,10 @@ for (fname, fname_64, elty) in ((:cublasDgbmv_v2, :cublasDgbmv_v2_64, :Float64),
                        m::Integer,
                        kl::Integer,
                        ku::Integer,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             n = size(A,2)
             # check dimensions
@@ -736,30 +716,27 @@ for (fname, fname_64, elty) in ((:cublasDgbmv_v2, :cublasDgbmv_v2_64, :Float64),
             end
             y
         end
-        function gbmv!(
-                trans::Char,
-                m::Integer,
-                kl::Integer,
-                ku::Integer,
-                alpha::Number,
-                A::StridedCuMatrix{$elty},
-                x::StridedCuVector{$elty},
-                beta::Number,
-                y::StridedCuVector{$elty}
-            )
+        function gbmv!(trans::Char,
+                       m::Integer,
+                       kl::Integer,
+                       ku::Integer,
+                       alpha::Number,
+                       A::StridedCuMatrix{$elty},
+                       x::StridedCuVector{$elty},
+                       beta::Number,
+                       y::StridedCuVector{$elty}
+                      )
 
-            gpu_α = CuRef{$elty}(convert($elty, alpha))
-            gpu_β = CuRef{$elty}(convert($elty, beta))
+            gpu_α = CuRef{$elty}(alpha)
+            gpu_β = CuRef{$elty}(beta)
             y = gbmv!(trans, m, kl, ku, gpu_α, A, x, gpu_β, y)
             synchronize()
             return y
         end
     end
 end
-function gbmv(
-        trans::Char, m::Integer, kl::Integer, ku::Integer, alpha::CuVector{T},
-        A::StridedCuMatrix{T}, x::StridedCuVector{T}
-    ) where {T}
+function gbmv(trans::Char, m::Integer, kl::Integer, ku::Integer, alpha::CuVector{T},
+              A::StridedCuMatrix{T}, x::StridedCuVector{T}) where {T}
     # TODO: fix gbmv bug in julia
     n = size(A, 2)
     leny = trans == 'N' ? m : n
@@ -782,10 +759,10 @@ for (fname, fname_64, elty) in ((:cublasDspmv_v2, :cublasDspmv_v2_64, :Float64),
                                 (:cublasSspmv_v2, :cublasSspmv_v2_64, :Float32))
     @eval begin
         function spmv!(uplo::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        AP::StridedCuVector{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             n = round(Int, (sqrt(8*length(AP))-1)/2)
             if n != length(x) || n != length(y) throw(DimensionMismatch("")) end
@@ -800,24 +777,21 @@ for (fname, fname_64, elty) in ((:cublasDspmv_v2, :cublasDspmv_v2_64, :Float64),
         end
     end
 end
-function spmv!(
-        uplo::Char,
-        alpha::Number,
-        AP::StridedCuVector{T},
-        x::StridedCuVector{T},
-        beta::Number,
-        y::StridedCuVector{T}
-    ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+function spmv!(uplo::Char,
+               alpha::Number,
+               AP::StridedCuVector{T},
+               x::StridedCuVector{T},
+               beta::Number,
+               y::StridedCuVector{T}
+              ) where {T}
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     y = spmv!(uplo, gpu_α, AP, x, gpu_β, y)
     synchronize()
     return y
 end
-function spmv(
-        uplo::Char, alpha::CuVector{T},
-        AP::StridedCuVector{T}, x::StridedCuVector{T}
-    ) where {T}
+function spmv(uplo::Char, alpha::CuVector{T},
+              AP::StridedCuVector{T}, x::StridedCuVector{T}) where {T}
     return spmv!(uplo, alpha, AP, x, CuRef{T}(zero(T)), similar(x))
 end
 function spmv(uplo::Char, alpha::Number,
@@ -836,10 +810,10 @@ for (fname, fname_64, elty) in ((:cublasDsymv_v2, :cublasDsymv_v2_64, :Float64),
     # Note that the complex symv are not BLAS but auiliary functions in LAPACK
     @eval begin
         function symv!(uplo::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             m, n = size(A)
             if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
@@ -864,8 +838,8 @@ function symv!(
         beta::Number,
         y::StridedCuVector{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     y = symv!(uplo, gpu_α, A, x, gpu_β, y)
     synchronize()
     return y
@@ -890,10 +864,10 @@ for (fname, fname_64, elty) in ((:cublasZhemv_v2, :cublasZhemv_v2_64, :ComplexF6
                                 (:cublasChemv_v2, :cublasChemv_v2_64, :ComplexF32))
     @eval begin
         function hemv!(uplo::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             # TODO: fix dimension check bug in julia
             m, n = size(A)
@@ -919,8 +893,8 @@ function hemv!(
         beta::Number,
         y::StridedCuVector{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     y = hemv!(uplo, gpu_α, A, x, gpu_β, y)
     synchronize()
     return y
@@ -948,10 +922,10 @@ for (fname, fname_64, elty) in ((:cublasDsbmv_v2, :cublasDsbmv_v2_64, :Float64),
     @eval begin
         function sbmv!(uplo::Char,
                        k::Integer,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             m, n = size(A)
             #if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
@@ -979,8 +953,8 @@ function sbmv!(
         beta::Number,
         y::StridedCuVector{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
    y = sbmv!(uplo, k, gpu_α, A, x, gpu_β, y)
    synchronize()
    return y
@@ -1007,10 +981,10 @@ for (fname, fname_64, elty) in ((:cublasZhbmv_v2, :cublasZhbmv_v2_64, :ComplexF6
     @eval begin
         function hbmv!(uplo::Char,
                        k::Integer,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        x::StridedCuVector{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        y::StridedCuVector{$elty})
             m, n = size(A)
             if !(1<=(1+k)<=n) throw(DimensionMismatch("Incorrect number of bands")) end
@@ -1037,8 +1011,8 @@ function hbmv!(
         beta::Number,
         y::StridedCuVector{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     y = hbmv!(uplo, k, gpu_α, A, x, gpu_β, y)
     synchronize()
     return y
@@ -1194,7 +1168,7 @@ for (fname, fname_64, elty) in ((:cublasDger_v2, :cublasDger_v2_64, :Float64),
                                 (:cublasCgerc_v2, :cublasCgerc_v2_64, :ComplexF32))
     @eval begin
         function ger!(
-                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                 x::StridedCuVector{$elty},
                 y::StridedCuVector{$elty},
                 A::StridedCuMatrix{$elty})
@@ -1219,7 +1193,7 @@ function ger!(
         y::StridedCuVector{T},
         A::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     A = ger!(gpu_α, x, y, A)
     synchronize()
     return A
@@ -1230,7 +1204,7 @@ for (fname, fname_64, elty) in ((:cublasDspr_v2, :cublasDspr_v2_64, :Float64),
                                 (:cublasSspr_v2, :cublasSspr_v2_64, :Float32))
     @eval begin
         function spr!(uplo::Char,
-                      alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                      alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                       x::StridedCuVector{$elty},
                       AP::StridedCuVector{$elty})
             n = round(Int, (sqrt(8*length(AP))-1)/2)
@@ -1250,7 +1224,7 @@ function spr!(
         x::StridedCuVector{T},
         AP::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     AP = spr!(gpu_α, x, AP)
     synchronize()
     return AP
@@ -1264,7 +1238,7 @@ for (fname, fname_64, elty) in ((:cublasDsyr_v2, :cublasDsyr_v2_64, :Float64),
                                 (:cublasCsyr_v2, :cublasCsyr_v2_64, :ComplexF32))
     @eval begin
         function syr!(uplo::Char,
-                      alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                      alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                       x::StridedCuVector{$elty},
                       A::StridedCuMatrix{$elty})
             m, n = size(A)
@@ -1287,7 +1261,7 @@ function syr!(
         x::StridedCuVector{T},
         A::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     A = syr!(uplo, gpu_α, x, A)
     synchronize()
     return A
@@ -1300,7 +1274,7 @@ for (fname, fname_64, elty, relty) in (
     )
     @eval begin
         function her!(uplo::Char,
-                      alpha::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
+                      alpha::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
                       x::StridedCuVector{$elty},
                       A::StridedCuMatrix{$elty})
             m, n = size(A)
@@ -1323,7 +1297,7 @@ function her!(
         x::StridedCuVector{T},
         A::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef(convert(real(T), alpha))
+    gpu_α = CuRef{real(T)}(alpha)
     A = her!(uplo, gpu_α, x, A)
     synchronize()
     return A
@@ -1334,11 +1308,11 @@ for (fname, fname_64, elty) in ((:cublasZher2_v2, :cublasZher2_v2_64, :ComplexF6
                                 (:cublasCher2_v2, :cublasCher2_v2_64, :ComplexF32))
     @eval begin
         function her2!(uplo::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
-                       x::StridedCuVector{$elty},
-                       y::StridedCuVector{$elty},
-                       A::StridedCuMatrix{$elty}
-                      )
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       x::StridedCuVector{$elty},
+                       y::StridedCuVector{$elty},
+                       A::StridedCuMatrix{$elty}
+                      )
             m, n = size(A)
             m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square"))
             length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions"))
@@ -1362,7 +1336,7 @@ function her2!(
         y::StridedCuVector{T},
         A::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     A = her2!(uplo, gpu_α, x, y, A)
     synchronize()
     return A
@@ -1378,10 +1352,10 @@ for (fname, fname_64, elty) in ((:cublasDgemm_v2, :cublasDgemm_v2_64, :Float64),
     @eval begin
         function gemm!(transA::Char,
                        transB::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuVecOrMat{$elty},
                        B::StridedCuVecOrMat{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        C::StridedCuVecOrMat{$elty})
             m = size(A, transA == 'N' ? 1 : 2)
             k = size(A, transA == 'N' ? 2 : 1)
@@ -1410,8 +1384,8 @@ function gemm!(
         beta::Number,
         C::StridedCuVecOrMat{T}
    ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = gemm!(transA, transB, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -1519,10 +1493,10 @@ function gemmExComputeType(TA, TB, TC, m, k, n)
 end
 
 function gemmEx!(transA::Char, transB::Char,
-                 @nospecialize(alpha::CuRefArray),
+                 @nospecialize(alpha::CuRefArray),
                  @nospecialize(A::StridedCuVecOrMat),
                  @nospecialize(B::StridedCuVecOrMat),
-                 @nospecialize(beta::CuRefArray),
+                 @nospecialize(beta::CuRefArray),
                  @nospecialize(C::StridedCuVecOrMat);
                  algo::cublasGemmAlgo_t=CUBLAS_GEMM_DEFAULT)
     m = size(A, transA == 'N' ? 1 : 2)
@@ -1577,10 +1551,10 @@ end
 
 # TODO for device mode pointers
 function gemmBatchedEx!(transA::Char, transB::Char,
-                        @nospecialize(alpha::CuRefArray),
+                        @nospecialize(alpha::CuRefArray),
                         @nospecialize(A::Vector{<:StridedCuVecOrMat}),
                         @nospecialize(B::Vector{<:StridedCuVecOrMat}),
-                        @nospecialize(beta::CuRefArray),
+                        @nospecialize(beta::CuRefArray),
                         @nospecialize(C::Vector{<:StridedCuVecOrMat});
                         algo::cublasGemmAlgo_t=CUBLAS_GEMM_DEFAULT)
     if length(A) != length(B) || length(A) != length(C)
@@ -1625,7 +1599,7 @@ end
 
 function gemmBatchedEx!(
         transA::Char, transB::Char,
-        @nospecialize(alpha::Number),
+        @nospecialize(alpha::Number),
        @nospecialize(A::Vector{<:StridedCuVecOrMat}),
        @nospecialize(B::Vector{<:StridedCuVecOrMat}),
        @nospecialize(beta::Number),
@@ -1640,19 +1614,19 @@ function gemmBatchedEx!(
     isnothing(computeType) &&
         throw(ArgumentError("gemmEx does not support $(eltype(C))=$(eltype(A))*$(eltype(B))"))
     computeT = juliaStorageType(eltype(C[1]), computeType)
-    gpu_α = CuRef{computeT}(convert(computeT, alpha))
-    gpu_β = CuRef{computeT}(convert(computeT, beta))
+    gpu_α = CuRef{computeT}(alpha)
+    gpu_β = CuRef{computeT}(beta)
     C = gemmBatchedEx!(transA, transB, gpu_α, A, B, gpu_β, C; algo = algo)
     synchronize()
     return C
 end
 
 function gemmStridedBatchedEx!(
-        transA::Char, transB::Char,
-        @nospecialize(alpha::CuRefArray),
+        transA::Char, transB::Char,
+        @nospecialize(alpha::CuRefArray),
         @nospecialize(A::AbstractArray{Ta, 3}),
         @nospecialize(B::AbstractArray{Tb, 3}),
-        @nospecialize(beta::CuRefArray),
+        @nospecialize(beta::CuRefArray),
         @nospecialize(C::AbstractArray{Tc, 3});
         algo::cublasGemmAlgo_t=CUBLAS_GEMM_DEFAULT) where {Ta, Tb, Tc}
     if size(A, 3) != size(B, 3) || size(A, 3) != size(C, 3)
@@ -1705,8 +1679,8 @@ function gemmStridedBatchedEx!(
     isnothing(computeType) &&
         throw(ArgumentError("gemmEx does not support $(eltype(C))=$(eltype(A))*$(eltype(B))"))
     computeT = juliaStorageType(eltype(C), computeType)
-    gpu_α = CuRef{computeT}(convert(computeT, alpha))
-    gpu_β = CuRef{computeT}(convert(computeT, beta))
+    gpu_α = CuRef{computeT}(alpha)
+    gpu_β = CuRef{computeT}(beta)
     C = gemmStridedBatchedEx!(transA, transB, gpu_α, A, B, gpu_β, C; algo = algo)
     synchronize()
     return C
@@ -1885,16 +1859,16 @@ end
 ## (GE) general matrix-matrix multiplication batched
 for (fname, fname_64, elty) in ((:cublasDgemmBatched, :cublasDgemmBatched_64, :Float64),
                                 (:cublasSgemmBatched, :cublasSgemmBatched_64, :Float32),
-                                (:cublasHgemmBatched, :cublasHgemmBatched, :Float16),
+                                (:cublasHgemmBatched, :cublasHgemmBatched, :Float16),
                                 (:cublasZgemmBatched, :cublasZgemmBatched_64, :ComplexF64),
                                 (:cublasCgemmBatched, :cublasCgemmBatched_64, :ComplexF32))
     @eval begin
         function gemm_batched!(transA::Char,
                                transB::Char,
-                               alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                               alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                                A::Vector{<:StridedCuMatrix{$elty}},
                                B::Vector{<:StridedCuMatrix{$elty}},
-                               beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                               beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                                C::Vector{<:StridedCuMatrix{$elty}})
             if length(A) != length(B) || length(A) != length(C)
                 throw(DimensionMismatch(""))
@@ -1942,8 +1916,8 @@ function gemm_batched!(
         beta::Number,
         C::Vector{<:StridedCuMatrix{T}}
     ) where {T}
-    gpu_α = CuRef(convert(T, alpha))
-    gpu_β = CuRef(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = gemm_batched!(transA, transB, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -1968,16 +1942,16 @@ end
 ## (GE) general matrix-matrix multiplication strided batched
 for (fname, fname_64, elty) in ((:cublasDgemmStridedBatched, :cublasDgemmStridedBatched_64, :Float64),
                                 (:cublasSgemmStridedBatched, :cublasSgemmStridedBatched_64, :Float32),
-                                (:cublasHgemmStridedBatched, :cublasHgemmStridedBatched, :Float16),
+                                (:cublasHgemmStridedBatched, :cublasHgemmStridedBatched, :Float16),
                                 (:cublasZgemmStridedBatched, :cublasZgemmStridedBatched_64, :ComplexF64),
                                 (:cublasCgemmStridedBatched, :cublasCgemmStridedBatched_64, :ComplexF32))
     @eval begin
         function gemm_strided_batched!(transA::Char,
                                        transB::Char,
-                                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                                        A::AbstractArray{$elty, 3}, # allow PermutedDimsArray
                                        B::AbstractArray{$elty, 3},
-                                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                                        C::AbstractArray{$elty, 3})
             m = size(A, transA == 'N' ? 1 : 2)
             k = size(A, transA == 'N' ? 2 : 1)
@@ -2017,8 +1991,8 @@ function gemm_strided_batched!(
         beta::Number,
         C::AbstractArray{T, 3}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = gemm_strided_batched!(transA, transB, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -2057,10 +2031,10 @@ for (fname, fname_64, elty) in ((:cublasDsymm_v2, :cublasDsymm_v2_64, :Float64),
     @eval begin
         function symm!(side::Char,
                        uplo::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        B::StridedCuMatrix{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        C::StridedCuMatrix{$elty})
             k, nA = size(A)
             if k != nA throw(DimensionMismatch("Matrix A must be square")) end
@@ -2090,8 +2064,8 @@ function symm!(
         beta::Number,
         C::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = symm!(side, uplo, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -2119,9 +2093,9 @@ for (fname, fname_64, elty) in ((:cublasDsyrk_v2, :cublasDsyrk_v2_64, :Float64),
     @eval begin
         function syrk!(uplo::Char,
                        trans::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuVecOrMat{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        C::StridedCuMatrix{$elty})
             mC, n = size(C)
             if mC != n throw(DimensionMismatch("C must be square")) end
@@ -2147,8 +2121,8 @@ function syrk!(
         beta::Number,
         C::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = syrk!(uplo, trans, gpu_α, A, gpu_β, C)
     synchronize()
     return C
@@ -2172,10 +2146,10 @@ for (fname, fname_64, elty) in ((:cublasDsyrkx, :cublasDsyrkx_64, :Float64),
     @eval begin
         function syrkx!(uplo::Char,
                         trans::Char,
-                        alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                        alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                         A::StridedCuVecOrMat{$elty},
                         B::StridedCuVecOrMat{$elty},
-                        beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                        beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                         C::StridedCuMatrix{$elty})
             mC, n = size(C)
             if mC != n throw(DimensionMismatch("C must be square")) end
@@ -2203,8 +2177,8 @@ function syrkx!(
         beta::Number,
         C::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = syrkx!(uplo, trans, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -2231,10 +2205,10 @@ for (fname, fname_64, elty) in ((:cublasZhemm_v2, :cublasZhemm_v2_64, :ComplexF6
     @eval begin
         function hemm!(side::Char,
                        uplo::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
                        B::StridedCuMatrix{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        C::StridedCuMatrix{$elty})
             mA, nA = size(A)
             m, n = size(B)
@@ -2264,8 +2238,8 @@ function hemm!(
         beta::Number,
         C::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = hemm!(side, uplo, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -2294,9 +2268,9 @@ for (fname, fname_64, elty, relty) in (
     @eval begin
         function herk!(uplo::Char,
                        trans::Char,
-                       alpha::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
+                       alpha::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
                        A::StridedCuVecOrMat{$elty},
-                       beta::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
+                       beta::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
                        C::StridedCuMatrix{$elty})
             mC, n = size(C)
             if mC != n throw(DimensionMismatch("C must be square")) end
@@ -2320,8 +2294,8 @@ for (fname, fname_64, elty, relty) in (
             beta::Real,
             C::StridedCuMatrix{$elty}
         )
-            gpu_α = CuRef{$relty}(convert($relty, alpha))
-            gpu_β = CuRef{$relty}(convert($relty, beta))
+            gpu_α = CuRef{$relty}(alpha)
+            gpu_β = CuRef{$relty}(beta)
             C = herk!(uplo, trans, gpu_α, A, gpu_β, C)
             synchronize()
             return C
@@ -2353,10 +2327,10 @@ for (fname, fname_64, elty) in ((:cublasDsyr2k_v2, :cublasDsyr2k_v2_64, :Float64
     @eval begin
         function syr2k!(uplo::Char,
                         trans::Char,
-                        alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                        alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                         A::StridedCuVecOrMat{$elty},
                         B::StridedCuVecOrMat{$elty},
-                        beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                        beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                         C::StridedCuMatrix{$elty})
             # TODO: check size of B in julia (syr2k!)
             m, n = size(C)
@@ -2389,8 +2363,8 @@ function syr2k!(
         beta::Number,
         C::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = syr2k!(uplo, trans, gpu_α, A, B, gpu_β, C)
     synchronize()
     return C
@@ -2403,7 +2377,7 @@ function syr2k(
         B::StridedCuVecOrMat{T}
     ) where {T}
     n = size(A, trans == 'N' ? 1 : 2)
-    return syr2k!(uplo, trans, convert(T, alpha), A, B, CuRef{T}(zero(T)), similar(A, T, (n, n)))
+    return syr2k!(uplo, trans, alpha, A, B, CuRef{T}(zero(T)), similar(A, T, (n, n)))
 end
 function syr2k(uplo::Char,
                trans::Char,
@@ -2412,7 +2386,7 @@ function syr2k(uplo::Char,
               B::StridedCuVecOrMat)
     T = eltype(A)
     n = size(A, trans == 'N' ? 1 : 2)
-    syr2k!(uplo, trans, convert(T,alpha), A, B, zero(T), similar(A, T, (n, n)))
+    syr2k!(uplo, trans, convert(T, alpha), A, B, zero(T), similar(A, T, (n, n)))
 end
 function syr2k(uplo::Char, trans::Char, A::StridedCuVecOrMat, B::StridedCuVecOrMat)
     syr2k(uplo, trans, one(eltype(A)), A, B)
@@ -2426,10 +2400,10 @@ for (fname, fname_64, elty, relty) in (
     @eval begin
         function her2k!(uplo::Char,
                         trans::Char,
-                        alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                        alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                         A::StridedCuVecOrMat{$elty},
                         B::StridedCuVecOrMat{$elty},
-                        beta::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
+                        beta::CuRefArray{$relty, CuVector{$relty, DeviceMemory}},
                         C::StridedCuMatrix{$elty})
             # TODO: check size of B in julia (her2k!)
             m, n = size(C)
@@ -2461,8 +2435,8 @@ for (fname, fname_64, elty, relty) in (
             beta::Real,
             C::StridedCuMatrix{$elty}
         )
-            gpu_α = CuRef{$elty}(convert($elty, alpha))
-            gpu_β = CuRef{$relty}(convert($relty, beta))
+            gpu_α = CuRef{$elty}(alpha)
+            gpu_β = CuRef{$relty}(beta)
             C = her2k!(uplo, trans, gpu_α, A, B, gpu_β, C)
             synchronize()
             return C
@@ -2503,7 +2477,7 @@ for (mmname, smname, elty) in
                 uplo::Char,
                 transa::Char,
                 diag::Char,
-                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                 A::StridedCuMatrix{$elty},
                 B::StridedCuMatrix{$elty},
                 C::StridedCuMatrix{$elty})
@@ -2525,7 +2499,7 @@ for (mmname, smname, elty) in
                 uplo::Char,
                 transa::Char,
                 diag::Char,
-                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                 A::StridedCuMatrix{$elty},
                 B::StridedCuMatrix{$elty})
             m, n = size(B)
@@ -2550,7 +2524,7 @@ function trmm!(
         B::StridedCuMatrix{T},
         C::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     C = trmm!(side, uplo, transa, diag, gpu_α, A, B, C)
     synchronize()
     return C
@@ -2564,7 +2538,7 @@ function trsm!(
         A::StridedCuMatrix{T},
         B::StridedCuMatrix{T}
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     B = trsm!(side, uplo, transa, diag, gpu_α, A, B)
     synchronize()
     return B
@@ -2590,7 +2564,7 @@ for (fname, fname_64, elty) in ((:cublasDtrsmBatched, :cublasDtrsmBatched_64, :F
                 uplo::Char,
                 transa::Char,
                 diag::Char,
-                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                 A::Vector{<:StridedCuMatrix{$elty}},
                 B::Vector{<:StridedCuMatrix{$elty}})
             if length(A) != length(B)
@@ -2624,7 +2598,7 @@ function trsm_batched!(
         side::Char, uplo::Char, transa::Char, diag::Char, alpha::Number,
         A::Vector{<:StridedCuMatrix{T}}, B::Vector{<:StridedCuMatrix{T}},
     ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
+    gpu_α = CuRef{T}(alpha)
     B = trsm_batched!(side, uplo, transa, diag, gpu_α, A, B)
     synchronize()
     return B
@@ -2646,9 +2620,9 @@ for (fname, fname_64, elty) in ((:cublasDgeam, :cublasDgeam_64, :Float64),
     @eval begin
         function geam!(transa::Char,
                        transb::Char,
-                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       alpha::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        A::StridedCuMatrix{$elty},
-                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
+                       beta::CuRefArray{$elty, CuVector{$elty, DeviceMemory}},
                        B::StridedCuMatrix{$elty},
                        C::StridedCuMatrix{$elty})
             mA, nA = size(A)
@@ -2679,8 +2653,8 @@ function geam!(
        B::StridedCuMatrix{T},
        C::StridedCuMatrix{T}
    ) where {T}
-    gpu_α = CuRef{T}(convert(T, alpha))
-    gpu_β = CuRef{T}(convert(T, beta))
+    gpu_α = CuRef{T}(alpha)
+    gpu_β = CuRef{T}(beta)
     C = geam!(transa, transb, gpu_α, A, gpu_β, B, C)
     synchronize()
     return C
@@ -2886,8 +2860,8 @@ for (fname, elty) in ((:cublasDgetriBatched, :Float64),
     end
 
     function getri_batched!(n, Aptrs::CuVector{CuPtr{$elty}},
-                            lda, Cptrs::CuVector{CuPtr{$elty}},ldc,
-                            pivotArray::CuArray{Cint})
+                            lda, Cptrs::CuVector{CuPtr{$elty}},ldc,
+                            pivotArray::CuArray{Cint})
         batchSize = length(Aptrs)
         info = CuArray{Cint}(undef, batchSize)
         $fname(handle(), n, Aptrs, lda, pivotArray, Cptrs, ldc, info, batchSize)
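
Usage sketch (reviewer note, not part of the patch): the pattern this diff converges on is handing CUBLAS a device-resident scalar via CuRef{T}(alpha), letting the CuRef constructor perform the conversion that the deleted host-side convert(T, alpha) calls used to do. A minimal Julia example, assuming a CUDA.jl build with this change applied and using only names that appear in the diff; whether CuRef needs qualification as CUDA.CuRef in user code is an assumption here:

    using CUDA, CUDA.CUBLAS

    x = CUDA.rand(Float32, 1024)

    # Scalar wrapped in a CuRef: dispatches to the CuRefArray method,
    # so CUBLAS reads alpha from device memory rather than a host value.
    gpu_alpha = CuRef{Float32}(2f0)
    CUBLAS.scal!(length(x), gpu_alpha, x)

    # Plain-Number method: CuRef{Float32}(2) now does the conversion
    # that the removed explicit convert(T, alpha) used to handle.
    CUBLAS.scal!(length(x), 2, x)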
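On the dropped synchronize() calls in the reduction wrappers (dot, dotc, dotu, nrm2, iamax, iamin): each deleted call sat between the CUBLAS invocation and the device-to-host copy of the one-element result buffer, and the copy performed by Array(gpu_result.x) already orders the read after the kernel on the same stream. That is my reading of why they are redundant; the patch itself does not say. A sketch under the same assumptions as above:

    x = CUDA.rand(Float32, 1024)
    y = CUDA.rand(Float32, 1024)

    # Device-resident result buffer, as the dot wrappers build internally.
    gpu_result = CuRef{Float32}(zero(Float32))
    CUBLAS.dot(length(x), x, y, gpu_result)

    # gpu_result.x is the underlying one-element CuVector; copying it back
    # to the host yields the reduction value, no explicit synchronize needed.
    only(Array(gpu_result.x))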