From ae37ef649f2000b4b7b95e9118f8dafdf645e96e Mon Sep 17 00:00:00 2001 From: Lee Killough Date: Thu, 2 Apr 2020 18:43:34 -0400 Subject: [PATCH] Clean up text files --- .../Current-Release-Notes.rst | 48 +- Deep_learning/Deep-learning.rst | 24 +- Deep_learning/GCN-asm-tutorial.rst | 22 +- Deep_learning/MXNet.rst | 44 +- Deep_learning/caffe.rst | 50 +- Deep_learning/hipCaffe .rst | 62 +- Doxyfile | 2 +- FAQ/FAQ_HIP.rst | 42 +- GCN_ISA_Manuals/GCN-ISA-Manuals.rst | 24 +- GCN_ISA_Manuals/PCIe-features.rst | 50 +- GCN_ISA_Manuals/caffe.rst | 48 +- GCN_ISA_Manuals/testdocbook.rst | 172 +-- Installation_Guide/FAQ-on-Installation.rst | 34 +- Installation_Guide/HCC-Compiler.rst | 2 +- Installation_Guide/HIP.rst | 14 +- Installation_Guide/Installation-Guide.rst | 136 +-- ...ist-of-ROCm-Packages-for-Ubuntu-Fedora.rst | 38 +- .../More-about-how-ROCm-uses-PCIe-Atomics.rst | 52 +- .../Quick Start Installation Guide.rst | 54 +- Installation_Guide/QuickStartGuideOpenCL.rst | 20 +- Installation_Guide/ROC-smi.rst | 32 +- .../ROCK-Kernel-Driver_readme.rst | 2 +- Installation_Guide/ROCR-Runtime.rst | 2 +- Installation_Guide/ROCk-kernel.rst | 10 +- Installation_Guide/atmi.rst | 10 +- Other_Solutions/Other-Solutions.rst | 30 +- Other_Solutions/PCIe-Debug.rst | 24 +- Other_Solutions/ROCm_PCIe_Debug.md | 24 +- Programming_Guides/CUDAAPIHIP.rst | 4 +- Programming_Guides/CUDAAPIHIPTEXTURE.rst | 2 +- Programming_Guides/HIP-FAQ.rst | 6 +- Programming_Guides/HIP-GUIDE.rst | 508 ++++---- Programming_Guides/HIP-porting-guide.rst | 160 +-- Programming_Guides/HIP_Debugging.rst | 32 +- Programming_Guides/Kernel_language.rst | 596 +++++----- Programming_Guides/LanguageInto.rst | 14 +- Programming_Guides/Opencl-optimization.rst | 582 +++++----- .../Opencl-programming-guide.rst | 640 +++++----- Programming_Guides/Programming-Guides.rst | 24 +- Programming_Guides/hcc-guide.rst | 2 +- Programming_Guides/hcc-profile.rst | 8 +- Programming_Guides/hip-programming-guide.rst | 4 +- Programming_Guides/hip-programming.rst | 6 +- Programming_Guides/hip_install.rst | 4 +- Programming_Guides/hip_port.rst | 14 +- Programming_Guides/hip_profiling.rst | 58 +- Programming_Guides/hipporting-driver-api.rst | 84 +- README.md | 6 +- ROCm.rst | 36 +- ROCm_API_References/BLAS1.rst | 16 +- ROCm_API_References/BLAS2.rst | 20 +- ROCm_API_References/BLAS3.rst | 12 +- ROCm_API_References/HCC-API.rst | 4 +- ROCm_API_References/HIP-MATH.rst | 1012 ++++++++-------- .../HIP_API/Context-Management.rst | 50 +- ROCm_API_References/HIP_API/Control.rst | 4 +- .../HIP_API/Device-Memory-Access.rst | 8 +- .../HIP_API/Device-management.rst | 24 +- ROCm_API_References/HIP_API/Error.rst | 12 +- .../HIP_API/Event-Management.rst | 10 +- .../HIP_API/Initialization-and-Version.rst | 24 +- .../HIP_API/Memory-Management.rst | 40 +- .../HIP_API/Stream-Management.rst | 16 +- ROCm_API_References/ROCr-API.rst | 10 +- ROCm_API_References/Thrust.rst | 44 +- ROCm_API_References/api.rst | 8 +- ROCm_API_References/clBLAS.rst | 8 +- ROCm_API_References/clSPARSE_API.rst | 6 +- ROCm_API_References/clSPARSE_api.rst | 10 +- ROCm_API_References/rocBLAS.rst | 4 +- ROCm_Audio_Video_Tutorials/ROCm_videos.rst | 4 +- ROCm_Compiler_SDK/ROCm-Codeobj-format.rst | 36 +- ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst | 4 +- ROCm_Compiler_SDK/ROCm-Native-ISA.rst | 462 ++++---- ROCm_Compiler_SDK/ocml.rst | 30 +- ROCm_Glossary/ROCm-Glossary.rst | 4 +- ROCm_Libraries/ROCm_Libraries.rst | 192 +-- ROCm_Libraries/dep-lib.rst | 80 +- ROCm_Libraries/hipsparse_wiki.rst | 84 +- ROCm_Libraries/rocALUTION/Doxyfile | 
126 +- .../rocALUTION/src/base/base_matrix.hpp | 2 +- .../rocALUTION/src/base/host/CMakeLists.txt | 2 +- .../src/base/host/host_matrix_csr.cpp | 2 +- .../solvers/multigrid/ruge_stueben_amg.cpp | 4 +- .../preconditioners/preconditioner.hpp | 2 +- .../preconditioners/preconditioner_ai.hpp | 4 +- ROCm_Libraries/rocBLAS/Doxyfile | 124 +- .../rocBLAS/src/include/rocblas-functions.h | 14 +- .../rocBLAS/src/src/blas1/rocblas_copy.cpp | 2 +- .../rocBLAS/src/src/blas1/rocblas_scal.cpp | 2 +- .../rocBLAS/src/src/blas1/rocblas_swap.cpp | 2 +- .../src/src/blas_ex/rocblas_gemm_ex.hpp | 32 +- ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp | 82 +- ROCm_Libraries/rocFFT/Doxyfile | 124 +- ROCm_Libraries/rocSOLVER/API.rst | 8 +- ROCm_Libraries/rocSOLVER/Doxyfile | 124 +- ROCm_Libraries/rocSOLVER/Introduction.rst | 84 +- ROCm_Libraries/rocSOLVER/Jenkinsfile | 10 +- ROCm_Libraries/rocSOLVER/LICENSE.md | 2 +- .../rocSOLVER/bump_develop_version.sh | 4 +- .../rocSOLVER/bump_master_version.sh | 2 +- .../rocSOLVER/cmake/get-cli-arguments.cmake | 2 +- ROCm_Libraries/rocSOLVER/debian/postinst | 1 - ROCm_Libraries/rocSOLVER/debian/prerm | 1 - .../rocSOLVER/deps/external-lapack.cmake | 2 +- ROCm_Libraries/rocSOLVER/docs/Doxyfile | 124 +- .../library/include/rocsolver-functions.h | 1026 ++++++++--------- .../docs/library/include/rocsolver-types.h | 20 +- .../rocSOLVER/docs/library/src/CMakeLists.txt | 2 +- .../src/auxiliary/rocauxiliary_larf.cpp | 10 +- .../src/auxiliary/rocauxiliary_larf.hpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.cpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.hpp | 58 +- .../src/auxiliary/rocauxiliary_larfg.cpp | 2 +- .../src/auxiliary/rocauxiliary_larfg.hpp | 16 +- .../src/auxiliary/rocauxiliary_larft.cpp | 6 +- .../src/auxiliary/rocauxiliary_larft.hpp | 38 +- .../src/auxiliary/rocauxiliary_laswp.cpp | 4 +- .../src/auxiliary/rocauxiliary_laswp.hpp | 6 +- .../src/auxiliary/rocauxiliary_org2r.cpp | 2 +- .../src/auxiliary/rocauxiliary_org2r.hpp | 32 +- .../src/auxiliary/rocauxiliary_orgbr.cpp | 4 +- .../src/auxiliary/rocauxiliary_orgbr.hpp | 70 +- .../src/auxiliary/rocauxiliary_orgl2.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgl2.hpp | 32 +- .../src/auxiliary/rocauxiliary_orglq.cpp | 2 +- .../src/auxiliary/rocauxiliary_orglq.hpp | 40 +- .../src/auxiliary/rocauxiliary_orgqr.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgqr.hpp | 40 +- .../src/auxiliary/rocauxiliary_orm2r.cpp | 6 +- .../src/auxiliary/rocauxiliary_orm2r.hpp | 18 +- .../src/auxiliary/rocauxiliary_ormqr.cpp | 6 +- .../src/auxiliary/rocauxiliary_ormqr.hpp | 14 +- .../docs/library/src/common/rocblas.cpp | 2 +- .../library/src/include/common_device.hpp | 8 +- .../docs/library/src/include/ideal_sizes.hpp | 2 +- .../src/include/rocsolver_unique_ptr.hpp | 48 +- .../library/src/lapack/roclapack_gelq2.cpp | 14 +- .../library/src/lapack/roclapack_gelq2.hpp | 18 +- .../src/lapack/roclapack_gelq2_batched.cpp | 14 +- .../roclapack_gelq2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.hpp | 24 +- .../src/lapack/roclapack_gelqf_batched.cpp | 14 +- .../roclapack_gelqf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.hpp | 18 +- .../src/lapack/roclapack_geqr2_batched.cpp | 14 +- .../roclapack_geqr2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.hpp | 24 +- .../src/lapack/roclapack_geqrf_batched.cpp | 14 +- .../roclapack_geqrf_strided_batched.cpp 
| 14 +- .../library/src/lapack/roclapack_getf2.cpp | 18 +- .../library/src/lapack/roclapack_getf2.hpp | 22 +- .../src/lapack/roclapack_getf2_batched.cpp | 18 +- .../roclapack_getf2_strided_batched.cpp | 20 +- .../library/src/lapack/roclapack_getrf.cpp | 14 +- .../library/src/lapack/roclapack_getrf.hpp | 14 +- .../src/lapack/roclapack_getrf_batched.cpp | 16 +- .../roclapack_getrf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_getrs.cpp | 16 +- .../library/src/lapack/roclapack_getrs.hpp | 8 +- .../src/lapack/roclapack_getrs_batched.cpp | 18 +- .../roclapack_getrs_strided_batched.cpp | 18 +- .../library/src/lapack/roclapack_potf2.cpp | 12 +- .../library/src/lapack/roclapack_potf2.hpp | 22 +- .../src/lapack/roclapack_potf2_batched.cpp | 14 +- .../roclapack_potf2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_potrf.cpp | 12 +- .../library/src/lapack/roclapack_potrf.hpp | 28 +- .../src/lapack/roclapack_potrf_batched.cpp | 14 +- .../roclapack_potrf_strided_batched.cpp | 14 +- .../library/src/rocsolver-config.cmake.in | 2 +- ROCm_Libraries/rocSOLVER/docs/source/api.rst | 8 +- .../rocSOLVER/docs/source/index.rst | 4 +- .../rocSOLVER/docs/source/library.rst | 90 +- ROCm_Libraries/rocSOLVER/index.rst | 4 +- .../library/include/rocsolver-functions.h | 1026 ++++++++--------- .../library/include/rocsolver-types.h | 20 +- .../rocSOLVER/library/src/CMakeLists.txt | 2 +- .../src/auxiliary/rocauxiliary_larf.cpp | 10 +- .../src/auxiliary/rocauxiliary_larf.hpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.cpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.hpp | 58 +- .../src/auxiliary/rocauxiliary_larfg.cpp | 2 +- .../src/auxiliary/rocauxiliary_larfg.hpp | 16 +- .../src/auxiliary/rocauxiliary_larft.cpp | 6 +- .../src/auxiliary/rocauxiliary_larft.hpp | 38 +- .../src/auxiliary/rocauxiliary_laswp.cpp | 4 +- .../src/auxiliary/rocauxiliary_laswp.hpp | 6 +- .../src/auxiliary/rocauxiliary_org2r.cpp | 2 +- .../src/auxiliary/rocauxiliary_org2r.hpp | 32 +- .../src/auxiliary/rocauxiliary_orgbr.cpp | 4 +- .../src/auxiliary/rocauxiliary_orgbr.hpp | 70 +- .../src/auxiliary/rocauxiliary_orgl2.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgl2.hpp | 32 +- .../src/auxiliary/rocauxiliary_orglq.cpp | 2 +- .../src/auxiliary/rocauxiliary_orglq.hpp | 40 +- .../src/auxiliary/rocauxiliary_orgqr.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgqr.hpp | 40 +- .../src/auxiliary/rocauxiliary_orm2r.cpp | 6 +- .../src/auxiliary/rocauxiliary_orm2r.hpp | 18 +- .../src/auxiliary/rocauxiliary_ormqr.cpp | 6 +- .../src/auxiliary/rocauxiliary_ormqr.hpp | 14 +- .../rocSOLVER/library/src/common/rocblas.cpp | 2 +- .../library/src/include/common_device.hpp | 8 +- .../library/src/include/ideal_sizes.hpp | 2 +- .../src/include/rocsolver_unique_ptr.hpp | 48 +- .../library/src/lapack/roclapack_gelq2.cpp | 14 +- .../library/src/lapack/roclapack_gelq2.hpp | 18 +- .../src/lapack/roclapack_gelq2_batched.cpp | 14 +- .../roclapack_gelq2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.hpp | 24 +- .../src/lapack/roclapack_gelqf_batched.cpp | 14 +- .../roclapack_gelqf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.hpp | 18 +- .../src/lapack/roclapack_geqr2_batched.cpp | 14 +- .../roclapack_geqr2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.hpp | 24 +- .../src/lapack/roclapack_geqrf_batched.cpp | 14 +- 
.../roclapack_geqrf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_getf2.cpp | 18 +- .../library/src/lapack/roclapack_getf2.hpp | 22 +- .../src/lapack/roclapack_getf2_batched.cpp | 18 +- .../roclapack_getf2_strided_batched.cpp | 20 +- .../library/src/lapack/roclapack_getrf.cpp | 14 +- .../library/src/lapack/roclapack_getrf.hpp | 14 +- .../src/lapack/roclapack_getrf_batched.cpp | 16 +- .../roclapack_getrf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_getrs.cpp | 16 +- .../library/src/lapack/roclapack_getrs.hpp | 8 +- .../src/lapack/roclapack_getrs_batched.cpp | 18 +- .../roclapack_getrs_strided_batched.cpp | 18 +- .../library/src/lapack/roclapack_potf2.cpp | 12 +- .../library/src/lapack/roclapack_potf2.hpp | 22 +- .../src/lapack/roclapack_potf2_batched.cpp | 14 +- .../roclapack_potf2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_potrf.cpp | 12 +- .../library/src/lapack/roclapack_potrf.hpp | 28 +- .../src/lapack/roclapack_potrf_batched.cpp | 14 +- .../roclapack_potrf_strided_batched.cpp | 14 +- .../library/src/rocsolver-config.cmake.in | 2 +- ROCm_Libraries/rocSOLVER/src/CMakeLists.txt | 2 +- .../src/auxiliary/rocauxiliary_larf.cpp | 10 +- .../src/auxiliary/rocauxiliary_larf.hpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.cpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.hpp | 58 +- .../src/auxiliary/rocauxiliary_larfg.cpp | 2 +- .../src/auxiliary/rocauxiliary_larfg.hpp | 16 +- .../src/auxiliary/rocauxiliary_larft.cpp | 6 +- .../src/auxiliary/rocauxiliary_larft.hpp | 38 +- .../src/auxiliary/rocauxiliary_laswp.cpp | 4 +- .../src/auxiliary/rocauxiliary_laswp.hpp | 6 +- .../src/auxiliary/rocauxiliary_org2r.cpp | 2 +- .../src/auxiliary/rocauxiliary_org2r.hpp | 32 +- .../src/auxiliary/rocauxiliary_orgbr.cpp | 4 +- .../src/auxiliary/rocauxiliary_orgbr.hpp | 70 +- .../src/auxiliary/rocauxiliary_orgl2.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgl2.hpp | 32 +- .../src/auxiliary/rocauxiliary_orglq.cpp | 2 +- .../src/auxiliary/rocauxiliary_orglq.hpp | 40 +- .../src/auxiliary/rocauxiliary_orgqr.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgqr.hpp | 40 +- .../src/auxiliary/rocauxiliary_orm2r.cpp | 6 +- .../src/auxiliary/rocauxiliary_orm2r.hpp | 18 +- .../src/auxiliary/rocauxiliary_ormqr.cpp | 6 +- .../src/auxiliary/rocauxiliary_ormqr.hpp | 14 +- .../rocSOLVER/src/common/rocblas.cpp | 2 +- .../rocSOLVER/src/include/common_device.hpp | 8 +- .../rocSOLVER/src/include/ideal_sizes.hpp | 2 +- .../src/include/rocsolver_unique_ptr.hpp | 48 +- .../rocSOLVER/src/lapack/roclapack_gelq2.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_gelq2.hpp | 18 +- .../src/lapack/roclapack_gelq2_batched.cpp | 14 +- .../roclapack_gelq2_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_gelqf.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_gelqf.hpp | 24 +- .../src/lapack/roclapack_gelqf_batched.cpp | 14 +- .../roclapack_gelqf_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqr2.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqr2.hpp | 18 +- .../src/lapack/roclapack_geqr2_batched.cpp | 14 +- .../roclapack_geqr2_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqrf.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqrf.hpp | 24 +- .../src/lapack/roclapack_geqrf_batched.cpp | 14 +- .../roclapack_geqrf_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_getf2.cpp | 18 +- .../rocSOLVER/src/lapack/roclapack_getf2.hpp | 22 +- .../src/lapack/roclapack_getf2_batched.cpp | 18 +- .../roclapack_getf2_strided_batched.cpp | 20 +- 
.../rocSOLVER/src/lapack/roclapack_getrf.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_getrf.hpp | 14 +- .../src/lapack/roclapack_getrf_batched.cpp | 16 +- .../roclapack_getrf_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_getrs.cpp | 16 +- .../rocSOLVER/src/lapack/roclapack_getrs.hpp | 8 +- .../src/lapack/roclapack_getrs_batched.cpp | 18 +- .../roclapack_getrs_strided_batched.cpp | 18 +- .../rocSOLVER/src/lapack/roclapack_potf2.cpp | 12 +- .../rocSOLVER/src/lapack/roclapack_potf2.hpp | 22 +- .../src/lapack/roclapack_potf2_batched.cpp | 14 +- .../roclapack_potf2_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_potrf.cpp | 12 +- .../rocSOLVER/src/lapack/roclapack_potrf.hpp | 28 +- .../src/lapack/roclapack_potrf_batched.cpp | 14 +- .../roclapack_potrf_strided_batched.cpp | 14 +- .../rocSOLVER/src/rocsolver-config.cmake.in | 2 +- ROCm_Libraries/rocSPARSE/Doxyfile | 124 +- ROCm_Libraries/rocr/Doxyfile | 122 +- ROCm_Libraries/rocr/src/README.md | 4 +- .../src/cmake_modules/COPYING-CMAKE-SCRIPTS | 2 +- .../rocr/src/cmake_modules/utils.cmake | 12 +- ROCm_Libraries/rocr/src/core/common/shared.h | 16 +- ROCm_Libraries/rocr/src/core/inc/agent.h | 16 +- .../rocr/src/core/inc/amd_blit_kernel.h | 16 +- .../rocr/src/core/inc/amd_blit_sdma.h | 16 +- .../rocr/src/core/inc/amd_cpu_agent.h | 16 +- .../rocr/src/core/inc/amd_elf_image.hpp | 16 +- .../rocr/src/core/inc/amd_gpu_agent.h | 16 +- .../rocr/src/core/inc/amd_hsa_code.hpp | 16 +- .../rocr/src/core/inc/amd_hsa_loader.hpp | 16 +- .../rocr/src/core/inc/amd_loader_context.hpp | 16 +- .../rocr/src/core/inc/amd_memory_region.h | 16 +- .../rocr/src/core/inc/amd_topology.h | 16 +- ROCm_Libraries/rocr/src/core/inc/blit.h | 16 +- ROCm_Libraries/rocr/src/core/inc/checked.h | 16 +- .../rocr/src/core/inc/hsa_api_trace_int.h | 16 +- .../rocr/src/core/inc/hsa_ext_interface.h | 16 +- .../rocr/src/core/inc/hsa_internal.h | 16 +- .../rocr/src/core/inc/hsa_table_interface.h | 16 +- ROCm_Libraries/rocr/src/core/inc/isa.h | 16 +- .../rocr/src/core/inc/memory_region.h | 16 +- ROCm_Libraries/rocr/src/core/inc/registers.h | 16 +- .../rocr/src/core/runtime/amd_blit_kernel.cpp | 16 +- .../rocr/src/core/runtime/amd_cpu_agent.cpp | 22 +- .../rocr/src/core/runtime/amd_gpu_agent.cpp | 8 +- .../src/core/runtime/amd_loader_context.cpp | 16 +- .../src/core/runtime/amd_memory_region.cpp | 16 +- .../src/core/runtime/hsa_ext_interface.cpp | 40 +- .../src/core/runtime/interrupt_signal.cpp | 2 +- ROCm_Libraries/rocr/src/core/runtime/isa.cpp | 16 +- .../rocr/src/core/runtime/runtime.cpp | 2 +- .../rocr/src/core/runtime/signal.cpp | 16 +- .../rocr/src/core/util/atomic_helpers.h | 16 +- .../rocr/src/core/util/lnx/os_linux.cpp | 16 +- ROCm_Libraries/rocr/src/core/util/locks.h | 16 +- ROCm_Libraries/rocr/src/core/util/os.h | 16 +- .../rocr/src/core/util/small_heap.cpp | 16 +- .../rocr/src/core/util/small_heap.h | 18 +- ROCm_Libraries/rocr/src/core/util/timer.cpp | 16 +- ROCm_Libraries/rocr/src/core/util/timer.h | 16 +- ROCm_Libraries/rocr/src/core/util/utils.h | 16 +- ROCm_Libraries/rocr/src/inc/amd_hsa_common.h | 16 +- ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h | 16 +- .../rocr/src/inc/amd_hsa_kernel_code.h | 16 +- ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h | 16 +- ROCm_Libraries/rocr/src/inc/hsa.h | 22 +- ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h | 2 +- .../rocr/src/inc/hsa_ext_finalize.h | 16 +- ROCm_Libraries/rocr/src/inc/hsa_ext_image.h | 28 +- .../rocr/src/inc/hsa_ven_amd_aqlprofile.h | 10 +- .../rocr/src/libamdhsacode/amd_elf_image.cpp | 2 +- 
ROCm_Libraries/rocr/src/loader/loaders.hpp | 2 +- ROCm_Network_Based_Programing/ROCm_RDMA.rst | 4 +- ROCm_Solutions/ROCr_Error_Codes.rst | 2 +- .../ROCm-System-Managment.rst | 126 +- ROCm_System_Managment/topo1.rst | 2 +- ROCm_System_Managment/topo2.rst | 44 +- ROCm_Tools/HCC-Native-GCN-ISA.rst | 10 +- ROCm_Tools/HCC_WIKI.rst | 28 +- ROCm_Tools/ROCm-Tools.rst | 330 +++--- ROCm_Tools/clBLA.rst | 4 +- ROCm_Tools/clFFT.rst | 4 +- ROCm_Tools/clRNG.rst | 8 +- ROCm_Tools/clSPARSE.rst | 8 +- ROCm_Tools/hcFFT.rst | 20 +- ROCm_Tools/hcRNG.rst | 32 +- ROCm_Tools/hipBLAS.rst | 2 +- ROCm_Tools/hipeigen.rst | 4 +- ROCm_Tools/hipinstall.rst | 8 +- ROCm_Tools/rocFFT.rst | 2 +- ROCm_Tools/rocFFTAPI.rst | 10 +- ROCm_Tools/rocblaswiki.rst | 18 +- ROCm_Tools/rocm-debug.rst | 16 +- ROCm_Tools/tensile.rst | 18 +- ROCm_Tools/tutorial.rst | 16 +- .../ROCm-Virtualization-&-Containers.rst | 46 +- .../ROCm-Virtualization-&-Containers.rst~ | 262 ----- ROCm_Virtualization_Containers/quickstart.rst | 2 +- Remote_Device_Programming/Memoryhooks.rst | 2 +- .../Performancemeasurement.rst | 6 +- Remote_Device_Programming/PrintUCXinfo.rst | 10 +- .../Remote-Device-Programming.rst | 82 +- Remote_Device_Programming/UCP-Design.rst | 2 +- Remote_Device_Programming/UCT-Design.rst | 4 +- Remote_Device_Programming/logging.rst | 4 +- Remote_Device_Programming/profiling.rst | 4 +- Remote_Device_Programming/reference | 2 +- .../sideprogresscompletion.rst | 6 +- Tutorial/GCN-asm-tutorial.rst | 22 +- Tutorial/Optimizing-Dispatches.rst | 14 +- Tutorial/ROCm-MultiGPU.rst | 4 +- Tutorial/Tutorial.rst | 6 +- Tutorial/caffe.rst | 50 +- Tutorial/hipCaffe .rst | 62 +- Tutorial/rocncloc.rst | 30 +- _templates/breadcrumbs.html | 2 +- amdstyles.css | 16 +- cleanup_text.sh | 45 + conf.py | 8 +- index.rst | 30 +- outline.rst | 40 +- 417 files changed, 7359 insertions(+), 7578 deletions(-) delete mode 100644 ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ create mode 100755 cleanup_text.sh diff --git a/Current_Release_Notes/Current-Release-Notes.rst b/Current_Release_Notes/Current-Release-Notes.rst index 55f644a8..a513330a 100644 --- a/Current_Release_Notes/Current-Release-Notes.rst +++ b/Current_Release_Notes/Current-Release-Notes.rst @@ -10,7 +10,7 @@ April 1st, 2020 What Is ROCm? ============== -ROCm is designed to be a universal platform for gpu-accelerated computing. This modular design allows hardware vendors to build drivers that support the ROCm framework. ROCm is also designed to integrate multiple programming languages and makes it easy to add support for other languages. +ROCm is designed to be a universal platform for gpu-accelerated computing. This modular design allows hardware vendors to build drivers that support the ROCm framework. ROCm is also designed to integrate multiple programming languages and makes it easy to add support for other languages. Note: You can also clone the source code for individual ROCm components from the GitHub repositories. 
@@ -20,13 +20,13 @@ ROCm Components The following components for the ROCm platform are released and available for the v3.3 release: -• Drivers +o Drivers -• Tools +o Tools -• Libraries +o Libraries -• Source Code +o Source Code You can access the latest supported version of drivers, tools, libraries, and source code for the ROCm platform at the following location: https://github.com/RadeonOpenCompute/ROCm @@ -44,7 +44,7 @@ The ROCm v3.3.x platform is designed to support the following operating systems: * RHEL v7.7 (Using devtoolset-7 runtime support) -* SLES 15 SP1 +* SLES 15 SP1 What\'s New in This Release @@ -55,16 +55,16 @@ What\'s New in This Release Users can install and access multiple versions of the ROCm toolkit simultaneously. -Previously, users could install only a single version of the ROCm toolkit. +Previously, users could install only a single version of the ROCm toolkit. Now, users have the option to install multiple versions simultaneously and toggle to the desired version of the ROCm toolkit. From the v3.3 release, multiple versions of ROCm packages can be installed in the */opt/rocm-* folder. - + **Prerequisites** ############################### Ensure the existing installations of ROCm, including */opt/rocm*, are completely removed before the v3.3 ROCm toolkit installation. The ROCm v3.3 package requires a clean installation. -* To install a single instance of ROCm, use the rocm-dkms or rocm-dev packages to install all the required components. This creates a symbolic link */opt/rocm* pointing to the corresponding version of ROCm installed on the system. +* To install a single instance of ROCm, use the rocm-dkms or rocm-dev packages to install all the required components. This creates a symbolic link */opt/rocm* pointing to the corresponding version of ROCm installed on the system. * To install individual ROCm components, create the */opt/rocm* symbolic link pointing to the version of ROCm installed on the system. For example, *# ln -s /opt/rocm-3.3.0 /opt/rocm* @@ -82,7 +82,7 @@ Review the following important notes: To install a single instance of the ROCm package, access the non-versioned packages. You must not install any components from the multi-instance set. -For example, +For example, * rocm-dkms @@ -96,7 +96,7 @@ A fresh installation or an upgrade of the single-version installation will remov **Multi Version Installation** -* To install a multi-instance of the ROCm package, access the versioned packages and components. +* To install a multi-instance of the ROCm package, access the versioned packages and components. For example, @@ -118,19 +118,19 @@ For example, .. image:: /Current_Release_Notes/MultiIns.png -**IMPORTANT**: A single instance ROCm package cannot co-exist with the multi-instance package. +**IMPORTANT**: A single instance ROCm package cannot co-exist with the multi-instance package. -**NOTE**: The multi-instance installation applies only to ROCm v3.3 and above. This package requires a fresh installation after the complete removal of existing ROCm packages. The multi-version installation is not backward compatible. +**NOTE**: The multi-instance installation applies only to ROCm v3.3 and above. This package requires a fresh installation after the complete removal of existing ROCm packages. The multi-version installation is not backward compatible. **GPU Process Information** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A new functionality to display process information for GPUs is available in this release. 
For example, you can view the process details to determine if the GPU(s) must be reset. +A new functionality to display process information for GPUs is available in this release. For example, you can view the process details to determine if the GPU(s) must be reset. To display the GPU process details, you can: -* Invoke the API +* Invoke the API or @@ -143,15 +143,15 @@ https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/docs/ROCm_SMI_Manu **Support for 3D Pooling Layers** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -AMD ROCm is enhanced to include support for 3D pooling layers. The implementation of 3D pooling layers now allows users to run 3D convolutional networks, such as ResNext3D, on AMD Radeon Instinct GPUs. +AMD ROCm is enhanced to include support for 3D pooling layers. The implementation of 3D pooling layers now allows users to run 3D convolutional networks, such as ResNext3D, on AMD Radeon Instinct GPUs. **ONNX Enhancements** ~~~~~~~~~~~~~~~~~~~~~~~~~ -Open Neural Network eXchange (ONNX) is a widely-used neural net exchange format. The AMD model compiler & optimizer support the pre-trained models in ONNX, NNEF, & Caffe formats. Currently, ONNX versions 1.3 and below are supported. +Open Neural Network eXchange (ONNX) is a widely-used neural net exchange format. The AMD model compiler & optimizer support the pre-trained models in ONNX, NNEF, & Caffe formats. Currently, ONNX versions 1.3 and below are supported. -The AMD Neural Net Intermediate Representation (NNIR) is enhanced to handle the rapidly changing ONNX versions and its layers. +The AMD Neural Net Intermediate Representation (NNIR) is enhanced to handle the rapidly changing ONNX versions and its layers. .. image:: /Current_Release_Notes/onnx.png @@ -164,12 +164,12 @@ Code Object Manager (Comgr) Functions The following Code Object Manager (Comgr) functions are deprecated. -* `amd_comgr_action_info_set_options` -* `amd_comgr_action_info_get_options` +* `amd_comgr_action_info_set_options` +* `amd_comgr_action_info_get_options` -These functions were originally deprecated in version 1.3 of the Comgr library as they no longer support options with embedded spaces. +These functions were originally deprecated in version 1.3 of the Comgr library as they no longer support options with embedded spaces. -The deprecated functions are now replaced with the array-oriented options API, which includes +The deprecated functions are now replaced with the array-oriented options API, which includes * `amd_comgr_action_info_set_option_list` * `amd_comgr_action_info_get_option_list_count` @@ -179,9 +179,9 @@ The deprecated functions are now replaced with the array-oriented options API, w Hardware and Software Support Information ========================================== -AMD ROCm is focused on using AMD GPUs to accelerate computational tasks such as machine learning, engineering workloads, and scientific computing. In order to focus our development efforts on these domains of interest, ROCm supports a targeted set of hardware configurations. +AMD ROCm is focused on using AMD GPUs to accelerate computational tasks such as machine learning, engineering workloads, and scientific computing. In order to focus our development efforts on these domains of interest, ROCm supports a targeted set of hardware configurations. 
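As a minimal sketch of the array-oriented Comgr options API listed above, the snippet below sets an option list and reads back its count. The header name, the exact signatures, and the surrounding create/destroy calls are assumptions for illustration; only the set/get option-list function names come from the release notes::

    #include "amd_comgr.h"   // assumed header name for the Comgr library

    void set_comgr_options()
    {
        amd_comgr_action_info_t info;
        amd_comgr_create_action_info(&info);

        // Each option is a separate array element, so embedded spaces are no
        // longer a problem (illustrative options, not taken from the notes).
        const char *options[] = {"-mllvm", "-amdgpu-early-inline-all"};
        amd_comgr_action_info_set_option_list(info, options, 2);

        size_t count = 0;
        amd_comgr_action_info_get_option_list_count(info, &count);  // count == 2

        amd_comgr_destroy_action_info(info);
    }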
-For more information, see +For more information, see https://github.com/RadeonOpenCompute/ROCm diff --git a/Deep_learning/Deep-learning.rst b/Deep_learning/Deep-learning.rst index 5a6b65a3..20c9f43d 100644 --- a/Deep_learning/Deep-learning.rst +++ b/Deep_learning/Deep-learning.rst @@ -13,17 +13,17 @@ ROCm Tensorflow v1.14 Release We are excited to announce the release of ROCm enabled TensorFlow v1.14 for AMD GPUs. In this release we have the following features enabled on top of upstream TF1.14 enhancements: * We integrated ROCm RCCL library for mGPU communication, details in `RCCL github repo `_ - * XLA backend is enabled for AMD GPUs, the functionality is complete, performance optimization is in progress. + * XLA backend is enabled for AMD GPUs, the functionality is complete, performance optimization is in progress. ROCm Tensorflow v2.0.0-beta1 Release ***************************** In addition to Tensorflow v1.14 release, we also enabled Tensorflow v2.0.0-beta1 for AMD GPUs. The TF-ROCm 2.0.0-beta1 release supports Tensorflow V2 API. -Both whl packages and docker containers are available below. +Both whl packages and docker containers are available below. Tensorflow Installation *********************** -First, you’ll need to install the open-source ROCm 3.0 stack. Details can be found `here `_ +First, you'll need to install the open-source ROCm 3.0 stack. Details can be found `here `_ Then, install these other relevant ROCm packages: @@ -50,10 +50,10 @@ MIOpen ROCm MIOpen v2.0.1 Release ************************* -Announcing our new Foundation for Deep Learning acceleration MIOpen 2.0 which introduces support for Convolution Neural Network (CNN) acceleration — built to run on top of the ROCm software stack! +Announcing our new Foundation for Deep Learning acceleration MIOpen 2.0 which introduces support for Convolution Neural Network (CNN) acceleration -- built to run on top of the ROCm software stack! This release includes the following: - + * This release contains bug fixes and performance improvements. * Additionally, the convolution algorithm Implicit GEMM is now enabled by default * Known issues: @@ -81,7 +81,7 @@ The `porting guide `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a `previous blog `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. 
- * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: :: @@ -28,7 +28,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. 
The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: :: @@ -50,7 +50,7 @@ Formal HSA arguments are passed to a kernel using a special read-only memory seg aql->kernarg_address = args; /* * Write the args directly to the kernargs buffer; - * the code assumes that memory is already allocated for the + * the code assumes that memory is already allocated for the * buffers that in_ptr, index_ptr and out_ptr point to */ args->in = in_ptr; @@ -71,9 +71,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -91,7 +91,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru .text .p2align 8 .amdgpu_hsa_kernel hello_world - + hello_world: .amd_kernel_code_t @@ -131,7 +131,7 @@ Currently, a programmer must manually set all non-default values to provide the The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: +The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. 
The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: :: diff --git a/Deep_learning/MXNet.rst b/Deep_learning/MXNet.rst index 82d880a9..a8187bd2 100644 --- a/Deep_learning/MXNet.rst +++ b/Deep_learning/MXNet.rst @@ -1,7 +1,7 @@ .. _mxnet: ========= -MXNet +MXNet ========= .. image:: MXNet_image1.png @@ -28,11 +28,11 @@ Prerequisites * Install ROCm Libraries :: - + sudo apt install -y rocm-device-libs rocm-libs rocblas hipblas rocrand rocfft * Install ROCm opencl - + :: sudo apt install -y rocm-opencl rocm-opencl-dev @@ -48,16 +48,16 @@ Prerequisites :: sudo apt install -y rocthrust rocprim hipcub - - + + **Install Dependencies to build mxnet for HIP/CUDA** -Install CUDA following the NVIDIA’s `installation guide `_ to setup MXNet with GPU support +Install CUDA following the NVIDIA's `installation guide `_ to setup MXNet with GPU support -.. note:: - * Make sure to add CUDA install path to LD_LIBRARY_PATH +.. note:: + * Make sure to add CUDA install path to LD_LIBRARY_PATH * Example - export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH - + Install the dependencies hipblas, rocrand, hcfft from source. Build the MXNet library @@ -66,9 +66,9 @@ Build the MXNet library **Step 1: Install build tools.** :: $ sudo apt-get update - $ sudo apt-get install -y build-essential - -**Step 2: Install OpenBLAS.** + $ sudo apt-get install -y build-essential + +**Step 2: Install OpenBLAS.** MXNet uses BLAS and LAPACK libraries for accelerated numerical computations on CPU machine. There are several flavors of BLAS/LAPACK libraries - OpenBLAS, ATLAS and MKL. In this step we install OpenBLAS. You can choose to install ATLAS or MKL. :: $ sudo apt-get install -y libopenblas-dev liblapack-dev libomp-dev libatlas-dev libatlas-base-dev @@ -78,9 +78,9 @@ Install OpenCV `_ here. MXNet uses OpenCV for efficient image loading and augmentation operations. :: $ sudo apt-get install -y libopencv-dev - - + + **Step 4: Download MXNet sources and build MXNet core shared library.** :: $ git clone --recursive https://github.com/ROCmSoftwarePlatform/mxnet.git @@ -96,25 +96,25 @@ MXNet uses OpenCV for efficient image loading and augmentation operations. 
**To compile on NVCC PLATFORM(HIP/CUDA):** :: $ export HIP_PLATFORM=nvcc - - + + **Step 6: To enable MIOpen for higher acceleration :** :: - USE_CUDNN=1 - + USE_CUDNN=1 + **Step 7:** **If building on CPU:** :: make -jn(n=number of cores) USE_GPU=0 (For Ubuntu 16.04) make -jn(n=number of cores) CXX=g++-6 USE_GPU=0 (For Ubuntu 18.04) - + **If building on GPU:** :: make -jn(n=number of cores) USE_GPU=1 (For Ubuntu 16.04) - make -jn(n=number of cores) CXX=g++-6 USE_GPU=1 (For Ubuntu 18.04) - + make -jn(n=number of cores) CXX=g++-6 USE_GPU=1 (For Ubuntu 18.04) + On succesfull compilation a library called libmxnet.so is created in mxnet/lib path. @@ -137,7 +137,7 @@ Install the MXNet Python binding **Step 2: Install the MXNet Python binding.** :: $ cd python - $ sudo python setup.py install + $ sudo python setup.py install **Step 3: Execute sample example** :: diff --git a/Deep_learning/caffe.rst b/Deep_learning/caffe.rst index 3f0da7d8..b39a379a 100644 --- a/Deep_learning/caffe.rst +++ b/Deep_learning/caffe.rst @@ -30,38 +30,38 @@ Installing ROCm Debian packages: :: PKG_REPO="http://repo.radeon.com/rocm/apt/debian/" - + wget -qO - $PKG_REPO/rocm.gpg.key | sudo apt-key add - - + sudo sh -c "echo deb [arch=amd64] $PKG_REPO xenial main > /etc/apt/sources.list.d/rocm.list" - + sudo apt-get update - + sudo apt-get install rocm rocm-utils rocm-opencl rocm-opencl-dev rocm-profiler cxlactivitylogger echo 'export PATH=/opt/rocm/bin:$PATH' >> $HOME/.bashrc - + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc source $HOME/.bashrc - + sudo reboot - + Then, verify the installation. Double-check your kernel (at a minimum, you should see "kfd" in the name):: - + uname -r - + In addition, check that you can run the simple HSA vector_copy sample application:: - + cd /opt/rocm/hsa/sample make ./vector_copy - + Pre-requisites Installation ++++++++++++++++++++++++++++ Install Caffe dependencies:: - + sudo apt-get install \ pkg-config \ protobuf-compiler \ @@ -78,24 +78,24 @@ Install Caffe dependencies:: libopencv-dev \ libfftw3-dev \ libelf-dev - + Install the necessary ROCm compute libraries:: - + sudo apt-get install rocm-libs miopen-hip miopengemm hipCaffe Build Steps +++++++++++++++++++++ Clone hipCaffe:: - - git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git + cd hipCaffe - + You may need to modify the Makefile.config file for your own installation. Then, build it:: - + cp ./Makefile.config.example ./Makefile.config - make + make To improve build time, consider invoking parallel make with the "-j$(nproc)" flag. @@ -103,7 +103,7 @@ Unit Testing ------------- Run the following commands to perform unit testing of different components of Caffe. 
-:: +:: make test ./build/test/test_all.testbin @@ -114,7 +114,7 @@ MNIST training ++++++++++++++++ Steps:: - + ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh @@ -123,7 +123,7 @@ CIFAR-10 training ++++++++++++++++++ Steps:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt @@ -142,7 +142,7 @@ Soumith's Convnet benchmarks Steps: :: - + git clone https://github.com/soumith/convnet-benchmarks.git cd convnet-benchmarks/caffe @@ -183,7 +183,7 @@ Sometimes when training with multiple GPUs, we hit this type of error signature: @ 0x8015c3 caffe::Solver<>::Solve() @ 0x71a277 caffe::P2PSync<>::Run() @ 0x42dcbc train() - + See this `comment `_. diff --git a/Deep_learning/hipCaffe .rst b/Deep_learning/hipCaffe .rst index 1d4ae7a8..051b9fb5 100644 --- a/Deep_learning/hipCaffe .rst +++ b/Deep_learning/hipCaffe .rst @@ -4,18 +4,18 @@ hipCaffe Quickstart Guide ########################### -In this quickstart guide, we’ll walk through the steps for ROCm installation. Then, we’ll run a few training and inference experiments and check their accuracy. +In this quickstart guide, we'll walk through the steps for ROCm installation. Then, we'll run a few training and inference experiments and check their accuracy. Install ROCm ------------- -Here are the main ROCm components we’ll be using:: +Here are the main ROCm components we'll be using:: sudo apt-get install rocm sudo apt-get install rocm-libs sudo apt-get install miopen-hip miopengemm - + And some misc packages:: - + sudo apt-get install -y \ g++-multilib \ libunwind-dev \ @@ -28,65 +28,65 @@ And some misc packages:: rpm \ unzip \ bc - + Verify ROCm ------------ Test a simple HIP sample:: - + cp -r /opt/rocm/hip/samples ~/hip-samples && cd ~/hip-samples/0_Intro/square/ - + make - + ./square.hip.out - + Install hipCaffe ---------------- Handle the Caffe dependencies first:: - + sudo apt-get install -y \ pkg-config \ protobuf-compiler \ libprotobuf-dev \ libleveldb-dev \ libsnappy-dev \ - libhdf5-serial-dev \ + libhdf5-serial-dev \ libatlas-base-dev \ libboost-all-dev \ libgflags-dev \ libgoogle-glog-dev \ - liblmdb-dev \ + liblmdb-dev \ python-numpy python-scipy python3-dev python-yaml python-pip \ python-skimage python-opencv python-protobuf \ libopencv-dev \ libfftw3-dev \ libelf-dev - + Note that you might need minor changes to Makefile.config (system dependent):: - + cd ~ - + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + cd hipCaffe - + cp ./Makefile.config.example ./Makefile.config - + make -j$(nproc) - + Workloads ----------- MNIST training +++++++++++++++ -Details on MNIST training can be found at this `link `_. - +Details on MNIST training can be found at this `link `_. + Here are the basic instructions:: ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh - + Expected result: >99% accuracy after 10000 iterations :: @@ -104,7 +104,7 @@ Expected result: >99% accuracy after 10000 iterations I0717 21:06:58.701591 9965 solver.cpp:404] Test net output #0: accuracy = 0.9917 I0717 21:06:58.701642 9965 solver.cpp:404] Test net output #1: loss = 0.0269806 (* 1 = 0.0269806 loss) I0717 21:06:58.701668 9965 solver.cpp:322] Optimization Done. - + CIFAR-10 training ++++++++++++++++++ @@ -112,14 +112,14 @@ CIFAR-10 training Details on CIFAR-10 training can be found at this `link `_. 
Here are the basic instructions:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt - + Expected result: >70% accuracy after 4000 iterations :: - + I0727 18:29:35.248363 33 solver.cpp:279] Solving CIFAR10_quick I0727 18:29:35.248366 33 solver.cpp:280] Learning Rate Policy: fixed I0727 18:29:35.248883 33 solver.cpp:337] Iteration 0, Testing net (#0) @@ -134,7 +134,7 @@ Expected result: >70% accuracy after 4000 iterations I0727 18:30:13.722070 33 solver.cpp:404] Test net output #0: accuracy = 0.7124 I0727 18:30:13.722090 33 solver.cpp:404] Test net output #1: loss = 0.848089 (* 1 = 0.848089 loss) I0727 18:30:13.722095 33 solver.cpp:322] Optimization Done. - + CaffeNet inference +++++++++++++++++++ @@ -142,20 +142,20 @@ CaffeNet inference Details on CaffeNet inference can be found at this `link `_. Here are the basic instructions:: - + ./data/ilsvrc12/get_ilsvrc_aux.sh ./scripts/download_model_binary.py models/bvlc_reference_caffenet ./build/examples/cpp_classification/classification.bin models/bvlc_reference_caffenet/deploy.prototxt models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel data/ilsvrc12/imagenet_mean.binaryproto data/ilsvrc12/synset_words.txt examples/images/cat.jpg - + Expected result: (note the ordering and associated percentages) :: - + ---------- Prediction for examples/images/cat.jpg ---------- 0.3134 - "n02123045 tabby, tabby cat" 0.2380 - "n02123159 tiger cat" 0.1235 - "n02124075 Egyptian cat" 0.1003 - "n02119022 red fox, Vulpes vulpes" 0.0715 - "n02127052 lynx, catamount" - + diff --git a/Doxyfile b/Doxyfile index ea5b10a3..0a743e43 100644 --- a/Doxyfile +++ b/Doxyfile @@ -802,7 +802,7 @@ FILE_PATTERNS = *.h *.cpp # be searched for input files as well. # The default value is: NO. -#YES -> NO for rocblas_handle to come along with Enums +#YES -> NO for rocblas_handle to come along with Enums RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be diff --git a/FAQ/FAQ_HIP.rst b/FAQ/FAQ_HIP.rst index 15bfceb6..a911c63c 100644 --- a/FAQ/FAQ_HIP.rst +++ b/FAQ/FAQ_HIP.rst @@ -31,7 +31,7 @@ Runtime/Driver API features At a high*level, the following features are not supported: -* Textures +* Textures * Dynamic parallelism (CUDA 5.0) * Managed memory (CUDA 6.5) * Graphics interoperability with OpenGL or Direct3D @@ -48,9 +48,9 @@ Kernel language features * Device*side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0) * Virtual functions, indirect functions and try/catch (CUDA 4.0) -* `__prof_trigger` +* `__prof_trigger` * PTX assembly (CUDA 4.0). HCC supports inline GCN assembly. -* Several kernel features are under development. See the `HIP Kernel Language `_ for more information. +* Several kernel features are under development. See the `HIP Kernel Language `_ for more information. These include @@ -66,23 +66,23 @@ Is HIP a drop*in replacement for CUDA? ****************************** No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs. -Most developers will port their code from CUDA to HIP and then maintain the HIP version. +Most developers will port their code from CUDA to HIP and then maintain the HIP version. HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms. What specific version of CUDA does HIP support? 
************************************* -HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can +HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions * this is useful for identifying the specific features required by a given application. However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP: -* CUDA 4.0 and earlier : +* CUDA 4.0 and earlier : * HIP supports CUDA 4.0 except for the limitations described above. -* CUDA 5.0 : - * Dynamic Parallelism (not supported) +* CUDA 5.0 : + * Dynamic Parallelism (not supported) * cuIpc functions (under development). -* CUDA 5.5 : +* CUDA 5.5 : * CUPTI (not directly supported), `AMD GPUPerfAPI `_ can be used as an alternative in some cases) * CUDA 6.0 * Managed memory (under development) @@ -100,15 +100,15 @@ What libraries does HIP support? ***************************** HIP includes growing support for the 4 key math libraries using hcBlas, hcFft, hcrng and hcsparse. -These offer pointer*based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HCC applications. Developers should use conditional compilation if portability to nvcc systems is desired * using calls to cu* routines on one path and hc* routines on the other. +These offer pointer*based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HCC applications. Developers should use conditional compilation if portability to nvcc systems is desired * using calls to cu* routines on one path and hc* routines on the other. * `rocblas `_ * `rocfft `_ * `MIOpen `_ -* hipRAND Under Development - +* hipRAND Under Development + Additionally, some of the cublas routines are automatically converted to hipblas equivalents by the hipify*clang tool. These APIs use cublas or hcblas depending on the platform, and replace the need -to use conditional compilation. +to use conditional compilation. How does HIP compare with OpenCL? ***************************** @@ -137,10 +137,10 @@ HIP and CUDA provide similar math library calls as well. In summary, the HIP ph This reduces the potential for error, and also makes it easy to automate the translation. HIP's goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations. -There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99*based kernel language (rather than C++) and also does not support single*source compilation. +There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99*based kernel language (rather than C++) and also does not support single*source compilation. As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap. -The tools also struggle with more complex CUDA applications, in particular those that use templates, classes, or other C++ features inside the kernel. +The tools also struggle with more complex CUDA applications, in particular those that use templates, classes, or other C++ features inside the kernel. What hardware does HIP support? @@ -152,12 +152,12 @@ What hardware does HIP support? 
Does Hipify automatically convert all source code? ***************************** -Typically, hipify can automatically convert almost all run*time code, and the coordinate indexing device code ( threadIdx.x *> hipThreadIdx_x ). +Typically, hipify can automatically convert almost all run*time code, and the coordinate indexing device code ( threadIdx.x *> hipThreadIdx_x ). -Most device code needs no additional conversion, since HIP and CUDA have similar names for math and built*in functions. +Most device code needs no additional conversion, since HIP and CUDA have similar names for math and built*in functions. The hipify*clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually) -Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support. +Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support. In general, developers should always expect to perform some platform*specific tuning and optimization. @@ -175,8 +175,8 @@ Why use HIP rather than supporting CUDA directly? ***************************** While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented. -Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms. -In addition, HIP defines portable mechanisms to query architectural features, and supports a larger 64*bit wavesize which expands the return type for cross*lane functions like ballot and shuffle from 32*bit ints to 64*bit ints. +Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms. +In addition, HIP defines portable mechanisms to query architectural features, and supports a larger 64*bit wavesize which expands the return type for cross*lane functions like ballot and shuffle from 32*bit ints to 64*bit ints. Can I develop HIP code on an Nvidia CUDA platform? ***************************** @@ -192,7 +192,7 @@ In some cases CUDA has a richer set of modes for some APIs, and some C++ capabil Can I develop HIP code on an AMD HCC platform? ***************************** -Yes. HIP's HCC path only exposes the APIs and functions that work on both NVCC and HCC back ends. "Extra" APIs, parameters and features that appear in HCC but not CUDA will typically cause compile* or run*time errors. Developers must use the HIP API for most accelerator code and bracket any HCC*specific code with preprocessor conditionals. +Yes. HIP's HCC path only exposes the APIs and functions that work on both NVCC and HCC back ends. "Extra" APIs, parameters and features that appear in HCC but not CUDA will typically cause compile* or run*time errors. Developers must use the HIP API for most accelerator code and bracket any HCC*specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HCC supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the hc path. 
diff --git a/GCN_ISA_Manuals/GCN-ISA-Manuals.rst b/GCN_ISA_Manuals/GCN-ISA-Manuals.rst index 55aedd3d..a72a38de 100644 --- a/GCN_ISA_Manuals/GCN-ISA-Manuals.rst +++ b/GCN_ISA_Manuals/GCN-ISA-Manuals.rst @@ -26,16 +26,16 @@ Inline GCN ISA Assembly Guide The Art of AMDGCN Assembly: How to Bend the Machine to Your Will ****************************************************************** -The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a `previous blog `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a `previous blog `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. - * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. 
In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: :: @@ -47,7 +47,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: :: @@ -69,7 +69,7 @@ Formal HSA arguments are passed to a kernel using a special read-only memory seg aql->kernarg_address = args; /* * Write the args directly to the kernargs buffer; - * the code assumes that memory is already allocated for the + * the code assumes that memory is already allocated for the * buffers that in_ptr, index_ptr and out_ptr point to */ args->in = in_ptr; @@ -90,9 +90,9 @@ The host program should also allocate memory for the in, index and out buffers. 
out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -110,7 +110,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru .text .p2align 8 .amdgpu_hsa_kernel hello_world - + hello_world: .amd_kernel_code_t @@ -146,13 +146,13 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru flat_store_dword v[3:4], v1 s_endpgm -Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. +Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. .. image:: initial_state-768x387.png The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. 
But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations:
+The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use the following formulas for all three GCN GPU generations:

::

diff --git a/GCN_ISA_Manuals/PCIe-features.rst b/GCN_ISA_Manuals/PCIe-features.rst
index 09440f69..935e77db 100644
--- a/GCN_ISA_Manuals/PCIe-features.rst
+++ b/GCN_ISA_Manuals/PCIe-features.rst
@@ -17,15 +17,15 @@ The new PCIe AtomicOps operate as completers for CAS(Compare and Swap), FetchADD
Currently, ROCm uses this capability as follows:
-* Update HSA queue’s read_dispatch_id: 64bit atomic add used by the command processor on the GPU agent to update the packet ID it processed.
-* Update HSA queue’s write_dispatch_id: 64bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions.
-* Update HSA Signals – 64bit atomic ops are used for CPU & GPU synchronization.
+* Update HSA queue's read_dispatch_id: 64bit atomic add used by the command processor on the GPU agent to update the packet ID it processed.
+* Update HSA queue's write_dispatch_id: 64bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions.
+* Update HSA Signals - 64bit atomic ops are used for CPU & GPU synchronization.
The PCIe 3.0 AtomicOp feature allows atomic transactions to be requested by, routed through and completed by PCIe components. Routing and completion do not require software support. Component support for each is detectable via the DEVCAP2 register. Upstream bridges need to have AtomicOp routing enabled or the Atomic Operations will fail even though the PCIe endpoint and PCIe I/O devices have the capability for Atomic Operations.
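As a rough illustration of why the platform must route and complete these 64-bit AtomicOps, the following HIP sketch (not part of the specification text, and with illustrative kernel and variable names) has the GPU perform a 64-bit atomic add on host-pinned system memory, the same kind of cross-device atomic used for HSA queue indices and signals; hipHostMalloc, hipHostGetDevicePointer, and device-side atomicAdd are standard HIP calls::

    #include <hip/hip_runtime.h>
    #include <cstdio>

    __global__ void signal_done(unsigned long long* flag)
    {
        if (hipThreadIdx_x == 0 && hipBlockIdx_x == 0) {
            // 64-bit atomic add on memory that physically lives in system RAM;
            // across the bus this relies on a FetchADD-style AtomicOp.
            atomicAdd(flag, 1ULL);
        }
    }

    int main()
    {
        unsigned long long* host_flag = nullptr;
        hipHostMalloc(reinterpret_cast<void**>(&host_flag),
                      sizeof(*host_flag), hipHostMallocMapped);  // pinned, device-visible
        *host_flag = 0;

        unsigned long long* dev_flag = nullptr;
        hipHostGetDevicePointer(reinterpret_cast<void**>(&dev_flag), host_flag, 0);

        hipLaunchKernelGGL(signal_done, dim3(1), dim3(64), 0, 0, dev_flag);
        hipDeviceSynchronize();

        printf("completion flag = %llu\n", *host_flag);
        hipHostFree(host_flag);
        return 0;
    }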
To enable AtomicOp routing between two or more Root Ports, each associated Root Port must indicate that capability via the AtomicOp Routing Supported bit in the Device Capabilities 2 register.
-If your system has a PCIe Express Switch it needs to support AtomicsOp routing. Again AtomicOp requests are permitted only if a component’s DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported.
+If your system has a PCIe switch, it needs to support AtomicOp routing. Again, AtomicOp requests are permitted only if a component's DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 means routing is supported; AtomicOp Routing Support=0 means routing is not supported.
An Atomic Operation is a Non-Posted transaction supporting 32- and 64-bit address formats; there must be a Completion response containing the result of the operation. Errors associated with the operation (an uncorrectable error accessing the target location or carrying out the Atomic operation) are signaled to the requester by setting the Completion Status field in the completion descriptor to Completer Abort (CA) or Unsupported Request (UR).
@@ -51,12 +51,12 @@ Future bus technology with richer I/O Atomics Operation Support
* `GenZ `_
-New PCIe Endpoints with support beyond AMD Ryzen and EPIC CPU; Intel Haswell or newer CPU’s with PCIe Generation 3.0 support.
+New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPUs; Intel Haswell or newer CPUs with PCIe Generation 3.0 support.
* `Mellanox Bluefield SOC `_
* `Cavium Thunder X2 `_
-In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets: 
+In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets:
1. write to another GPU memory
2. then write to system memory to indicate transfer complete
@@ -86,36 +86,36 @@ For GFX9 and Vega10 which have Physical Address up 44 bit and 48 bit Virtual add
* BAR4 register: Optional, not a boot device.
* BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed < 4GB.
-Here is how our BAR works on GFX 8 GPU’s with 40 bit Physical Address Limit
+Here is how our BAR works on GFX8 GPUs with a 40-bit physical address limit
::
11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO Series] (rev c1)
-
+
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35
-
+
Flags: bus master, fast devsel, latency 0, IRQ 119
-
+
Memory at bf40000000 (64-bit, prefetchable) [size=256M]
-
+
Memory at bf50000000 (64-bit, prefetchable) [size=2M]
-
+
I/O ports at 3000 [size=256]
-
+
Memory at c7400000 (32-bit, non-prefetchable) [size=256K]
-
+
Expansion ROM at c7440000 [disabled] [size=128K]
Legend:
-**1** : GPU Frame Buffer BAR – In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs.
+**1** : GPU Frame Buffer BAR - In this example it happens to be 256M, but typically this will be the size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs.
-**2** : Doorbell BAR – The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs.
+**2** : Doorbell BAR - The size of this BAR is typically < 10MB (currently fixed at 2MB) for this generation of GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs.
**3** : IO BAR - This is for legacy VGA and boot device support; since the GPUs in this project are not VGA devices (headless), this is not a concern even if the SBIOS does not set it up.
-**4** : MMIO BAR – This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB.
+**4** : MMIO BAR - This is required for the AMD Driver SW to access the configuration registers. Since the remainder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB.
-**5** : Expansion ROM – This is required for the AMD Driver SW to access the GPU’s video-bios. This is currently fixed at 128KB.
+**5** : Expansion ROM - This is required for the AMD Driver SW to access the GPU's video BIOS. This is currently fixed at 128KB.
===============================================================
Excerpts from Overview of Changes to PCI Express 3.0
@@ -126,20 +126,20 @@ By Mike Jackson, Senior Staff Architect, MindShare, Inc.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-Atomic Operations – Goal:
+Atomic Operations - Goal:
-------------------------
Support SMP-type operations across a PCIe network to allow for things like offloading tasks between CPU cores and accelerators like a GPU. The spec says this enables advanced synchronization mechanisms that are particularly useful with multiple producers or consumers that need to be synchronized in a non-blocking fashion. Three new atomic non-posted requests were added, plus the corresponding completion (the address must be naturally aligned with the operand size or the TLP is malformed):
-* Fetch and Add – uses one operand as the “add” value. Reads the target location, adds the operand, and then writes the result back to the original location.
-* Unconditional Swap – uses one operand as the “swap” value. Reads the target location and then writes the swap value to it.
-* Compare and Swap – uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location.
-* AtomicOpCompletion – new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained.
+* Fetch and Add - uses one operand as the "add" value. Reads the target location, adds the operand, and then writes the result back to the original location.
+* Unconditional Swap - uses one operand as the "swap" value. Reads the target location and then writes the swap value to it.
+* Compare and Swap - uses 2 operands: first data is compare value, second is swap value.
Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location. +* AtomicOpCompletion - new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained. -Since AtomicOps are not locked they don’t have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide “lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic.” The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. +Since AtomicOps are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide "lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic." The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. AtomicOps can go from device to device, device to host, or host to device. Each completer indicates whether it supports this capability and guarantees atomic access if it does. The ability to route AtomicOps is also indicated in the registers for a given port. -ID-based Ordering – Goal: +ID-based Ordering - Goal: ------------------------- Improve performance by avoiding stalls caused by ordering rules. For example, posted writes are never normally allowed to pass each other in a queue, but if they are requested by different functions, we can have some confidence that the requests are not dependent on each other. The previously reserved Attribute bit [2] is now combined with the RO bit to indicate ID ordering with or without relaxed ordering. diff --git a/GCN_ISA_Manuals/caffe.rst b/GCN_ISA_Manuals/caffe.rst index 3eef79e3..070603b6 100644 --- a/GCN_ISA_Manuals/caffe.rst +++ b/GCN_ISA_Manuals/caffe.rst @@ -30,38 +30,38 @@ Installing ROCm Debian packages: :: PKG_REPO="http://repo.radeon.com/rocm/apt/debian/" - + wget -qO - $PKG_REPO/rocm.gpg.key | sudo apt-key add - - + sudo sh -c "echo deb [arch=amd64] $PKG_REPO xenial main > /etc/apt/sources.list.d/rocm.list" - + sudo apt-get update - + sudo apt-get install rocm rocm-utils rocm-opencl rocm-opencl-dev rocm-profiler cxlactivitylogger echo 'export PATH=/opt/rocm/bin:$PATH' >> $HOME/.bashrc - + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc source $HOME/.bashrc - + sudo reboot - + Then, verify the installation. Double-check your kernel (at a minimum, you should see "kfd" in the name):: - + uname -r - + In addition, check that you can run the simple HSA vector_copy sample application:: - + cd /opt/rocm/hsa/sample make ./vector_copy - + Pre-requisites Installation ++++++++++++++++++++++++++++ Install Caffe dependencies:: - + sudo apt-get install \ pkg-config \ protobuf-compiler \ @@ -78,24 +78,24 @@ Install Caffe dependencies:: libopencv-dev \ libfftw3-dev \ libelf-dev - + Install the necessary ROCm compute libraries:: - + sudo apt-get install rocm-libs miopen-hip miopengemm hipCaffe Build Steps +++++++++++++++++++++ Clone hipCaffe:: - - git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git + cd hipCaffe - + You may need to modify the Makefile.config file for your own installation. 
Then, build it:: - + cp ./Makefile.config.example ./Makefile.config - make + make To improve build time, consider invoking parallel make with the "-j$(nproc)" flag. @@ -103,7 +103,7 @@ Unit Testing ------------- Run the following commands to perform unit testing of different components of Caffe. -:: +:: make test ./build/test/test_all.testbin @@ -114,7 +114,7 @@ MNIST training ++++++++++++++++ Steps:: - + ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh @@ -123,7 +123,7 @@ CIFAR-10 training ++++++++++++++++++ Steps:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt @@ -163,7 +163,7 @@ Sometimes when training with multiple GPUs, we hit this type of error signature: @ 0x8015c3 caffe::Solver<>::Solve() @ 0x71a277 caffe::P2PSync<>::Run() @ 0x42dcbc train() - + See this `comment `_. diff --git a/GCN_ISA_Manuals/testdocbook.rst b/GCN_ISA_Manuals/testdocbook.rst index af71ad61..0154b02d 100644 --- a/GCN_ISA_Manuals/testdocbook.rst +++ b/GCN_ISA_Manuals/testdocbook.rst @@ -101,7 +101,7 @@ Summary of kernel instruction changes in Vega GPUs: - New packed 16-bit math instructions. :: - + V_PK_MAD_I16 V_PK_MUL_LO_U16 V_PK_ADD_I16 V_PK_SUB_I16 V_PK_LSHLREV_B16 V_PK_LSHRREV_B16 V_PK_ASHRREV_I16 V_PK_MAX_I16 V_PK_MIN_I16 V_PK_MAD_U16 V_PK_ADD_U16 V_PK_SUB_U16 @@ -159,7 +159,7 @@ The figure below shows a block diagram of the AMD GCN Vega Generation series pro AMD GCN VEGA Generation Series Block Diagram -The GCN device includes a data-parallel processor (DPP) array, a command processor, a memory controller, and other logic (not shown). The GCN command processor reads commands that the host has written to memory-mapped GCN registers in the system-memory address space. The command processor sends hardware-generated interrupts to the host when the command is completed. The GCN memory controller has direct access to all GCN device memory and the host-specified areas of system memory. To satisfy read and write requests, the memory controller performs the functions of a direct-memory access (DMA) controller, including computing memory-address offsets based on the format of the requested data in memory. In the GCN environment, a complete application includes two parts: +The GCN device includes a data-parallel processor (DPP) array, a command processor, a memory controller, and other logic (not shown). The GCN command processor reads commands that the host has written to memory-mapped GCN registers in the system-memory address space. The command processor sends hardware-generated interrupts to the host when the command is completed. The GCN memory controller has direct access to all GCN device memory and the host-specified areas of system memory. To satisfy read and write requests, the memory controller performs the functions of a direct-memory access (DMA) controller, including computing memory-address offsets based on the format of the requested data in memory. In the GCN environment, a complete application includes two parts: - a program running on the host processor, and - programs, called kernels, running on the GCN processor. @@ -175,16 +175,16 @@ The GCN programs are controlled by host commands that - cause the GCN GPU to begin execution of a program. The GCN driver program runs on the host. - + The DPP array is the heart of the GCN processor. 
The array is organized as a set of compute unit pipelines, each independent from the others, that operate in parallel on streams of floating-point or integer data.The compute unit pipelines can process data or, through the memory controller, transfer data to, or from, memory. Computation in a compute unit pipeline can be made conditional. Outputs written to memory can also be made conditional. -When it receives a request, the compute unit pipeline loads instructions and data from memory, begins execution, and continues until the end of the kernel. As kernels are running, the GCN hardware automatically fetches instructions from memory into on-chip caches; GCN software plays no role in this. GCN kernels can load data from off-chip memory into on-chip general-purpose registers (GPRs) and caches. +When it receives a request, the compute unit pipeline loads instructions and data from memory, begins execution, and continues until the end of the kernel. As kernels are running, the GCN hardware automatically fetches instructions from memory into on-chip caches; GCN software plays no role in this. GCN kernels can load data from off-chip memory into on-chip general-purpose registers (GPRs) and caches. The AMD GCN devices can detect floating point exceptions and can generate interrupts. In particular, they detect IEEE floating-point exceptions in hardware; these can be recorded for post-execution analysis. The software interrupts shown in the previous figure from the command processor to the host represent hardware-generated interrupts for signaling command-completion and related management functions. The GCN processor hides memory latency by keeping track of potentially hundreds of work-items in different stages of execution, and by -overlapping compute operations with memory-access operations. +overlapping compute operations with memory-access operations. The figure below shows the dataflow for a GCN application. For general-purpose applications, only one processing block performs all computation. @@ -247,7 +247,7 @@ Terminology | | address, data format, stride, etc. | +-----------------------+----------------------------------------------------+ - **Table : Basic Terms Uses** + **Table : Basic Terms Uses** Program Organization ==================== @@ -701,7 +701,7 @@ SGPR Allocation and storage ~~~~~~~~~~~~~~~~~~~~~~~~~~~ A wavefront can be allocated 16 to 102 SGPRs, in units of 16 GPRs (Dwords). These are logically viewed as SGPRs 0-101. The VCC is -physically stored as part of the wavefront’s SGPRs in the highest numbered two SGPRs (SGPR 106 and 107; the source/destination VCC is an alias for those two SGPRs). When a trap handler is present, 16 additional SGPRs are reserved after VCC to hold the trap addresses, as well as saved-PC and trap-handler temps. These all are privileged (cannot be written to unless privilege is set). Note that if a wavefront allocates 16 SGPRs, 2 SGPRs are normally used as VCC, the remaining 14 are available to the shader. Shader hardware does not prevent use of all 16 SGPRs. +physically stored as part of the wavefront's SGPRs in the highest numbered two SGPRs (SGPR 106 and 107; the source/destination VCC is an alias for those two SGPRs). When a trap handler is present, 16 additional SGPRs are reserved after VCC to hold the trap addresses, as well as saved-PC and trap-handler temps. These all are privileged (cannot be written to unless privilege is set). 
Note that if a wavefront allocates 16 SGPRs, 2 SGPRs are normally used as VCC, the remaining 14 are available to the shader. Shader hardware does not prevent use of all 16 SGPRs. SGPR Alignment ~~~~~~~~~~~~~~ @@ -736,13 +736,13 @@ for: - Local Data Share (LDS) - - Interpolation: holds { 1’b0, new\_prim\_mask[15:1], + - Interpolation: holds { 1'b0, new\_prim\_mask[15:1], parameter\_offset[15:0] } // in bytes - - LDS direct-read offset and data type: { 13’b0, DataType[2:0], + - LDS direct-read offset and data type: { 13'b0, DataType[2:0], LDS\_address[15:0] } // addr in bytes - - LDS addressing for Memory/Vfetch → LDS: {16’h0, lds\_offset[15:0]} + - LDS addressing for Memory/Vfetch -> LDS: {16'h0, lds\_offset[15:0]} // in bytes - Global Data Share (GDS) @@ -791,7 +791,7 @@ The EXEC mask determines which threads execute an instruction. The VCC indicates
-V\_CMP\_\* ⇒ VCC[n] = EXEC[n] & (test passed for thread[n]) +V\_CMP\_\* => VCC[n] = EXEC[n] & (test passed for thread[n]) .. raw:: html @@ -811,7 +811,7 @@ SGPRs that happen to hold VCC). Trap and Exception registers ---------------------------- -Each type of exception can be enabled or disabled independently by setting, or clearing, bits in the TRAPSTS register’s EXCP\_EN field.This section describes the registers which control and report kernel exceptions. +Each type of exception can be enabled or disabled independently by setting, or clearing, bits in the TRAPSTS register's EXCP\_EN field.This section describes the registers which control and report kernel exceptions. All Trap temporary SGPRs (TTMP\*) are privileged for writes - they can be written only when in the trap handler (status.priv = 1). When not privileged, writes to these are ignored. TMA and TBA are read-only; they can be accessed through S\_GETREG\_B32. @@ -829,7 +829,7 @@ PC of the faulting instruction will be: (PC - PC\_rewind\*4). **STATUS . TRAP\_EN** - This bit indicates to the shader whether or not a trap handler is present. When one is not present, traps are not taken, -no matter whether they’re floating point, user-, or host-initiated +no matter whether they're floating point, user-, or host-initiated traps. When the trap handler is present, the wavefront uses an extra 16 SGPRs for trap processing. If trap\_en == 0, all traps and exceptions are ignored, and s\_trap is converted by hardware to NOP. @@ -940,7 +940,7 @@ Memory violations are not reported for instruction or scalar-data accesses. Memory Buffer to LDS does NOT return a memory violation if the LDS address is out of range, but masks off EXEC bits of threads that would go out of range. -When a memory access is in violation, the appropriate memory (LDS or TC) returns MEM\_VIOL to the wave. This is stored in the wave’s +When a memory access is in violation, the appropriate memory (LDS or TC) returns MEM\_VIOL to the wave. This is stored in the wave's TRAPSTS.mem\_viol bit. This bit is sticky, so once set to 1, it remains at 1 until the user clears it. There is a corresponding exception enable bit (EXCP\_EN.mem\_viol). If this bit is set when the memory returns with a violation, the wave jumps to the trap handler. @@ -1235,7 +1235,7 @@ This method compares how many of the 64 threads go down the PASS path instead of The following pseudo-code shows the details of CBRANCH Fork and Join operations. :: - + S_CBRANCH_G_FORK arg0, arg1 // arg1 is an sgpr-pair which holds 64bit (48bit) target address @@ -1270,8 +1270,8 @@ The following pseudo-code shows the details of CBRANCH Fork and Join operations. else CSP -- // this is the 1st time to JOIN: jump to other FORK path {PC, EXEC} = SGPR[CSP*4] // read 128-bits from 4 consecutive SGPRs - - + + Scalar ALU Operations ===================== @@ -1550,7 +1550,7 @@ comparison yielded a TRUE result. | S\_BITCMP1\_{B32,B64 | SOPC | y | Test for "is a bit one". SCC = | | } | | | S0[S1]. | +----------------------+----------+----------+------------------------------------+ - + **Table : Conditional Instructions** Bit-Wise Instructions @@ -1628,7 +1628,7 @@ below, SCC is set if the result is nonzero. | | S\_FLBIT\_I32 | SOP1 | n | | Count how many bits in a row | | | S\_FLBIT\_I32\_I64 | | | (from MSB to LSB) are the same | | | | | as the sign bit. Return -1 if | -| | | | the input is zero or all 1’s | +| | | | the input is zero or all 1's | | | | | (-1). 
32-bit pseudo-code: | | | | | | if (S0 == 0 \|\| S0 == -1) D = | | | | | -1 | @@ -1973,7 +1973,7 @@ bits; codes 0 to 255 can be the scalar source if it is eight bits; codes +-----------+--------------------+-----------------------------------------------+ | 236 | SHARED\_LIMIT | | +-----------+--------------------+-----------------------------------------------+ -| 237 | PRIVATE\_BASE | | +| 237 | PRIVATE\_BASE | | +-----------+--------------------+-----------------------------------------------+ | 238 | PRIVATE\_LIMIT | | +-----------+--------------------+-----------------------------------------------+ @@ -1994,7 +1994,7 @@ bits; codes 0 to 255 can be the scalar source if it is eight bits; codes +-----------+--------------------+-----------------------------------------------+ | 243 | -1.0 | | +-----------+--------------------+-----------------------------------------------+ -| 244 | 2.0 | | +| 244 | 2.0 | | +-----------+--------------------+-----------------------------------------------+ | 245 | -2.0 | | +-----------+--------------------+-----------------------------------------------+ @@ -2200,7 +2200,7 @@ encoding. Table: VALU Instruction Set -| +| | The next table lists the compare instructions. +----------------+----------------+------------------------------+------------------------------+ @@ -2209,7 +2209,7 @@ Table: VALU Instruction Set | V\_CMP | I16, I32, I64, | F, LT, EQ, LE, GT, LG, GE, T | Write VCC.. | | | U16, U32, U64 | | | +----------------+----------------+------------------------------+------------------------------+ -| V\_CMPX | Write VCC and | | | +| V\_CMPX | Write VCC and | | | | | exec. | | | +----------------+----------------+------------------------------+------------------------------+ | V\_CMP | F16, F32, F64 | | F, LT, EQ,LE, GT, LG, GE, | Write VCC. | @@ -2823,7 +2823,7 @@ VGPRs. | | TBUFFER\_STORE\_FORMAT\_{x, | | | xy,xyz,xyzw} | | +-------------------------------+--------------------------------------------+ -| MUBUF Instructions | | +| MUBUF Instructions | | +-------------------------------+--------------------------------------------+ | | BUFFER\_LOAD\_FORMAT\_{x,xy | | Read to, or write from, an untyped | | ,xyz,xyzw} | buffer object. | @@ -3051,14 +3051,14 @@ Dst\_sel comes from the resource, but is ignored for many operations. Table: Buffer Instructions -**Instruction** : The instruction’s dfmt and nfmt fields are used -instead of the resource’s fields. +**Instruction** : The instruction's dfmt and nfmt fields are used +instead of the resource's fields. **Data format derived** : The data format is derived from the opcode and ignores the resource definition. For example, buffer\_load\_ubyte sets the data-format to 8 and number-format to uint. -.. note:: The resource’s data format must not be INVALID; that format has special meaning (unbound resource), and for that case the data format is not replaced by the instruction’s implied data format. +.. note:: The resource's data format must not be INVALID; that format has special meaning (unbound resource), and for that case the data format is not replaced by the instruction's implied data format. **DST\_SEL identity** : Depending on the number of components in the data-format, this is: X000, XY00, XYZ0, or XYZW. @@ -3271,7 +3271,7 @@ Swizzled Buffer Addressing Swizzled addressing rearranges the data in the buffer to help provide improved cache locality for arrays of structures. Swizzled addressing also requires Dword-aligned accesses. 
A single fetch instruction cannot -attempt to fetch a unit larger than const-element-size. The buffer’s +attempt to fetch a unit larger than const-element-size. The buffer's STRIDE must be a multiple of element\_size. :: @@ -3434,7 +3434,7 @@ the following subset of MUBUF instructions. - BUFFER\_LOAD\_{ubyte, sbyte, ushort, sshort, dword, format\_x}. -- It is illegal to set the instruction’s TFE bit for loads to LDS. +- It is illegal to set the instruction's TFE bit for loads to LDS. .. raw:: html @@ -3783,19 +3783,19 @@ image opcodes. | 1 | 1D | x | slice | | | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 1 | 2D | x | y | | | | +| 1 | 2D | x | y | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 2D | x | y | fragid | | | | | MSAA | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | 2D | x | y | slice | | | -| | Array | | | | | | +| 2 | 2D | x | y | slice | | | +| | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | slice | fragid | | | | Array | | | | | | -| | MSAA | | | | | | +| | MSAA | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | 3D | x | y | z | | | +| 2 | 3D | x | y | z | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | Cube | x | y | face\_id | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ @@ -3807,7 +3807,7 @@ image opcodes. +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 2D | x | y | mipid | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | slice | mipid | | +| 3 | 2D | x | y | slice | mipid | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 3D | x | y | z | mipid | | @@ -3847,11 +3847,11 @@ gradients. +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 3D | x | y | z | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | Cube | x | y | face\_id | | | +| 2 | Cube | x | y | face\_id | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | sample\_l | 1 | 1D | x | lod | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | 1D | x | slice | lod | | | +| 2 | 1D | x | slice | lod | | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 2D | x | y | lod | | | @@ -3860,20 +3860,20 @@ gradients. 
| | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | slice | lod | | +| 3 | 2D | x | y | slice | lod | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 3D | x | y | z | lod | | +| 3 | 3D | x | y | z | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | Cube | x | y | face\_id | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | sample\_cl | 1 | 1D | x | clamp | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 1D | x | slice | clamp | | | -| | Array | | | | | | +| | Array | | | | | | | 2 | 2D | x | y | clamp | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | field | clamp | | +| 3 | 2D | x | y | field | clamp | | | | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ @@ -3882,7 +3882,7 @@ gradients. +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 3D | x | y | z | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | Cube | x | y | face\_id | clamp | | +| 3 | Cube | x | y | face\_id | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | gather4 | 1 | 2D | x | y | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ @@ -3893,29 +3893,29 @@ gradients. | 2 | 2D | x | y | slice | | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | Cube | x | y | face\_id | | | +| 2 | Cube | x | y | face\_id | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | gather4\_l | 2 | 2D | x | y | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | field | lod | | +| 3 | 2D | x | y | field | lod | | | | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | slice | lod | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | Cube | x | y | face\_id | lod | | +| 3 | Cube | x | y | face\_id | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | gather4\_cl | 2 | 2D | x | y | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | field | clamp | | -| | interl | | | | | | +| | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | slice | clamp | | -| | Array | | | | | | +| | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | Cube | x | y | face\_id | clamp | | +| 3 | Cube | x | y | face\_id | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ Table: Image Opcodes with Sampler @@ -3954,7 +3954,7 @@ instructions: | | | | for it to used in | | | | | 
LOD computation. | +--------------------+--------------------+--------------------+--------------------+ -| \_CD | Coarse Derivative | Send dx/dv, dx/dy, | | +| \_CD | Coarse Derivative | Send dx/dv, dx/dy, | | | | | etc. slopes to TA | | | | | for it to used in | | | | | LOD computation. | | @@ -4030,7 +4030,7 @@ These are all packed into consecutive VGPRs. - Writes: When writing an image object, it is only possible to write an entire element (all components), not just individual components. The components come from consecutive VGPRs, and the texture system fills - in the value zero for any missing components of the image’s data + in the value zero for any missing components of the image's data format; it ignores any values that are not part of the stored data format. For example, if the DMASK=1001, the shader sends Red from VGPR\_N, and Alpha from VGPR\_N+1, to the texture unit. If the image @@ -4075,7 +4075,7 @@ MIMG instructions. +----------+--------+---------------+---------------------------------------------+ | Bits | Size | Name | Comments | +==========+========+===============+=============================================+ -| **128-bi | | | | +| **128-bi | | | | | t | | | | | Resource | | | | | : | | | | @@ -4102,16 +4102,16 @@ MIMG instructions. +----------+--------+---------------+---------------------------------------------+ | 91:78 | 14 | height | height-1 of mip0 in texels | +----------+--------+---------------+---------------------------------------------+ -| 94:92 | 3 | perf | Scales sampler’s perf\_z, perf\_mip, | +| 94:92 | 3 | perf | Scales sampler's perf\_z, perf\_mip, | | | | modulation | aniso\_bias, lod\_bias\_sec. | +----------+--------+---------------+---------------------------------------------+ | 98:96 | 3 | dst\_sel\_x | 0 = 0, 1 = 1, 4 = R, 5 = G, 6 = B, 7 = A. | +----------+--------+---------------+---------------------------------------------+ -| 101:99 | 3 | dst\_sel\_y | | +| 101:99 | 3 | dst\_sel\_y | | +----------+--------+---------------+---------------------------------------------+ | 104:102 | 3 | dst\_sel\_z | | -+----------+--------+---------------+---------------------------------------------+ -| 107:105 | 3 | dst\_sel\_w | | ++----------+--------+---------------+---------------------------------------------+ +| 107:105 | 3 | dst\_sel\_w | | +----------+--------+---------------+---------------------------------------------+ | 111:108 | 4 | base level | largest mip level in the resource view. For | | | | | msaa, set to zero. | @@ -4129,7 +4129,7 @@ MIMG instructions. | | | | 2d-msaa, 15 = 2d-msaa-array. 1-7 are | | | | | reserved. | +----------+--------+---------------+---------------------------------------------+ -| **256-bi | | | | +| **256-bi | | | | | t | | | | | Resource | | | | | : | | | | @@ -4181,7 +4181,7 @@ MIMG instructions. | 213 | 1 | Compression | enable delta color compression | | | | Enable | | +----------+--------+---------------+---------------------------------------------+ -| 214 | 1 | Alpha is on | Set to 1 if the surface’s component swap is | +| 214 | 1 | Alpha is on | Set to 1 if the surface's component swap is | | | | MSB | not reversed (DCC) | +----------+--------+---------------+---------------------------------------------+ | 215 | 1 | Color | Auto=0, none=1 (DCC) | @@ -4217,7 +4217,7 @@ with every sample instruction. +====================+====================+====================+====================+ | 2:0 | 3 | clamp x | Clamp/wrap mode. 
| +--------------------+--------------------+--------------------+--------------------+ -| 5:3 | 3 | clamp y | | +| 5:3 | 3 | clamp y | | +--------------------+--------------------+--------------------+--------------------+ | 8:6 | 3 | clamp z | | +--------------------+--------------------+--------------------+--------------------+ @@ -4312,11 +4312,11 @@ VGPRs and sent to the texture cache. Any texture or buffer resources and samplers are also sent immediately. However, write-data is not immediately sent to the texture cache. -The shader developer’s responsibility to avoid data hazards associated +The shader developer's responsibility to avoid data hazards associated with VMEM instructions include waiting for VMEM read instruction completion before reading data fetched from the TC (VMCNT). -This is explained in the section: +This is explained in the section: :ref:`Vector Memory Operations` @@ -4516,10 +4516,10 @@ Table: Flat, Global and Scratch Microcode Formats +-------------------------+-------------------------+--------------------------+ | FLAT\_ATOMIC\_DEC | GLOBAL\_ATOMIC\_DEC | none | +-------------------------+-------------------------+--------------------------+ -| The atomic instructions | | | -| above are also | | | -| available in "\_X2" | | | -| versions (64-bit). | | | +| The atomic instructions | | | +| above are also | | | +| available in "\_X2" | | | +| versions (64-bit). | | | +-------------------------+-------------------------+--------------------------+ Table: Flat, Global and Scratch Opcodes @@ -4657,7 +4657,7 @@ The policy for threads with bad addresses is: writes outside this range do not write a value, and reads return zero. Addressing errors from either LDS or TA are returned on their respective -"instruction done" busses as MEM\_VIOL. This sets the wave’s MEM\_VIOL +"instruction done" busses as MEM\_VIOL. This sets the wave's MEM\_VIOL TrapStatus bit and causes an exception (trap) if the corresponding EXCPEN bit is set. @@ -4741,9 +4741,9 @@ memory structure. |fig 10 2| To load data into LDS from global memory, it is read from global memory -and placed into the work-item’s registers; then, a store is performed to +and placed into the work-item's registers; then, a store is performed to LDS. Similarly, to store data into global memory, data is read from LDS -and placed into the workitem’s registers, then placed into global +and placed into the workitem's registers, then placed into global memory. To make effective use of the LDS, an algorithm must perform many operations on what is transferred between global memory and LDS. It also is possible to load data from a memory buffer directly into LDS, @@ -4851,7 +4851,7 @@ number (0 to 32) and the component number (0=x, 1=y, 2=z and 3=w). | | | v\_interp\_p1 as a macro of two instructions. | +-------------+-------------+--------------------------------------------------+ | ( M0 ) | 32 | Use of the M0 register is automatic. M0 must | -| | | contain: { 1’b0, new\_prim\_mask[15:1], | +| | | contain: { 1'b0, new\_prim\_mask[15:1], | | | | lds\_param\_offset[15:0] } | +-------------+-------------+--------------------------------------------------+ @@ -4906,7 +4906,7 @@ The table below lists and briefly describes the LDS instruction fields. | | | ops treat the offset as a 16-bit signed Dword | | | | offset. 
| +-------------+-------------+--------------------------------------------------+ -| OFFSET1 | 8 | | +| OFFSET1 | 8 | | +-------------+-------------+--------------------------------------------------+ | VDST | 8 | VGPR to which result is written: either from | | | | LDS-load or atomic return value. | @@ -5120,7 +5120,7 @@ The export instruction uses the EXP microcode format. | | | | MRT: vsrc0=R, 1=G, | | | | 2=B, 3=A | +-------------------------+-------------------------+--------------------------+ -| VSRC2 | 8 | | +| VSRC2 | 8 | | +-------------------------+-------------------------+--------------------------+ | VSRC1 | 8 | | +-------------------------+-------------------------+--------------------------+ @@ -5188,7 +5188,7 @@ Multiple export instructions can be outstanding at one time. Exports of the same type (for example: position) are completed in order, but exports of different types can be completed out of order. -If the STATUS register’s SKIP\_EXPORT bit is set to one, the hardware +If the STATUS register's SKIP\_EXPORT bit is set to one, the hardware treats all EXPORT instructions as if they were NOPs. Instructions @@ -6132,7 +6132,7 @@ send data from the SIMM16 field and in some cases from EXEC. | | | 2=emit, | | | | | 3=emit-cut | | +------------+------------+------------+-------------------------------------------+ -| GS-done | 3 | | | +| GS-done | 3 | | | +------------+------------+------------+-------------------------------------------+ | save wave | 4 | - | used in context switching | +------------+------------+------------+-------------------------------------------+ @@ -7292,7 +7292,7 @@ The bitfield map for VOPC is: for which the bitfield is: Compare instructions perform the same compare operation on each lane -(workItem or thread) using that lane’s private data, and producing a 1 +(workItem or thread) using that lane's private data, and producing a 1 bit result per lane into VCC or EXEC. 
Instructions in this format may use a 32-bit literal constant which @@ -10918,14 +10918,14 @@ sections that follow provide details | SOP2 | `section\_title <#_so | 32 | | | p2>`__ | | +-----------------------------------------+-----------------------+------------+ -| SOP1 | `section\_title <#_so | | +| SOP1 | `section\_title <#_so | | | | p1>`__ | | +-----------------------------------------+-----------------------+------------+ | SOPK | `section\_title <#_so | | | | pk>`__ | | +-----------------------------------------+-----------------------+------------+ | SOPP | `section\_title <#_so | | -| | pp>`__ | | +| | pp>`__ | | +-----------------------------------------+-----------------------+------------+ | SOPC | `section\_title <#_so | | | | pc>`__ | | @@ -10961,7 +10961,7 @@ sections that follow provide details | SDWA | `section\_title <#_vo | 32 | | | p2>`__ | | +-----------------------------------------+-----------------------+------------+ -| **Vector Parameter Interpolation | | | +| **Vector Parameter Interpolation | | | | Format** | | | +-----------------------------------------+-----------------------+------------+ | VINTRP | `section\_title <#_vi | 32 | @@ -10979,7 +10979,7 @@ sections that follow provide details | MUBUF | `section\_title <#_mu | 64 | | | buf>`__ | | +-----------------------------------------+-----------------------+------------+ -| **Vector Memory Image Format** | | | +| **Vector Memory Image Format** | | | +-----------------------------------------+-----------------------+------------+ | MIMG | `section\_title <#_mi | 64 | | | mg>`__ | | @@ -12389,8 +12389,8 @@ VOP3 format. | Operation | Offset | | +=================+=========+===================================================+ | Sixteen Compare | | | -| Operations | | | -| (OP16) | | | +| Operations | | | +| (OP16) | | | +-----------------+---------+---------------------------------------------------+ | F | 0 | D.u = 0 | +-----------------+---------+---------------------------------------------------+ @@ -13500,7 +13500,7 @@ SDWA | | | the VGPR that are not selected by DST\_SEL: | | | | | 0 = pad with zeros + 1 = sign extend upper / | | | | zero lower | -| | | | 2 = preserve (don’t modify) | +| | | | 2 = preserve (don't modify) | | | | | 3 = reserved | +-----------------+---------+---------------------------------------------------+ | CLMP | [45] | 1 = clamp result | @@ -14192,7 +14192,7 @@ MTBUF | | | read-data. | +-----------------+---------+---------------------------------------------------+ | SRSRC | [52:48] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | SLC | [54] | System level coherent: bypass L2 cache. | @@ -14288,7 +14288,7 @@ MUBUF | | | read-data. | +-----------------+---------+---------------------------------------------------+ | SRSRC | [52:48] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | TFE | [55] | Partially resident texture, texture fail enable. | @@ -14471,7 +14471,7 @@ MIMG | | | VGPRn+1. 
| | | | | For D16 writes, DMASK is only used as a word | | | | count: each bit represents 16 bits of data to | -| | | be written starting at the LSB’s of VADDR, then | +| | | be written starting at the LSB's of VADDR, then | | | | MSBs, then VADDR+1 etc. Bit position is | | | | ignored. | +-----------------+---------+---------------------------------------------------+ @@ -14518,11 +14518,11 @@ MIMG | | | read-data. | +-----------------+---------+---------------------------------------------------+ | SRSRC | [52:48] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | SSAMP | [57:53] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | D16 | [63] | Address offset, unsigned byte. | @@ -14769,7 +14769,7 @@ FLAT | ENCODING | [31:26] | Must be: 110111 | +-----------------+---------+---------------------------------------------------+ | ADDR | [39:32] | | VGPR which holds address or offset. For 64-bit | -| | | addresses, ADDR has the LSB’s and ADDR+1 has | +| | | addresses, ADDR has the LSB's and ADDR+1 has | | | | the MSBs. For offset a single VGPR has a 32 bit | | | | unsigned offset. | | | | | For FLAT\_\*: always specifies an address. | diff --git a/Installation_Guide/FAQ-on-Installation.rst b/Installation_Guide/FAQ-on-Installation.rst index c9055017..a56cfbfa 100644 --- a/Installation_Guide/FAQ-on-Installation.rst +++ b/Installation_Guide/FAQ-on-Installation.rst @@ -10,7 +10,7 @@ Determining if the video card is installed correctly The ROCm software stack has specific requirements regarding the type of GPU supported and how it is installed in the system. The card must be installed in a PCIe slot that supports the 3.0 PCIe specification and the atomics extension. Preferably the slot is x16; x8 an x4 slots will work, but data transfer rates between host memory and GPU memory will be reduced. If the card is not installed in a compatible PCIe slot applications that dispatch a compute kernel will hang waiting for a completion signal from the GPU, which is an atomic operation. -After booting the system with the new driver installed the dmesg output will indicate if there were any problems initializing the GPU. The output of the command ‘sudo dmesg | grep kfd’ will indicate if there were any initialization problems. A properly initialized system will have dmesg output similar to this +After booting the system with the new driver installed the dmesg output will indicate if there were any problems initializing the GPU. The output of the command 'sudo dmesg | grep kfd' will indicate if there were any initialization problems. 
A properly initialized system will have dmesg output similar to this :: dmesg | grep kfd [ 0.000000] Linux version 4.11.0-kfd-compute-roc-master-5051 (jenkins@jenkins-raptor-5) (gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.4) ) #1 SMP Thu Jun 29 21:00:37 CDT 2017 @@ -30,27 +30,27 @@ If the GPU is installed in a PCIe slot that is not supported there will be error Meta package Installation issues, rpm and dpkg *********************************************** -The ROCm repository uses several “meta” packages that provide easy installation for several components of ROCm that do not have natural dependencies. The “meta” packages are empty debian or rpm files that have dependencies on several, unrelated, ROCm components. They are useful in installing or uninstalling the entire ROCm stack with one apt-get or dnf command, and also provide automatic configuration of the /dev/kfd file permissions using the udev service. +The ROCm repository uses several "meta" packages that provide easy installation for several components of ROCm that do not have natural dependencies. The "meta" packages are empty debian or rpm files that have dependencies on several, unrelated, ROCm components. They are useful in installing or uninstalling the entire ROCm stack with one apt-get or dnf command, and also provide automatic configuration of the /dev/kfd file permissions using the udev service. -In some cases users can “break” a ROCm installation by removing one of the “meta” packages using the rpm or dpkg command directly. The rpm and dpkg commands do not resolve dependencies like the dnf and apt-get commands do, and should not be used to remove any ‘meta’ packages, or any other ROCm package. For example, a user can remove the rocm package with the command ‘sudo dpkg –r rocm’ on Ubuntu, but that will not remove any of its dependencies. This is also true for the ‘sudo apt-get remove rocm’ command which will only remove the rocm ‘meta’ package and not its dependencies. To remove a ROCm installation completely, use ‘sudo apt-get autoremove rocm’ for Ubuntu and ‘sudo dnf remove rocm’ for Fedora. +In some cases users can "break" a ROCm installation by removing one of the "meta" packages using the rpm or dpkg command directly. The rpm and dpkg commands do not resolve dependencies like the dnf and apt-get commands do, and should not be used to remove any 'meta' packages, or any other ROCm package. For example, a user can remove the rocm package with the command 'sudo dpkg -r rocm' on Ubuntu, but that will not remove any of its dependencies. This is also true for the 'sudo apt-get remove rocm' command which will only remove the rocm 'meta' package and not its dependencies. To remove a ROCm installation completely, use 'sudo apt-get autoremove rocm' for Ubuntu and 'sudo dnf remove rocm' for Fedora. -The current meta packages are: rocm – Depends on the kernel drivers, firmware and the rocm-dev packages. rocm-dev – Depends on the roct, rocr, rocr extension, hcc and hip packages. rocm-libs – Depends on the hcBLAS, hcFFT, hcRNG, rocBLAS and hipBLAS packages. +The current meta packages are: rocm - Depends on the kernel drivers, firmware and the rocm-dev packages. rocm-dev - Depends on the roct, rocr, rocr extension, hcc and hip packages. rocm-libs - Depends on the hcBLAS, hcFFT, hcRNG, rocBLAS and hipBLAS packages. -If an installation has its ‘meta’ packages removed they can be reinstall using the standard apt-get or dnf command. 
Reinstall the ‘meta’ packages will not reinstall already installed dependencies +If an installation has its 'meta' packages removed, they can be reinstalled using the standard apt-get or dnf command. Reinstalling the 'meta' packages will not reinstall already installed dependencies. Linux Kernels are not uninstalled by default ********************************************** -If ROCm is uninstalled using dnf or apt-get the kernel packages are not uninstalled by default. This is a Linux convention, and isn’t unique the ROCm stack. To remove the kernel packages, they will have to be removed explicitly: +If ROCm is uninstalled using dnf or apt-get the kernel packages are not uninstalled by default. This is a Linux convention, and isn't unique to the ROCm stack. To remove the kernel packages, they will have to be removed explicitly: -For debian – ‘sudo apt-get autoremove ’ For RPM – ‘sudo dnf remove ’ +For debian - 'sudo apt-get autoremove ' For RPM - 'sudo dnf remove ' -The rpm or dpkg command can also be used, but isn’t recommended. +The rpm or dpkg command can also be used, but isn't recommended. Updating firmware may not trigger a rebuilding of ramfs ******************************************************** -If a device isn’t detected by the ROCm kernel drivers, it is possible there is an issue loading required device firmware. This can happen if the system has downlevel firmware or if the firmware is updated, but the ramfs hasn’t been initialized with the new firmware images. To see if this is a problem, check the dmesg of the system: +If a device isn't detected by the ROCm kernel drivers, it is possible there is an issue loading required device firmware. This can happen if the system has downlevel firmware or if the firmware is updated, but the ramfs hasn't been initialized with the new firmware images. To see if this is a problem, check the dmesg of the system: :: dmesg | grep amdgpu [ 4.434129] [drm] amdgpu kernel modesetting enabled. @@ -60,10 +60,10 @@ If a device isn’t detected by the ROCm kernel drivers, it is possible there is [ 4.517733] amdgpu 0000:05:00.0: Fatal error during GPU init [ 4.517757] [drm] amdgpu: finishing device. [ 4.517914] amdgpu: probe of 0000:05:00.0 failed with error -2 - + The error displayed above indicates the kernel is having trouble loading the firmware. -If the firmware version isn’t correct, please install updated firmware packages, which should be available on the repository server. If the correct firmware is installed, reinitialize the ramfs as follows: +If the firmware version isn't correct, please install updated firmware packages, which should be available on the repository server. If the correct firmware is installed, reinitialize the ramfs as follows: **Ubuntu** :: @@ -84,27 +84,27 @@ This problem can occur on Fedora installation if several previous kernels are cu ------------- Disk Requirements: At least 17MB more space needed on the /boot filesystem. - + This is not an issue with the YUM repository; it is caused by the size of the /boot filesystem and the size of the kernels already installed on it. This issue can be fixed by uninstalling previous versions of the rocm Linux kernel: :: sudo dnf remove rocm - rpm -qa | grep kfd | xargs sudo rpm –e + rpm -qa | grep kfd | xargs sudo rpm -e sudo dnf install rocm - + Installing from an archived repository ************************************** The Radeon repo server stores several archived releases, supporting both debian and rpm repositories.
These archives are located here at http://repo.radeon.com/rocm/archive. Users can install with an archive by downloading the desired archive and then updating the package configuration file to point at the localized repo. Debian Archive Example -*********************** +*********************** Here is an Example: :: cd /temp && wget http://repo.radeon.com/rocm/archive/apt_1.6.3.tar.bz2 tar -xvf apt_1.6.3.tar.bz2 - sudo echo “deb [amd64] file://temp/apt_1.6.3 xenial main” > /etc/apt/sources.lists.d/rocm.local.list + sudo echo "deb [amd64] file://temp/apt_1.6.3 xenial main" > /etc/apt/sources.lists.d/rocm.local.list sudo apt-get update && sudo apt-get install rocm Users should make sure that no other list files contain another rocm repo configuration. @@ -119,7 +119,7 @@ Add a /etc/yum.d/rocm.local.repo file with the following contents: :: enabled=1 gpgcheck=0 cd /temp && wget http://repo.radeon.com/rocm/archive/yum_1.6.3.tar.bz2 - tar –xvf yum_1.6.3.tar.bz2 + tar -xvf yum_1.6.3.tar.bz2 Then execute: :: diff --git a/Installation_Guide/HCC-Compiler.rst b/Installation_Guide/HCC-Compiler.rst index 8a350a6c..70336a10 100644 --- a/Installation_Guide/HCC-Compiler.rst +++ b/Installation_Guide/HCC-Compiler.rst @@ -173,4 +173,4 @@ For applications compiled using hcc, ThinLTO could significantly improve link-ti ThinLTO Phase 2 - Under development ************************************** -This ThinLTO implementation which will use llvm-lto LLVM tool to replace clamp-device bash script. It adds an optllc option into ThinLTOGenerator, which will perform in-program opt and codegen in parallel. \ No newline at end of file +This ThinLTO implementation which will use llvm-lto LLVM tool to replace clamp-device bash script. It adds an optllc option into ThinLTOGenerator, which will perform in-program opt and codegen in parallel. diff --git a/Installation_Guide/HIP.rst b/Installation_Guide/HIP.rst index 8b7affc9..1d5fc8ca 100644 --- a/Installation_Guide/HIP.rst +++ b/Installation_Guide/HIP.rst @@ -60,16 +60,16 @@ Programmers familiar with CUDA will also be able to quickly learn and start codi :: hipMalloc(&A_d, Nbytes)); hipMalloc(&C_d, Nbytes)); - + hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice); - + const unsigned blocks = 512; const unsigned threadsPerBlock = 256; hipLaunchKernelGGL(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ - C_d, A_d, N); /* arguments to the compute kernel */ - - hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); + C_d, A_d, N); /* arguments to the compute kernel */ + + hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, atomics, and timer functions. It also specifies additional defines and keywords for function types, address spaces, and optimization controls. (See the HIP Kernel Language for a full description). Here's an example of defining a simple 'vector_square' kernel. @@ -80,7 +80,7 @@ The HIP kernel language defines builtins for determining grid and block coordina { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - + for (size_t i=offset; i`_ that uses hipify to convert a simple app from CUDA to HIP: :: - + cd samples/01_Intro/square # follow README / blog steps to hipify the application. 
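For reference, the HIP fragments shown in the hunks above can be assembled into one small program. The following is a minimal, self-contained sketch built from those fragments, not an addition to the patched file; it assumes the HIP runtime is installed and the source is compiled with hipcc (for example, hipcc square.cpp -o square), and it omits error checking for brevity.

::

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <vector>

    // Grid-stride kernel corresponding to the vector_square fragment above.
    __global__ void vector_square(float* C_d, const float* A_d, size_t N)
    {
        size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
        size_t stride = hipBlockDim_x * hipGridDim_x;
        for (size_t i = offset; i < N; i += stride) {
            C_d[i] = A_d[i] * A_d[i];
        }
    }

    int main()
    {
        const size_t N = 1000000;
        const size_t Nbytes = N * sizeof(float);
        std::vector<float> A_h(N, 2.0f), C_h(N, 0.0f);

        float *A_d = nullptr, *C_d = nullptr;
        hipMalloc(reinterpret_cast<void**>(&A_d), Nbytes);
        hipMalloc(reinterpret_cast<void**>(&C_d), Nbytes);

        hipMemcpy(A_d, A_h.data(), Nbytes, hipMemcpyHostToDevice);

        const unsigned blocks = 512;
        const unsigned threadsPerBlock = 256;
        hipLaunchKernelGGL(vector_square,               /* compute kernel   */
                           dim3(blocks), dim3(threadsPerBlock),
                           0 /* dynamic shared */, 0 /* stream */,
                           C_d, A_d, N);                /* kernel arguments */

        hipMemcpy(C_h.data(), C_d, Nbytes, hipMemcpyDeviceToHost);
        printf("C_h[0] = %f (expected 4.0)\n", C_h[0]);

        hipFree(A_d);
        hipFree(C_d);
        return 0;
    }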
diff --git a/Installation_Guide/Installation-Guide.rst b/Installation_Guide/Installation-Guide.rst index 94dd0bc8..ee782bf1 100644 --- a/Installation_Guide/Installation-Guide.rst +++ b/Installation_Guide/Installation-Guide.rst @@ -7,12 +7,12 @@ AMD ROCm QuickStart Installation Guide v3.3.0 - `Deploying ROCm`_ - `Ubuntu`_ - + - `Centos RHEL v7.7`_ - + - `SLES 15 Service Pack 1`_ - - + + - `ROCm Installation Known Issues and Workarounds`_ @@ -50,7 +50,7 @@ To install from a Debian Repository: sudo apt install libnuma-dev - sudo reboot + sudo reboot 2. Add the ROCm apt repository. @@ -84,7 +84,7 @@ The current rocm.gpg.key is not available in a standard key ring distribution, b :: groups - + 5. To add your user to the video group, use the following command for the sudo password: @@ -115,7 +115,7 @@ Note: To run the ROCm programs more efficiently, add the ROCm binaries in your P :: - echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | + echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | sudo tee -a /etc/profile.d/rocm.sh @@ -151,9 +151,9 @@ You can install the ROCm user-level software without installing the AMD's custom :: - sudo apt update - sudo apt install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + sudo apt update + sudo apt install rocm-dev + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules @@ -177,8 +177,8 @@ Note: The following steps do not apply to the CentOS installation. 2. Enable the following repositories: :: - - sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms + + sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms sudo subscription-manager repos --enable rhel-7-server-optional-rpms sudo subscription-manager repos --enable rhel-7-server-extras-rpms @@ -221,13 +221,13 @@ To install ROCm on your system, follow the instructions below: :: - [ROCm] + [ROCm] name=ROCm - baseurl=http://repo.radeon.com/rocm/yum/rpm + baseurl=http://repo.radeon.com/rocm/yum/rpm enabled=1 gpgcheck=0 -Note: The URL of the repository must point to the location of the repositories’ repodata database. +Note: The URL of the repository must point to the location of the repositories' repodata database. 3. Install ROCm components using the following command: @@ -325,7 +325,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke :: sudo yum install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules **Note**: You can use this command instead of installing rocm-dkms. @@ -336,7 +336,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke SLES 15 Service Pack 1 ^^^^^^^^^^^^^^^^^^^^^^^ -The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. +The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. **Installation** @@ -347,13 +347,13 @@ The following section tells you how to perform an install and uninstall ROCm on sudo SUSEConnect --product PackageHub/15.1/x86_64 sudo zypper install dkms - + 2. Add the ROCm repo. 
- + :: - sudo zypper clean –all - sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm + sudo zypper clean -all + sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm sudo zypper ref zypper install rocm-dkms sudo zypper install rocm-dkms @@ -372,7 +372,7 @@ The following section tells you how to perform an install and uninstall ROCm on 5. Run /opt/rocm/bin/rocminfo and /opt/rocm/opencl/bin/x86_64/clinfo commands to list the GPUs and verify that the ROCm installation is successful. -6. Set permissions. +6. Set permissions. To access the GPU, you must be a user in the video group. Ensure your user account is a member of the video group prior to using ROCm. To identify the groups you are a member of, use the following command: @@ -381,11 +381,11 @@ To access the GPU, you must be a user in the video group. Ensure your user accou groups 7. To add your user to the video group, use the following command for the sudo password: - + :: sudo usermod -a -G video $LOGNAME - + 8. By default, add any future users to the video group. Run the following command to add users to the video group: :: @@ -428,9 +428,9 @@ Some users may want to install a subset of the full ROCm installation. If you ar :: sudo yum install rock-dkms rocm-opencl-devel - -ROCm Installation Known Issues and Workarounds + +ROCm Installation Known Issues and Workarounds ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Closed source components @@ -438,7 +438,7 @@ Closed source components The ROCm platform relies on some closed source components to provide functionalities like HSA image support. These components are only available through the ROCm repositories, and they may be deprecated or become open source components in the future. These components are made available in the following packages: -• hsa-ext-rocr-dev +o hsa-ext-rocr-dev Getting the ROCm Source Code @@ -449,7 +449,7 @@ AMD ROCm is built from open source software. It is, therefore, possible to modif Installing the Repo ^^^^^^^^^^^^^^^^^^^^^ -The repo tool from Google® allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo: +The repo tool from Google(R) allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo: :: @@ -515,37 +515,37 @@ ROCm Support Software - ROCm cmake: ``rocm-cmake`` - rocminfo: ``rocminfo`` - ROCm Bandwidth Test: ``rocm_bandwidth_test`` - - + + ROCm Development ToolChain =========================== - HCC compiler: ``hcc`` - + - HIP: ``hip_base``, ``hip_doc``, ``hip_hcc``, ``hip_samples`` - + - ROCm Device Libraries: ``rocm-device-libs`` - + - ROCm OpenCL: ``rocm-opencl``, ``rocm-opencl-devel`` (on RHEL/CentOS), ``rocm-opencl-dev`` (on Ubuntu) - + - ROCM Clang-OCL Kernel Compiler: ``rocm-clang-ocl`` - + - Asynchronous Task and Memory Interface (ATMI): ``atmi`` - + - ROCm Debug Agent: ``rocm_debug_agent`` - + - ROCm Code Object Manager: ``comgr`` - + - ROC Profiler: ``rocprofiler-dev`` - + - ROC Tracer: ``roctracer-dev`` - + - Radeon Compute Profiler: ``rocm-profiler`` - + ROCm Libraries ============== - + - rocALUTION: ``rocalution`` - rocBLAS: ``rocblas`` - hipBLAS: ``hipblas`` @@ -564,9 +564,9 @@ ROCm Libraries To make it easier to install ROCm, the AMD binary repositories provide a number of meta-packages that will automatically install multiple other packages. For example, ``rocm-dkms`` is the primary meta-package that is used to install most of the base technology needed for ROCm to operate. 
-It will install the ``rock-dkms`` kernel driver, and another meta-package +It will install the ``rock-dkms`` kernel driver, and another meta-package (``rocm-dev``) which installs most of the user-land ROCm core components, support software, and development tools. - + The *rocm-utils* meta-package will install useful utilities that, while not required for ROCm to operate, may still be beneficial to have. Finally, the *rocm-libs* meta-package will install some (but not all) of the libraries that are part of ROCm. @@ -653,7 +653,7 @@ The latest supported version of the drivers, tools, libraries and source code fo - `ROCm OpenCL Runtime`_ - `ROCm LLVM OCL`_ - `ROCm Device Libraries OCL`_ - + - `ROCM Clang-OCL Kernel Compiler`_ - `Asynchronous Task and Memory Interface`_ - `ROCr Debug Agent`_ @@ -716,7 +716,7 @@ ROCm Development ToolChain ============================ -.. _HCC compiler: https://github.com/RadeonOpenCompute/hcc/tree/rocm-3.3.0 +.. _HCC compiler: https://github.com/RadeonOpenCompute/hcc/tree/rocm-3.3.0 .. _HIP: https://github.com/ROCm-Developer-Tools/HIP/tree/rocm-3.3.0 @@ -783,7 +783,7 @@ ROCm Libraries .. _MIVisionX: https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/tree/1.7 -.. _AMDMIGraphX: https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/commit/d1e945dabce0078d44c78de67b00232b856e18bc +.. _AMDMIGraphX: https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/commit/d1e945dabce0078d44c78de67b00232b856e18bc @@ -806,16 +806,16 @@ New features and enhancements in ROCm v3.1 **Change in ROCm Installation Directory Structure** -A fresh installation of the ROCm toolkit installs the packages in the /opt/rocm- folder. +A fresh installation of the ROCm toolkit installs the packages in the /opt/rocm- folder. Previously, ROCm toolkit packages were installed in the /opt/rocm folder. **Reliability, Accessibility, and Serviceability Support for Vega 7nm** -The Reliability, Accessibility, and Serviceability (RAS) support for Vega7nm is now available. +The Reliability, Accessibility, and Serviceability (RAS) support for Vega7nm is now available. **SLURM Support for AMD GPU** -SLURM (Simple Linux Utility for Resource Management) is an open source, fault-tolerant, and highly scalable cluster management and job scheduling system for large and small Linux clusters. +SLURM (Simple Linux Utility for Resource Management) is an open source, fault-tolerant, and highly scalable cluster management and job scheduling system for large and small Linux clusters. New features and enhancements in ROCm v3.0 @@ -836,11 +836,11 @@ The Fast Fourier Transform (FFT) is an efficient algorithm for computing the Dis Other improvements: -• More 2D test coverage sizes. +o More 2D test coverage sizes. -• Fix buffer allocation error for large 1D transforms. +o Fix buffer allocation error for large 1D transforms. -• C++ compatibility improvements. +o C++ compatibility improvements. MemCopy Enhancement for rocProf In the v3.0 release, the rocProf tool is enhanced with an additional capability to dump asynchronous GPU memcopy information into a .csv file. You can use the '-hsa-trace' option to create the results_mcopy.csv file. Future enhancements will include column labels. @@ -856,7 +856,7 @@ In the AMD ROCm release v2.10, support is extended to the General Matrix Multipl Support for SLES 15 SP1 -In the AMD ROCm v2.10 release, support is added for SUSE Linux® Enterprise Server (SLES) 15 SP1. SLES is a modular operating system for both multimodal and traditional IT. 
+In the AMD ROCm v2.10 release, support is added for SUSE Linux(R) Enterprise Server (SLES) 15 SP1. SLES is a modular operating system for both multimodal and traditional IT. Code Marker Support for rocProfiler and rocTracer Libraries @@ -882,7 +882,7 @@ ROCm 2.9 adds support for Singularity container version 2.5.2. Initial release of rocTX -ROCm 2.9 introduces rocTX, which provides a C API for code markup for performance profiling. This initial release of rocTX supports annotation of code ranges and ASCII markers. +ROCm 2.9 introduces rocTX, which provides a C API for code markup for performance profiling. This initial release of rocTX supports annotation of code ranges and ASCII markers. * Added support for Ubuntu 18.04.3 * Ubuntu 18.04.3 is now supported in ROCm 2.9. @@ -986,9 +986,9 @@ Bloat16 software support in rocBLAS/Tensile Added mixed precision bfloat16/IEEE f32 to gemm_ex. The input and output matrices are bfloat16. All arithmetic is in IEEE f32. -AMD Infinity Fabric™ Link enablement +AMD Infinity Fabric(TM) Link enablement -The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in two hives or two Radeon Instinct MI60 or Radeon Instinct MI50 boards in four hives via AMD Infinity Fabric™ Link GPU interconnect technology has been added. +The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in two hives or two Radeon Instinct MI60 or Radeon Instinct MI50 boards in four hives via AMD Infinity Fabric(TM) Link GPU interconnect technology has been added. ROCm-smi features and bug fixes @@ -1008,7 +1008,7 @@ Improvements to *name_get functions RCCL2 Enablement -RCCL2 supports collectives intranode communication using PCIe, Infinity Fabric™, and pinned host memory, as well as internode communication using Ethernet (TCP/IP sockets) and Infiniband/RoCE (Infiniband Verbs). Note: For Infiniband/RoCE, RDMA is not currently supported. +RCCL2 supports collectives intranode communication using PCIe, Infinity Fabric(TM), and pinned host memory, as well as internode communication using Ethernet (TCP/IP sockets) and Infiniband/RoCE (Infiniband Verbs). Note: For Infiniband/RoCE, RDMA is not currently supported. rocFFT enhancements @@ -1055,9 +1055,9 @@ Support overlapping kernel execution in same HIP stream HIP API has been enhanced to allow independent kernels to run in parallel on the same stream. -AMD Infinity Fabric™ Link enablement +AMD Infinity Fabric(TM) Link enablement -The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in one hive via AMD Infinity Fabric™ Link GPU interconnect technology has been added. +The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in one hive via AMD Infinity Fabric(TM) Link GPU interconnect technology has been added. New features and enhancements in ROCm 2.4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1066,9 +1066,9 @@ TensorFlow 2.0 support ROCm 2.4 includes the enhanced compilation toolchain and a set of bug fixes to support TensorFlow 2.0 features natively -AMD Infinity Fabric™ Link enablement +AMD Infinity Fabric(TM) Link enablement -ROCm 2.4 adds support to connect two Radeon Instinct MI60 or Radeon Instinct MI50 boards via AMD Infinity Fabric™ Link GPU interconnect technology. +ROCm 2.4 adds support to connect two Radeon Instinct MI60 or Radeon Instinct MI50 boards via AMD Infinity Fabric(TM) Link GPU interconnect technology. 
New features and enhancements in ROCm 2.3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1134,7 +1134,7 @@ Added support for multi-GPU training New features and enhancements in ROCm 2.1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -RocTracer v1.0 preview release – 'rocprof' HSA runtime tracing and statistics support - +RocTracer v1.0 preview release - 'rocprof' HSA runtime tracing and statistics support - Supports HSA API tracing and HSA asynchronous GPU activity including kernels execution and memory copy Improvements to ROCM-SMI tool - @@ -1181,9 +1181,9 @@ Creates a stream with the specified priority. It creates a stream on which enque OpenCL 2.0 support -ROCm 2.0 introduces full support for kernels written in the OpenCL 2.0 C language on certain devices and systems. Applications can detect this support by calling the “clGetDeviceInfo” query function with “parame_name” argument set to “CL_DEVICE_OPENCL_C_VERSION”. +ROCm 2.0 introduces full support for kernels written in the OpenCL 2.0 C language on certain devices and systems. Applications can detect this support by calling the "clGetDeviceInfo" query function with "parame_name" argument set to "CL_DEVICE_OPENCL_C_VERSION". -In order to make use of OpenCL 2.0 C language features, the application must include the option “-cl-std=CL2.0” in options passed to the runtime API calls responsible for compiling or building device programs. The complete specification for the OpenCL 2.0 C language can be obtained using the following link: https://www.khronos.org/registry/OpenCL/specs/opencl-2.0-openclc.pdf +In order to make use of OpenCL 2.0 C language features, the application must include the option "-cl-std=CL2.0" in options passed to the runtime API calls responsible for compiling or building device programs. The complete specification for the OpenCL 2.0 C language can be obtained using the following link: https://www.khronos.org/registry/OpenCL/specs/opencl-2.0-openclc.pdf Improved Virtual Addressing (48 bit VA) management for Vega 10 and later GPUs @@ -1232,7 +1232,7 @@ Added DPM support to Vega 7nm Dynamic Power Management feature is enabled on Vega 7nm. -Fix for 'ROCm profiling' that used to fail with a “Version mismatch between HSA runtime and libhsa-runtime-tools64.so.1” error +Fix for 'ROCm profiling' that used to fail with a "Version mismatch between HSA runtime and libhsa-runtime-tools64.so.1" error New features and enhancements in ROCm 1.9.0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1293,7 +1293,7 @@ IPC To try ROCm with an upstream kernel, install ROCm as normal, but do not install the rock-dkms package. 
Also add a udev rule to control /dev/kfd permissions: echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' | sudo tee /etc/udev/rules.d/70-kfd.rules - + New features as of ROCm 1.8.3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1353,4 +1353,4 @@ Binary Package support for Fedora 24 is not currently available Dropping binary package support for Ubuntu 14.04, Fedora 23 IPC support - + diff --git a/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst b/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst index f9b7d756..3ca0f0a6 100644 --- a/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst +++ b/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst @@ -5,47 +5,47 @@ List of ROCm Packages for Ubuntu and Fedora ============================================ +-----------------------------------+-----------------------+---------------------------------------------------------+ -|Package | Debian | RPM | +|Package | Debian | RPM | +===================================+=======================+=========================================================+ -|ROCm Master Package | rocm | rocm-1.6.77-Linux.rpm | +|ROCm Master Package | rocm | rocm-1.6.77-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm Developer Master Package | rocm-dev | rocm-dev-1.6.77-Linux.rpm | +|ROCm Developer Master Package | rocm-dev | rocm-dev-1.6.77-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm Libraries Master Package | rocm-libs | rocm-libs-1.6.77-Linux.rpm | +|ROCm Libraries Master Package | rocm-libs | rocm-libs-1.6.77-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ATMI | atmi | atmi-0.3.7-45-gde867f2-Linux.rpm | +|ATMI | atmi | atmi-0.3.7-45-gde867f2-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|HCC | hcc | hcc-1.0.17262-Linux.rpm | +|HCC | hcc | hcc-1.0.17262-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|hcBLAS | hcblas | hcblas-master-482646f-Linux.rpm | +|hcBLAS | hcblas | hcblas-master-482646f-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|hcFFT | hcfft. | hcfft-master-1a96022-Linux.rpm | +|hcFFT | hcfft. | hcfft-master-1a96022-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|hcRNG | hcrng. | hcrng-master-c2ada99-Linux.rpm | +|hcRNG | hcrng. 
| hcrng-master-c2ada99-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|HIP Core | hip_base | hip_base-1.2.17263.rpm | +|HIP Core | hip_base | hip_base-1.2.17263.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ |HIP Documents | hip_doc | hip_doc-1.2.17263.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ |HIP Compiler | hip_hcc | hip_hcc-1.2.17263.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|HIP Samples | hip_samples | hip_samples-1.2.17263.rpm. | +|HIP Samples | hip_samples | hip_samples-1.2.17263.rpm. | +-----------------------------------+-----------------------+---------------------------------------------------------+ |HIPBLAS | hipblas | hipblas-0.4.0.3-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|MIOpen OpenCL Lib | miopen-opencl. | MIOpen-OpenCL-1.0.0-Linux.rpm | +|MIOpen OpenCL Lib | miopen-opencl. | MIOpen-OpenCL-1.0.0-Linux.rpm | ++-----------------------------------+-----------------------+---------------------------------------------------------+ +|rocBLAS | rocblas | rocblas-0.4.2.3-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|rocBLAS | rocblas | rocblas-0.4.2.3-Linux.rpm | -+-----------------------------------+-----------------------+---------------------------------------------------------+ |rocFFT | rocfft | rocm-device-libs-0.0.1-Linux.rpm | -+-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm Device Libs | rocm-device-libs | rocm-device-libs-0.0.1-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm OpenCL for Dev with CL headers| rocm-opencl-dev | rocm-opencl-devel-1.2.0-1424893.x86_64.rpm | +|ROCm Device Libs | rocm-device-libs | rocm-device-libs-0.0.1-Linux.rpm | ++-----------------------------------+-----------------------+---------------------------------------------------------+ +|ROCm OpenCL for Dev with CL headers| rocm-opencl-dev | rocm-opencl-devel-1.2.0-1424893.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm GDB | rocm-gdb | rocm-gdb-1.5.265-gc4fb045.x86_64.rpm | +|ROCm GDB | rocm-gdb | rocm-gdb-1.5.265-gc4fb045.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|RCP profiler | rocm-profiler | rocm-profiler-5.1.6386-gbaddcc9.x86_64.rpm | +|RCP profiler | rocm-profiler | rocm-profiler-5.1.6386-gbaddcc9.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ |ROCm SMI Tool | rocm-smi | rocm-smi-1.0.0_24_g68893bc-1.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ diff --git a/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst b/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst index d31929d3..768b5c8d 100644 --- 
a/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst +++ b/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst @@ -20,15 +20,15 @@ I/O device which support 32-bit, 64-bit and 128-bit operand which target address For ROCm the Platform atomics are used in ROCm in the following ways: - * Update HSA queue’s read_dispatch_id: 64 bit atomic add used by the command processor on the GPU agent to update the packet ID it processed. - * Update HSA queue’s write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions. - * Update HSA Signals – 64bit atomic ops are used for CPU & GPU synchronization. + * Update HSA queue's read_dispatch_id: 64 bit atomic add used by the command processor on the GPU agent to update the packet ID it processed. + * Update HSA queue's write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions. + * Update HSA Signals - 64bit atomic ops are used for CPU & GPU synchronization. The PCIe 3.0 AtomicOp feature allows atomic transactions to be requested by, routed through and completed by PCIe components. Routing and completion does not require software support. Component support for each is detectable via the DEVCAP2 register. Upstream bridges need to have AtomicOp routing enabled or the Atomic Operations will fall even though PCIe endpoint and PCIe I/O Devices has the capability to Atomics Operations. To do AtomicOp routing capability between two or more Root Ports, each associated Root Port must indicate that capability via the AtomicOp Routing Supported bit in the Device Capabilities 2 register. -If your system has a PCIe Express Switch it needs to support AtomicsOp routing. Again AtomicOp requests are permitted only if a component’s DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported. +If your system has a PCIe Express Switch it needs to support AtomicsOp routing. Again AtomicOp requests are permitted only if a component's DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported. Atomic Operation is a Non-Posted transaction supporting 32-bit and 64-bit address formats, there must be a response for Completion containing the result of the operation. Errors associated with the operation (uncorrectable error accessing the target location or carrying out the Atomic operation) are signaled to the requester by setting the Completion Status field in the completion descriptor, they are set to to Completer Abort (CA) or Unsupported Request (UR). @@ -54,15 +54,15 @@ Future bus technology with richer I/O Atomics Operation Support * `GenZ `_ -New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPU’s with PCIe Generation 3.0 support. +New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPU's with PCIe Generation 3.0 support. 
* `Mellanox Bluefield SOC `_ * `Cavium Thunder X2 `_ -In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets: +In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets: | 1. write to another GPU memory, - + | 2. then write to system memory to indicate transfer complete. They are routed off to different ends of the computer but we want to make sure the write to system memory to indicate transfer complete occurs AFTER P2P write to GPU has complete. @@ -76,7 +76,7 @@ On a Xeon E5 based system in the BIOS we can turn on above 4GB PCIe addressing, In SuperMicro system in the system bios you need to see the following * Advanced->PCIe/PCI/PnP configuration-> Above 4G Decoding = Enabled - + * Advanced->PCIe/PCI/PnP Configuration->MMIOH Base = 512G * Advanced->PCIe/PCI/PnP Configuration->MMIO High Size = 256G @@ -90,57 +90,57 @@ For GFX9 and Vega10 which have Physical Address up 44 bit and 48 bit Virtual add * BAR4 register: Optional, not a boot device. * BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed < 4GB. -Here is how our BAR works on GFX 8 GPU’s with 40 bit Physical Address Limit :: +Here is how our BAR works on GFX 8 GPU's with 40 bit Physical Address Limit :: 11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO Series] (rev c1) Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35 - + Flags: bus master, fast devsel, latency 0, IRQ 119 - + Memory at bf40000000 (64-bit, prefetchable) [size=256M] - + Memory at bf50000000 (64-bit, prefetchable) [size=2M] - + I/O ports at 3000 [size=256] - + Memory at c7400000 (32-bit, non-prefetchable) [size=256K] - + Expansion ROM at c7440000 [disabled] [size=128K] Legend: -1 : GPU Frame Buffer BAR – In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs. +1 : GPU Frame Buffer BAR - In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs. -2 : Doorbell BAR – The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs. +2 : Doorbell BAR - The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs. 3 : IO BAR - This is for legacy VGA and boot device support, but since this the GPUs in this project are not VGA devices (headless), this is not a concern even if the SBIOS does not setup. -4 : MMIO BAR – This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB. +4 : MMIO BAR - This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB. 
-5 : Expansion ROM – This is required for the AMD Driver SW to access the GPU’s video-bios. This is currently fixed at 128KB. +5 : Expansion ROM - This is required for the AMD Driver SW to access the GPU's video-bios. This is currently fixed at 128KB. Excepts form Overview of Changes to PCI Express 3.0 =================================================== By Mike Jackson, Senior Staff Architect, MindShare, Inc. ******************************************************** -Atomic Operations – Goal: +Atomic Operations - Goal: ************************* Support SMP-type operations across a PCIe network to allow for things like offloading tasks between CPU cores and accelerators like a GPU. The spec says this enables advanced synchronization mechanisms that are particularly useful with multiple producers or consumers that need to be synchronized in a non-blocking fashion. Three new atomic non-posted requests were added, plus the corresponding completion (the address must be naturally aligned with the operand size or the TLP is malformed): - * Fetch and Add – uses one operand as the “add” value. Reads the target location, adds the operand, and then writes the result back to the original location. + * Fetch and Add - uses one operand as the "add" value. Reads the target location, adds the operand, and then writes the result back to the original location. - * Unconditional Swap – uses one operand as the “swap” value. Reads the target location and then writes the swap value to it. + * Unconditional Swap - uses one operand as the "swap" value. Reads the target location and then writes the swap value to it. - * Compare and Swap – uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location. + * Compare and Swap - uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location. - * AtomicOpCompletion – new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained. + * AtomicOpCompletion - new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained. -Since AtomicOps are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide “lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic.” The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. +Since AtomicOps are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide "lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic." The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. AtomicOps can go from device to device, device to host, or host to device. Each completer indicates whether it supports this capability and guarantees atomic access if it does. The ability to route AtomicOps is also indicated in the registers for a given port. -ID-based Ordering – Goal: +ID-based Ordering - Goal: ************************* Improve performance by avoiding stalls caused by ordering rules. 
For example, posted writes are never normally allowed to pass each other in a queue, but if they are requested by different functions, we can have some confidence that the requests are not dependent on each other. The previously reserved Attribute bit [2] is now combined with the RO bit to indicate ID ordering with or without relaxed ordering. diff --git a/Installation_Guide/Quick Start Installation Guide.rst b/Installation_Guide/Quick Start Installation Guide.rst index de5109eb..7763e9dc 100644 --- a/Installation_Guide/Quick Start Installation Guide.rst +++ b/Installation_Guide/Quick Start Installation Guide.rst @@ -12,7 +12,7 @@ AMD ROCm QuickStart Installation Guide v3.1.0 - `SLES 15 Service Pack 1`_ - `ROCm Installation Known Issues and Workarounds`_ - + - `Getting the ROCm Source Code`_ | @@ -53,7 +53,7 @@ To install from a Debian Repository: sudo apt install libnuma-dev - sudo reboot + sudo reboot 2. Add the ROCm apt repository. @@ -87,7 +87,7 @@ The current rocm.gpg.key is not available in a standard key ring distribution, b :: groups - + 5. To add your user to the video group, use the following command for the sudo password: @@ -99,7 +99,7 @@ The current rocm.gpg.key is not available in a standard key ring distribution, b :: - echo 'ADD_EXTRA_GROUPS=1' + echo 'ADD_EXTRA_GROUPS=1' sudo tee -a /etc/adduser.conf echo 'EXTRA_GROUPS=video' @@ -122,7 +122,7 @@ Note: To run the ROCm programs more efficiently, add the ROCm binaries in your P :: - echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | + echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | sudo tee -a /etc/profile.d/rocm.sh @@ -158,9 +158,9 @@ You can install the ROCm user-level software without installing the AMD's custom :: - sudo apt update - sudo apt install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + sudo apt update + sudo apt install rocm-dev + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules @@ -186,8 +186,8 @@ Note: The following steps do not apply to the CentOS installation. 2. Enable the following repositories: :: - - sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms + + sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms sudo subscription-manager repos --enable rhel-7-server-optional-rpms sudo subscription-manager repos --enable rhel-7-server-extras-rpms @@ -230,13 +230,13 @@ To install ROCm on your system, follow the instructions below: :: - [ROCm] + [ROCm] name=ROCm - baseurl=http://repo.radeon.com/rocm/yum/rpm + baseurl=http://repo.radeon.com/rocm/yum/rpm enabled=1 gpgcheck=0 -Note: The URL of the repository must point to the location of the repositories’ repodata database. +Note: The URL of the repository must point to the location of the repositories' repodata database. 3. Install ROCm components using the following command: @@ -336,7 +336,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke :: sudo yum install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules **Note**: You can use this command instead of installing rocm-dkms. 
@@ -347,7 +347,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke SLES 15 Service Pack 1 ^^^^^^^^^^^^^^^^^^^^^^^ -The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. +The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. **Installation** @@ -358,13 +358,13 @@ The following section tells you how to perform an install and uninstall ROCm on sudo SUSEConnect --product PackageHub/15.1/x86_64 sudo zypper install dkms - + 2. Add the ROCm repo. - + :: - sudo zypper clean –all - sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm + sudo zypper clean -all + sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm sudo zypper ref zypper install rocm-dkms sudo zypper install rocm-dkms @@ -383,7 +383,7 @@ The following section tells you how to perform an install and uninstall ROCm on 5. Run /opt/rocm/bin/rocminfo and /opt/rocm/opencl/bin/x86_64/clinfo commands to list the GPUs and verify that the ROCm installation is successful. -6. Set permissions. +6. Set permissions. To access the GPU, you must be a user in the video group. Ensure your user account is a member of the video group prior to using ROCm. To identify the groups you are a member of, use the following command: @@ -392,11 +392,11 @@ To access the GPU, you must be a user in the video group. Ensure your user accou groups 7. To add your user to the video group, use the following command for the sudo password: - + :: sudo usermod -a -G video $LOGNAME - + 8. By default, add any future users to the video group. Run the following command to add users to the video group: :: @@ -414,7 +414,7 @@ To access the GPU, you must be a user in the video group. Ensure your user accou /opt/rocm/opencl/bin/x86_64/clinfo Note: To run the ROCm programs more efficiently, add the ROCm binaries in your PATH. -echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | +echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | :: @@ -439,9 +439,9 @@ Some users may want to install a subset of the full ROCm installation. If you ar :: sudo yum install rock-dkms rocm-opencl-devel - -ROCm Installation Known Issues and Workarounds + +ROCm Installation Known Issues and Workarounds ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Closed source components @@ -449,7 +449,7 @@ Closed source components The ROCm platform relies on some closed source components to provide functionalities like HSA image support. These components are only available through the ROCm repositories, and they may be deprecated or become open source components in the future. These components are made available in the following packages: -• hsa-ext-rocr-dev +o hsa-ext-rocr-dev Getting the ROCm Source Code @@ -460,7 +460,7 @@ AMD ROCm is built from open source software. It is, therefore, possible to modif Installing the Repo ^^^^^^^^^^^^^^^^^^^^^ -The repo tool from Google® allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo: +The repo tool from Google(R) allows you to manage multiple git repositories simultaneously. 
Run the following commands to install the repo: :: diff --git a/Installation_Guide/QuickStartGuideOpenCL.rst b/Installation_Guide/QuickStartGuideOpenCL.rst index 45c77f25..3a8390be 100644 --- a/Installation_Guide/QuickStartGuideOpenCL.rst +++ b/Installation_Guide/QuickStartGuideOpenCL.rst @@ -3,14 +3,14 @@ Quick Start Guide For OpenCL ============================ -* ROCm 1.7 introduces big updates to our OpenCL compiler and runtime implementation -- built on top of the ROCm software stack! +* ROCm 1.7 introduces big updates to our OpenCL compiler and runtime implementation -- built on top of the ROCm software stack! This developer release includes the following: ------------------------------ * OpenCL 2.0 compatible kernel language support with OpenCL 1.2 compatible runtime -* OpenCL compiler also has assembler and disassembler support, inline assembly support is now in place. -* Big improvements in the base compiler as we roll in new optimization for application in new Native LLVM code generator. +* OpenCL compiler also has assembler and disassembler support, and inline assembly support is now in place. +* Big improvements in the base compiler as we roll in new optimizations for applications in the new native LLVM code generator. * We made our base compiler intrinsics source code available * OCML https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/doc/OCML.md * Source code for the Intrinsic https://github.com/RadeonOpenCompute/ROCm-Device-Libs/tree/master/opencl/src @@ -29,7 +29,7 @@ Install the ROCm OpenCL implementation (assuming you already have the 'rocm' pac For a sample OpenCL application, let's use a simple vector-add example from the University of Bristol's very nice "Hands On OpenCL" lectures. -.. code-block:: +.. code-block:: git clone https://github.com/HandsOnOpenCL/Exercises-Solutions.git @@ -43,15 +43,15 @@ For a sample OpenCL application, let's use a simple vector-add example from the ./vadd -Not for all your application that supported the AMDGPU SDK for OpenCL to get the Header, rocm-opencl-dev now included the headerfiles. +Note: for applications that previously used the AMD APP SDK for OpenCL to get the headers, the rocm-opencl-dev package now includes the header files. -If your built all your code with the AMDAPPSDK you do not need to download anything else, you can just export environment variable to /opt/rocm/opencl +If you built all your code with the AMDAPPSDK, you do not need to download anything else; just export the environment variable to point at /opt/rocm/opencl -Do not install the AMDAPPSDK 3.0 on ROCm OpenCL it designed for old driver which need headers installed. rocm-opencl-dev package does this for you. +Do not install the AMDAPPSDK 3.0 on ROCm OpenCL; it is designed for the old driver, which needs headers to be installed separately. The rocm-opencl-dev package provides these headers for you. Example 1 for AMDAPPSDKROOT :: - export AMDAPPSDKROOT=/opt/rocm/opencl + export AMDAPPSDKROOT=/opt/rocm/opencl Example 2 for AMDAPPSDK @@ -61,10 +61,10 @@ Example 2 for AMDAPPSDK Where is clinfo? :: - /opt/rocm/opencl/bin/x86_64/clinfo + /opt/rocm/opencl/bin/x86_64/clinfo -* That's it! Super easy. +* That's it! Super easy.
Related Resources ----------------- diff --git a/Installation_Guide/ROC-smi.rst b/Installation_Guide/ROC-smi.rst index d4a98db1..7949f456 100644 --- a/Installation_Guide/ROC-smi.rst +++ b/Installation_Guide/ROC-smi.rst @@ -26,16 +26,16 @@ For convenience purposes, following is a quick excerpt: [--setsclk LEVEL [LEVEL ...]] [--setmclk LEVEL [LEVEL ...]] [--setfan LEVEL] [--setperflevel LEVEL] [--setoverdrive %] [--setprofile # # # # #] [--resetprofile] [--load FILE | --save FILE] [--autorespond RESPONSE] - + AMD ROCm System Management Interface - + optional arguments: -h, --help show this help message and exit --load FILE Load Clock, Fan, Performance and Profile settings from FILE --save FILE Save Clock, Fan, Performance and Profile settings to FILE - - -d DEVICE, --device DEVICE Execute command on specified device - + + -d DEVICE, --device DEVICE Execute command on specified device + -i, --showid Show GPU ID -t, --showtemp Show current temperature -c, --showclocks Show current clock frequencies @@ -46,8 +46,8 @@ For convenience purposes, following is a quick excerpt: -o, --showoverdrive Show current OverDrive level -l, --showprofile Show Compute Profile attributes -s, --showclkfrq Show supported GPU and Memory Clock - -a, --showallinfo Show all SMI-supported values values - + -a, --showallinfo Show all SMI-supported values values + -r, --resetclocks Reset clocks to default (auto) --setsclk LEVEL [LEVEL ...] Set GPU Clock Frequency Level Mask (manual) --setmclk LEVEL [LEVEL ...] Set GPU Memory Clock Frequency Mask (manual) @@ -55,8 +55,8 @@ For convenience purposes, following is a quick excerpt: --setperflevel LEVEL Set PowerPlay Performance Level --setoverdrive % Set GPU OverDrive level (manual|high) --setprofile # # # # # Specify Compute Profile attributes (auto) - --resetprofile Reset Compute Profile - + --resetprofile Reset Compute Profile + --autorespond RESPONSE Response to automatically provide for all prompts (NOT RECOMMENDED) @@ -69,13 +69,13 @@ Detailed Option Descriptions The clock levels will change dynamically based on GPU load based on the default Compute and Graphics profiles. The thresholds and delays for a custom mask cannot be controlled through the SMI tool - + This flag automatically sets the Performance Level to "manual" as the mask is not applied when the Performance level is set to auto --setfan LEVEL: This sets the fan speed to a value ranging from 0 to 255 (not from 0-100%). 
-::
+::
 NOTE: While the hardware is usually capable of overriding this value when required, it is recommended not to set the fan level lower than the default value for extended periods of time
@@ -87,18 +87,18 @@ Detailed Option Descriptions
 ::
 NOTES: This option can be used in conjunction with the --setsclk mask
-
+
 Operating the GPU outside of specifications can cause irreparable damage to your hardware
 Please observe the warning displayed when using this option
-
+
 This flag automatically sets the sclk to the highest level, as only the highest level is increased by the OverDrive value
-
+
 --setprofile # # # # #: The Compute Profile accepts 5 parameters, which are (in order):
 Minimum SCLK - Minimum GPU clock speed in MHz
 Minimum MCLK - Minimum GPU Memory clock speed in MHz
 Activity threshold - Workload required before clock levels change (%)
 Hysteresis Up - Delay before clock level is increased in milliseconds
 Hysteresis Down - Delay before clock level is decreased in milliseconds
 ::
 NOTES: When a compute queue is detected, these values will be automatically applied to the system
-
+
 Compute Power Profiles are only applied when the Performance Level is set to "auto" so using this flag will automatically set the performance level to "auto"
@@ -115,7 +115,7 @@ Any new functionality added to the SMI should have a corresponding test added to
 GitHub
 ********
-For more information please refer `Github link `_.
+For more information, please refer to the `Github link `_.
 Disclaimer
 *************
diff --git a/Installation_Guide/ROCK-Kernel-Driver_readme.rst b/Installation_Guide/ROCK-Kernel-Driver_readme.rst
index ec80ede1..3b004d7f 100644
--- a/Installation_Guide/ROCK-Kernel-Driver_readme.rst
+++ b/Installation_Guide/ROCK-Kernel-Driver_readme.rst
@@ -61,7 +61,7 @@ LICENSE
 #########
 The following lists the different licenses that apply to the different components in this repository:
-
+
 | The Linux kernel images are covered by the modified GPL license in COPYING
 | The firmware image is covered by the license in LICENSE.ucode
diff --git a/Installation_Guide/ROCR-Runtime.rst b/Installation_Guide/ROCR-Runtime.rst
index 9427b56a..ffe04e6b 100644
--- a/Installation_Guide/ROCR-Runtime.rst
+++ b/Installation_Guide/ROCR-Runtime.rst
@@ -13,7 +13,7 @@ Initial target platform requirements
 * CPU: Intel Haswell or newer, Core i5, Core i7, Xeon E3 v4 & v5; Xeon E5 v3
 * GPU: Fiji ASIC (AMD R9 Nano, R9 Fury and R9 Fury X)
 * GPU: Polaris ASIC (AMD RX480)
-
+
 Source code
 **************
 The HSA core runtime source code for the ROCR runtime is located in the src subdirectory. Please consult the associated README.md file for contents and build instructions.
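The ROCR runtime implements the HSA core API that the rest of the ROCm stack builds on. As a rough illustration of what a host program linked against it looks like, the sketch below initializes the runtime and lists the agents (CPUs and GPUs) it discovers. It is a hedged example written for this guide, not an excerpt from the ROCR sources, and the include path and -lhsa-runtime64 link flag in the comment are assumptions about a typical install. ::

    /* Minimal HSA runtime sketch -- build with something like:
       gcc list_agents.c -I/opt/rocm/include -L/opt/rocm/lib -lhsa-runtime64 */
    #include <stdio.h>
    #include <hsa/hsa.h>

    /* Called once per agent (CPU or GPU) discovered by the runtime. */
    static hsa_status_t print_agent(hsa_agent_t agent, void *data) {
        (void)data;
        char name[64] = {0};
        hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
        printf("Found HSA agent: %s\n", name);
        return HSA_STATUS_SUCCESS;
    }

    int main(void) {
        if (hsa_init() != HSA_STATUS_SUCCESS) {
            fprintf(stderr, "hsa_init failed -- is the ROCk kernel driver loaded?\n");
            return 1;
        }
        hsa_iterate_agents(print_agent, NULL);
        hsa_shut_down();
        return 0;
    }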
diff --git a/Installation_Guide/ROCk-kernel.rst b/Installation_Guide/ROCk-kernel.rst index b7d986c0..0a3bf4a9 100644 --- a/Installation_Guide/ROCk-kernel.rst +++ b/Installation_Guide/ROCk-kernel.rst @@ -6,11 +6,11 @@ ROCk-Kernel The following is a sequence of commands to Install ROCk-Kernel into the system: -**# OPTIONAL :** +**# OPTIONAL :** upgrade your base kernel to 4.13.0-32-generic, **reboot required** :: sudo apt update && sudo apt install linux-headers-4.13.0-32-generic linux-image-4.13.0-32-generic linux-image-extra-4.13.0-32-generic linux-signed-image-4.13.0-32-generic - sudo reboot + sudo reboot Installation steps: ################### @@ -27,7 +27,7 @@ Install the ROCm compute firmware and rock-dkms kernel modules, **reboot require :: sudo adduser $LOGNAME video -Make sure to reboot the machine after installing the ROCm kernel package to force the new kernel to load on reboot. +Make sure to reboot the machine after installing the ROCm kernel package to force the new kernel to load on reboot. You can verify the ROCm kernel is loaded by typing the following command at a prompt: :: @@ -38,5 +38,5 @@ Printed on the screen should be similar as follows: amdkfd 270336 4 amd_iommu_v2 20480 1 amdkfd amdkcl 24576 3 amdttm,amdgpu,amdkfd - - + + diff --git a/Installation_Guide/atmi.rst b/Installation_Guide/atmi.rst index 8fb8d621..bed873bc 100644 --- a/Installation_Guide/atmi.rst +++ b/Installation_Guide/atmi.rst @@ -1,12 +1,12 @@ .. _Asynch: ===== -ATMI +ATMI ===== ATMI (Asynchronous Task and Memory Interface) Asynchronous Task and Memory Interface, or ATMI, is a runtime framework and declarative programming model for heterogeneous CPU-GPU systems. It provides a consistent API to create task graphs on CPUs and GPUs (integrated and discrete). ATMI is a declarative programming model, where high-level tasks can be simply described by using a few predefined C-style structures. The task description includes specifying its granularity, dependencies to other tasks, data requirements and so on. The ATMI runtime, based on the task graph, will perform task scheduling and memory management that is optimal for the underlying platform. ATMI provides a rich and flexible user interface so that the end user can relinquish scheduling to the runtime (default behavior) or take full control of scheduling and mapping, if desired. The target audience for ATMI is application programmers or middleware developers for high-level languages. -Compilation and Runtime Workflow +Compilation and Runtime Workflow ************************************ The below figure depicts the ATMI runtime workflow with CLOC as the compiler utility. @@ -39,12 +39,12 @@ ATMI v0.3 * Devices supported: AMD Carrizo and Kaveri APUs, and AMD Fiji dGPU * Runtimes used: ROCm v1.2 -License +License ********* MIT License -Copyright © 2016 Advanced Micro Devices, Inc. +Copyright (C) 2016 Advanced Micro Devices, Inc. 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
@@ -52,4 +52,4 @@ The above copyright notice and this permission notice shall be included in all c
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-Link to Github Repository `ATMI `_
+Link to Github Repository `ATMI `_
diff --git a/Other_Solutions/Other-Solutions.rst b/Other_Solutions/Other-Solutions.rst
index 89b2611b..d356c021 100644
--- a/Other_Solutions/Other-Solutions.rst
+++ b/Other_Solutions/Other-Solutions.rst
@@ -5,7 +5,7 @@ System Level Debug
 =====================
-ROCm Language & System Level Debug, Flags and Environment Variables
+ROCm Language & System Level Debug, Flags and Environment Variables
 #####################################################################
 | Kernel options to avoid Ethernet port getting renamed every time you change graphics cards
@@ -15,32 +15,32 @@ ROCr Error Code
 ******************
 * 2 Invalid Dimension
-* 4 Invalid Group Memory
-* 8 Invalid (or Null) Code
+* 4 Invalid Group Memory
+* 8 Invalid (or Null) Code
 * 32 Invalid Format
-* 64 Group is too large
-* 128 Out of VGPR’s
-* 0x80000000 Debug Trap
+* 64 Group is too large
+* 128 Out of VGPR's
+* 0x80000000 Debug Trap
-Command to dump firmware version and get Linux Kernel version
+Command to dump firmware version and get Linux Kernel version
 *****************************************************************
-* sudo cat /sys/kernel/debug/dri/1/amdgpu_firmware_info
-* uname -a
+* sudo cat /sys/kernel/debug/dri/1/amdgpu_firmware_info
+* uname -a
-Debug Flags
+Debug Flags
 ***************
 Debug messages are available when developing/debugging the base ROCm driver. You can enable printing from libhsakmt.so by setting the HSAKMT_DEBUG_LEVEL environment variable. Available debug levels are 3~7. The higher the level you set, the more messages will print.
 * export HSAKMT_DEBUG_LEVEL=3 : only pr_err() will print.
 * export HSAKMT_DEBUG_LEVEL=4 : pr_err() and pr_warn() will print.
-* export HSAKMT_DEBUG_LEVEL=5 : We currently don’t implement “notice”. Setting to 5 is same as setting to 4.
+* export HSAKMT_DEBUG_LEVEL=5 : We currently don't implement "notice". Setting it to 5 is the same as setting it to 4.
 * export HSAKMT_DEBUG_LEVEL=6 : pr_err(), pr_warn(), and pr_info will print.
 * export HSAKMT_DEBUG_LEVEL=7 : Everything including pr_debug will print.
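For example, to capture the most verbose libhsakmt output while reproducing a problem, you might run something like the following; the application name is only a placeholder for whatever you are debugging: ::

    export HSAKMT_DEBUG_LEVEL=7
    ./my_hip_app        # placeholder application; libhsakmt debug messages appear on the console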
-ROCr level env variable for debug
+ROCr level env variable for debug
 ************************************
 * HSA_ENABLE_SDMA=0
@@ -51,9 +51,9 @@ ROCr level env variable for debug
 Turn Off Page Retry on GFX9/Vega devices
 **********************
- * sudo –s
+ * sudo -s
 * echo 1 > /sys/module/amdkfd/parameters/noretry
-
+
 HCC Debug Environment Variables
@@ -165,7 +165,7 @@ PCIe-Debug
 Refer here for :ref:`PCIe-Debug`
-**There’s some more information here on how to debug and profile HIP applications**
+**There's some more information here on how to debug and profile HIP applications**
 * `HIP-Debugging `_
 * `HIP-Profiling `_
diff --git a/Other_Solutions/PCIe-Debug.rst b/Other_Solutions/PCIe-Debug.rst
index c1470cc3..2b8c9469 100644
--- a/Other_Solutions/PCIe-Debug.rst
+++ b/Other_Solutions/PCIe-Debug.rst
@@ -3,7 +3,7 @@ ROCm PCIe Debug
 =================
-lspci helpfull options to help you debug ROCm install issue
+lspci helpful options to help you debug ROCm installation issues
 **************************************************************
 **To find out if the Linux kernel is seeing your GPU and to get the slot number and part number of the device you want to look at**
@@ -17,7 +17,7 @@ lspci helpfull options to help you debug ROCm install issue
 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860
-**Show Device Slot**
+**Show Device Slot**
 lspci -s _slot number_
@@ -36,7 +36,7 @@ Example
 ::
 ~$ sudo lspci -vs 63:00.0
- [sudo] password for rocm:
+ [sudo] password for rocm:
 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860 (prog-if 00 [VGA controller])
 Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c35
 Flags: bus master, fast devsel, latency 0, IRQ 412
@@ -62,7 +62,7 @@ Example
 Kernel modules: amdgpu
-**Display Vendor and Device Codes and numbers**
+**Display Vendor and Device Codes and numbers**
 lspci -nvmms _slot number_
@@ -74,11 +74,11 @@ lspci -nvmms _slot number_
 Vendor: 1002
 Device: 6860
 SVendor: 1002
- SDevice: 0c35
+ SDevice: 0c35
+
+
+**To show kernel module running on device**
-
-**To show kernel module running on device**
-
 lspci -ks _slot number_
 ::
@@ -89,11 +89,11 @@ lspci -nvmms _slot number_
 Kernel driver in use: amdgpu
 Kernel modules: amdgpu
-**When you need more information on the device**
+**When you need more information on the device**
 sudo lspci -vvvs _slot number_
-Example
+Example
 ::
@@ -158,9 +158,9 @@ Example
 Kernel driver in use: amdgpu
 Kernel modules: amdgpu
-
+
 **To print PCIe root tree**
-
+
 ::
 ~$ lspci -tv
diff --git a/Other_Solutions/ROCm_PCIe_Debug.md b/Other_Solutions/ROCm_PCIe_Debug.md
index 633fe34b..44a02adc 100644
--- a/Other_Solutions/ROCm_PCIe_Debug.md
+++ b/Other_Solutions/ROCm_PCIe_Debug.md
@@ -1,5 +1,5 @@
-lspci helpfull options to help you debug ROCm install issue
+lspci helpful options to help you debug ROCm installation issues
 **************************************************************
 **To find out if the Linux kernel is seeing your GPU and to get the slot number and part number of the device you want to look at**
@@ -12,7 +12,7 @@ lspci helpfull options to help you debug ROCm install issue
 43:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860
 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
[AMD/ATI] Device 6860 ``` -**Show Device Slot** +**Show Device Slot** lspci -s _slot number_ @@ -31,7 +31,7 @@ Example :: ~$ sudo lspci -vs 63:00.0 - [sudo] password for rocm: + [sudo] password for rocm: 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860 (prog-if 00 [VGA controller]) Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c35 Flags: bus master, fast devsel, latency 0, IRQ 412 @@ -57,7 +57,7 @@ Example Kernel modules: amdgpu -**Display Vendor and Device Codes and numbers** +**Display Vendor and Device Codes and numbers** lspci -nvmms _slot number_ @@ -69,11 +69,11 @@ lspci -nvmms _slot number_ Vendor: 1002 Device: 6860 SVendor: 1002 - SDevice: 0c35 + SDevice: 0c35 + + +**To show kernel module running on device** - -**To show kernel module running on device** - lspci -ks _slot number_ :: @@ -84,11 +84,11 @@ lspci -nvmms _slot number_ Kernel driver in use: amdgpu Kernel modules: amdgpu -**When you need more information on the device** +**When you need more information on the device** sudo lspci -vvvs _slot number_ -Example +Example :: @@ -153,9 +153,9 @@ Example Kernel driver in use: amdgpu Kernel modules: amdgpu - + **To print PCIe root tree** - + :: ~$ lspci -tv diff --git a/Programming_Guides/CUDAAPIHIP.rst b/Programming_Guides/CUDAAPIHIP.rst index 88309746..926440ed 100644 --- a/Programming_Guides/CUDAAPIHIP.rst +++ b/Programming_Guides/CUDAAPIHIP.rst @@ -849,7 +849,7 @@ CUDA Driver API functions supported by HIP +----------------------+-----+ | cuGetErrorString | | +----------------------+-----+ - + 3. Initialization ------------------- @@ -1480,7 +1480,7 @@ CUDA Driver API functions supported by HIP +------------------------------------+--------------------------------------------------------+ | cuGraphicsD3D9RegisterResource | | +------------------------------------+--------------------------------------------------------+ - + 27.1. Direct3D 9 Interoperability [DEPRECATED] ------------------------------------------------ diff --git a/Programming_Guides/CUDAAPIHIPTEXTURE.rst b/Programming_Guides/CUDAAPIHIPTEXTURE.rst index 7b13131a..6bbcb136 100644 --- a/Programming_Guides/CUDAAPIHIPTEXTURE.rst +++ b/Programming_Guides/CUDAAPIHIPTEXTURE.rst @@ -623,7 +623,7 @@ CUDA Runtime API functions supported by HIP 28. C++ API Routines ----------------------- -(7.0 contains, 7.5 doesn’t) +(7.0 contains, 7.5 doesn't) +-------------------------------------------------------------+--------------------------------------------------+ | CUDA | HIP | diff --git a/Programming_Guides/HIP-FAQ.rst b/Programming_Guides/HIP-FAQ.rst index 0d58bc13..97c89028 100644 --- a/Programming_Guides/HIP-FAQ.rst +++ b/Programming_Guides/HIP-FAQ.rst @@ -45,7 +45,7 @@ See the `API Support Table `_, which utilizes `rocBlas `_. @@ -93,7 +93,7 @@ Additionally, some of the cublas routines are automatically converted to hipblas Both AMD and Nvidia support OpenCL 1.2 on their devices, so developers can write portable code. HIP offers several benefits over OpenCL: - * Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on. + * Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on. * The HIP API is less verbose than OpenCL and is familiar to CUDA developers. * Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL. 
* HIP uses the best available development tools on each platform: on Nvidia GPUs, HIP code compiles using NVCC and can employ the nSight profiler and debugger (unlike OpenCL on Nvidia GPUs).
diff --git a/Programming_Guides/HIP-GUIDE.rst b/Programming_Guides/HIP-GUIDE.rst
index f032101a..88accc95 100644
--- a/Programming_Guides/HIP-GUIDE.rst
+++ b/Programming_Guides/HIP-GUIDE.rst
@@ -10,7 +10,7 @@ HIP provides a C++ syntax that is suitable for compiling most code that commonly
 * Math functions resembling those in the "math.h" header included with standard C++ compilers
 * Built-in functions for accessing specific GPU hardware capabilities
-This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different.
+This section describes the built-in variables and functions accessible from the HIP kernel. It's intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different.
 Features are marked with one of the following keywords:
@@ -21,9 +21,9 @@ Features are marked with one of the following keywords:
-Function-Type Qualifiers
-*************************
-
+Function-Type Qualifiers
+*************************
+
 **__device__**
 Supported __device__ functions are
@@ -70,34 +70,34 @@ __global__ functions are often referred to as kernels, and calling one is termed
 * hipStream_t: stream where the kernel should execute. A value of 0 corresponds to the NULL stream (see :ref:`Synchronization-Functions`).
 * Kernel arguments follow these first five parameters ::
-
+
   //Example pseudo code introducing hipLaunchKernelGGL
   __global__ void MyKernel(float *A, float *B, float *C, size_t N)
   {
   ...
-  }
+  }
   //Replace MyKernel<<<dim3(gridDim), dim3(groupDim), 0, 0>>> (a,b,c,n);
   hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream*/, a, b, c, n);
-The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing.
+The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that's currently executing.
 ..
_Kernel: Kernel-Launch Example +++++++++++++++++++++++ - + :: - - // Example showing device function, __device__ __host__ - // <- compile for both device and host - float PlusOne(float x) + + // Example showing device function, __device__ __host__ + // <- compile for both device and host + float PlusOne(float x) { return x + 1.0; } - __global__ - void + __global__ + void MyKernel (const float *a, const float *b, float *c, unsigned N) { unsigned gid = hipThreadIdx_x; // <- coordinate index function @@ -110,18 +110,18 @@ Kernel-Launch Example float *a, *b, *c; // initialization not shown... unsigned N = 1000000; const unsigned blockSize = 256; - hipLaunchKernelGGL(MyKernel, + hipLaunchKernelGGL(MyKernel, (N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); } - + Variable-Type Qualifiers ************************ **__constant__** - + The __constant__ keyword is supported. The host writes constant memory before launching the kernel; from the GPU, this memory is read-only during kernel execution. The functions for accessing constant memory (hipGetSymbolAddress(), hipGetSymbolSize(), hipMemcpyToSymbol(), hipMemcpyToSymbolAsync, hipMemcpyFromSymbol, hipMemcpyFromSymbolAsync) are under development. **__shared__** @@ -149,19 +149,19 @@ These built-ins determine the coordinate of the active work item in the executio hipThreadIdx_x threadIdx.x hipThreadIdx_y threadIdx.y hipThreadIdx_z threadIdx.z - + hipBlockIdx_x blockIdx.x hipBlockIdx_y blockIdx.y hipBlockIdx_z blockIdx.z - + hipBlockDim_x blockDim.x hipBlockDim_y blockDim.y hipBlockDim_z blockDim.z - + hipGridDim_x gridDim.x hipGridDim_y gridDim.y @@ -206,9 +206,9 @@ dim3 dim3 is a three-dimensional integer vector type commonly used to specify grid and group dimensions. Unspecified dimensions are initialized to 1. :: typedef struct dim3 { - uint32_t x; - uint32_t y; - uint32_t z; + uint32_t x; + uint32_t y; + uint32_t z; dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; }; @@ -243,357 +243,357 @@ Following is the list of supported single precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| float acosf ( float x ) | ✓ | ✓ | +| float acosf ( float x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float acoshf ( float x ) | ✓ | ✓ | +| float acoshf ( float x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinf ( float x ) | ✓ | ✓ | +| float asinf ( float x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinhf ( float x ) | ✓ | ✓ | +| float asinhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atan2f ( float y, float x ) | ✓ | ✓ | +| float atan2f ( float y, float x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanf ( float x ) | ✓ | ✓ | +| float atanf ( float x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanhf ( float x ) | ✓ | ✓ | +| float atanhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cbrtf ( float x ) | ✓ | ✓ | +| float cbrtf ( float x ) | ? | ? | | | | | | Calculate the cube root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ceilf ( float x ) | ✓ | ✓ | +| float ceilf ( float x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float copysignf ( float x, float y ) | ✓ | ✓ | +| float copysignf ( float x, float y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cosf ( float x ) | ✓ | ✓ | +| float cosf ( float x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float coshf ( float x ) | ✓ | ✓ | +| float coshf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcf ( float x ) | ✓ | ✓ | +| float erfcf ( float x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erff ( float x ) | ✓ | ✓ | +| float erff ( float x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp10f ( float x ) | ✓ | ✓ | +| float exp10f ( float x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp2f ( float x ) | ✓ | ✓ | +| float exp2f ( float x ) | ? | ? | | | | | | Calculate the base 2 exponential of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expf ( float x ) | ✓ | ✓ | +| float expf ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expm1f ( float x ) | ✓ | ✓ | +| float expm1f ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fabsf ( float x ) | ✓ | ✓ | +| float fabsf ( float x ) | ? | ? | | | | | | Calculate the absolute value of its argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdimf ( float x, float y ) | ✓ | ✓ | +| float fdimf ( float x, float y ) | ? | ? | | | | | | Compute the positive difference between x and y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float floorf ( float x ) | ✓ | ✓ | +| float floorf ( float x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaf ( float x, float y, float z ) | ✓ | ✓ | +| float fmaf ( float x, float y, float z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaxf ( float x, float y ) | ✓ | ✓ | +| float fmaxf ( float x, float y ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fminf ( float x, float y ) | ✓ | ✓ | +| float fminf ( float x, float y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmodf ( float x, float y ) | ✓ | ✓ | +| float fmodf ( float x, float y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int* nptr ) | ✓ | ✗ | +| float frexpf ( float x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float hypotf ( float x, float y ) | ✓ | ✓ | +| float hypotf ( float x, float y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogbf ( float x ) | ✓ | ✓ | +| int ilogbf ( float x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( float a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( float a ) | ? | ? | | | | | | Determine whether argument is infinite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( float a ) | ? | ? | | | | | | Determine whether argument is a NaN. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ldexpf ( float x, int exp ) | ✓ | ✓ | +| float ldexpf ( float x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log10f ( float x ) | ✓ | ✓ | +| float log10f ( float x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log1pf ( float x ) | ✓ | ✓ | +| float log1pf ( float x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logbf ( float x ) | ✓ | ✓ | +| float logbf ( float x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log2f ( float x ) | ✓ | ✓ | +| float log2f ( float x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logf ( float x ) | ✓ | ✓ | +| float logf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float* iptr ) | ✓ | ✗ | +| float modff ( float x, float* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nanf ( const char* tagp ) | ✗ | ✓ | +| float nanf ( const char* tagp ) | ? | ? 
| | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nearbyintf ( float x ) | ✓ | ✓ | +| float nearbyintf ( float x ) | ? | ? | | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float powf ( float x, float y ) | ✓ | ✓ | +| float powf ( float x, float y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remainderf ( float x, float y ) | ✓ | ✓ | +| float remainderf ( float x, float y ) | ? | ? | | | | | | Compute single-precision floating-point remainder. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int* quo ) | ✓ | ✗ | +| float remquof ( float x, float y, int* quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float roundf ( float x ) | ✓ | ✓ | +| float roundf ( float x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalbnf ( float x, int n ) | ✓ | ✓ | +| float scalbnf ( float x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( float a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float* sptr, float* cptr ) | ✓ | ✗ | +| void sincosf ( float x, float* sptr, float* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinf ( float x ) | ✓ | ✓ | +| float sinf ( float x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinhf ( float x ) | ✓ | ✓ | +| float sinhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sqrtf ( float x ) | ✓ | ✓ | +| float sqrtf ( float x ) | ? | ? | | | | | | Calculate the square root of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanf ( float x ) | ✓ | ✓ | +| float tanf ( float x ) | ? | ? | | | | | | Calculate the tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanhf ( float x ) | ✓ | ✓ | +| float tanhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float truncf ( float x ) | ✓ | ✓ | +| float truncf ( float x ) | ? | ? | | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tgammaf ( float x ) | ✓ | ✓ | +| float tgammaf ( float x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcinvf ( float y ) | ✓ | ✓ | +| float erfcinvf ( float y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcxf ( float x ) | ✓ | ✓ | +| float erfcxf ( float x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfinvf ( float y ) | ✓ | ✓ | +| float erfinvf ( float y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdividef ( float x, float y ) | ✓ | ✓ | +| float fdividef ( float x, float y ) | ? | ? | | | | | | Divide two floating point values. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int *nptr ) | ✓ | ✓ | +| float frexpf ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j0f ( float x ) | ✓ | ✓ | +| float j0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j1f ( float x ) | ✓ | ✓ | +| float j1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float jnf ( int n, float x ) | ✓ | ✓ | +| float jnf ( int n, float x ) | ? 
| ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float lgammaf ( float x ) | ✓ | ✓ | +| float lgammaf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrintf ( float x ) | ✓ | ✓ | +| long long int llrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llroundf ( float x ) | ✓ | ✓ | +| long long int llroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrintf ( float x ) | ✓ | ✓ | +| long int lrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lroundf ( float x ) | ✓ | ✓ | +| long int lroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float *iptr ) | ✓ | ✓ | +| float modff ( float x, float *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nextafterf ( float x, float y ) | ✓ | ✓ | +| float nextafterf ( float x, float y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm3df ( float a, float b, float c ) | ✓ | ✓ | +| float norm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float norm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdff ( float y ) | ✓ | ✓ | +| float normcdff ( float y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdfinvf ( float y ) | ✓ | ✓ | +| float normcdfinvf ( float y ) | ? | ? 
| | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normf ( int dim, const float *a ) | ✓ | ✓ | +| float normf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rcbrtf ( float x ) | ✓ | ✓ | +| float rcbrtf ( float x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int *quo ) | ✓ | ✓ | +| float remquof ( float x, float y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rhypotf ( float x, float y ) | ✓ | ✓ | +| float rhypotf ( float x, float y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rintf ( float x ) | ✓ | ✓ | +| float rintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm3df ( float a, float b, float c ) | ✓ | ✓ | +| float rnorm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float rnorm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnormf ( int dim, const float *a ) | ✓ | ✓ | +| float rnormf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalblnf ( float x, long int n ) | ✓ | ✓ | +| float scalblnf ( float x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincosf ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospif ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincospif ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y0f ( float x ) | ✓ | ✓ | +| float y0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y1f ( float x ) | ✓ | ✓ | +| float y1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ynf ( int n, float x ) | ✓ | ✓ | +| float ynf ( int n, float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. +[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. **Double Precision Mathematical Functions** @@ -603,348 +603,348 @@ Following is the list of supported double precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| double acos ( double x ) | ✓ | ✓ | +| double acos ( double x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double acosh ( double x ) | ✓ | ✓ | +| double acosh ( double x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asin ( double x ) | ✓ | ✓ | +| double asin ( double x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asinh ( double x ) | ✓ | ✓ | +| double asinh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan ( double x ) | ✓ | ✓ | +| double atan ( double x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan2 ( double y, double x ) | ✓ | ✓ | +| double atan2 ( double y, double x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atanh ( double x ) | ✓ | ✓ | +| double atanh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cbrt ( double x ) | ✓ | ✓ | +| double cbrt ( double x ) | ? | ? | | | | | | Calculate the cube root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ceil ( double x ) | ✓ | ✓ | +| double ceil ( double x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double copysign ( double x, double y ) | ✓ | ✓ | +| double copysign ( double x, double y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cos ( double x ) | ✓ | ✓ | +| double cos ( double x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cosh ( double x ) | ✓ | ✓ | +| double cosh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erf ( double x ) | ✓ | ✓ | +| double erf ( double x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfc ( double x ) | ✓ | ✓ | +| double erfc ( double x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp ( double x ) | ✓ | ✓ | +| double exp ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp10 ( double x ) | ✓ | ✓ | +| double exp10 ( double x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp2 ( double x ) | ✓ | ✓ | +| double exp2 ( double x ) | ? | ? 
| | | | | | Calculate the base 2 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double expm1 ( double x ) | ✓ | ✓ | +| double expm1 ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fabs ( double x ) | ✓ | ✓ | +| double fabs ( double x ) | ? | ? | | | | | | Calculate the absolute value of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fdim ( double x, double y ) | ✓ | ✓ | +| double fdim ( double x, double y ) | ? | ? | | | | | | Compute the positive difference between x and y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double floor ( double x ) | ✓ | ✓ | +| double floor ( double x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fma ( double x, double y, double z ) | ✓ | ✓ | +| double fma ( double x, double y, double z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmax ( double , double ) | ✓ | ✓ | +| double fmax ( double , double ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmin ( double x, double y ) | ✓ | ✓ | +| double fmin ( double x, double y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmod ( double x, double y ) | ✓ | ✓ | +| double fmod ( double x, double y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( double x, int* nptr ) | ✓ | ✗ | +| double frexp ( double x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double hypot ( double x, double y ) | ✓ | ✓ | +| double hypot ( double x, double y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogb ( double x ) | ✓ | ✓ | +| int ilogb ( double x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( double a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( double a ) | ? | ? | | | | | | Determine whether argument is infinite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( double a ) | ? | ? | | | | | | Determine whether argument is a NaN. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ldexp ( double x, int exp ) | ✓ | ✓ | +| double ldexp ( double x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log ( double x ) | ✓ | ✓ | +| double log ( double x ) | ? | ? | | | | | | Calculate the base e logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log10 ( double x ) | ✓ | ✓ | +| double log10 ( double x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log1p ( double x ) | ✓ | ✓ | +| double log1p ( double x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log2 ( double x ) | ✓ | ✓ | +| double log2 ( double x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double logb ( double x ) | ✓ | ✓ | +| double logb ( double x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double* iptr ) | ✓ | ✗ | +| double modf ( double x, double* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nan ( const char* tagp ) | ✗ | ✓ | +| double nan ( const char* tagp ) | ? | ? | | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nearbyint ( double x ) | ✓ | ✓ | +| double nearbyint ( double x ) | ? | ? 
| | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double pow ( double x, double y ) | ✓ | ✓ | +| double pow ( double x, double y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remainder ( double x, double y ) | ✓ | ✓ | +| double remainder ( double x, double y ) | ? | ? | | | | | | Compute double-precision floating-point remainder. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int* quo ) | ✓ | ✗ | +| double remquo ( double x, double y, int* quo ) | ? | ? | | | | | | Compute double-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double round ( double x ) | ✓ | ✓ | +| double round ( double x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbn ( double x, int n ) | ✓ | ✓ | +| double scalbn ( double x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( double a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sin ( double x ) | ✓ | ✓ | +| double sin ( double x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double* sptr, double* cptr ) | ✓ | ✗ | +| void sincos ( double x, double* sptr, double* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sinh ( double x ) | ✓ | ✓ | +| double sinh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sqrt ( double x ) | ✓ | ✓ | +| double sqrt ( double x ) | ? | ? | | | | | | Calculate the square root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tan ( double x ) | ✓ | ✓ | +| double tan ( double x ) | ? | ? | | | | | | Calculate the tangent of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tanh ( double x ) | ✓ | ✓ | +| double tanh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tgamma ( double x ) | ✓ | ✓ | +| double tgamma ( double x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double trunc ( double x ) | ✓ | ✓ | +| double trunc ( double x ) | ? | ? | | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcinv ( double y ) | ✓ | ✓ | +| double erfcinv ( double y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcx ( double x ) | ✓ | ✓ | +| double erfcx ( double x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfinv ( double y ) | ✓ | ✓ | +| double erfinv ( double y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( float x, int *nptr ) | ✓ | ✓ | +| double frexp ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j0 ( double x ) | ✓ | ✓ | +| double j0 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j1 ( double x ) | ✓ | ✓ | +| double j1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double jn ( int n, double x ) | ✓ | ✓ | +| double jn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double lgamma ( double x ) | ✓ | ✓ | +| double lgamma ( double x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrint ( double x ) | ✓ | ✓ | +| long long int llrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llround ( double x ) | ✓ | ✓ | +| long long int llround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrint ( double x ) | ✓ | ✓ | +| long int lrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lround ( double x ) | ✓ | ✓ | +| long int lround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double *iptr ) | ✓ | ✓ | +| double modf ( double x, double *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nextafter ( double x, double y ) | ✓ | ✓ | +| double nextafter ( double x, double y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double norm3d ( double a, double b, double c ) | ✓ | ✓ | +| double norm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| float norm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdf ( double y ) | ✓ | ✓ | +| double normcdf ( double y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdfinv ( double y ) | ✓ | ✓ | +| double normcdfinv ( double y ) | ? | ? | | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rcbrt ( double x ) | ✓ | ✓ | +| double rcbrt ( double x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int *quo ) | ✓ | ✓ | +| double remquo ( double x, double y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rhypot ( double x, double y ) | ✓ | ✓ | +| double rhypot ( double x, double y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rint ( double x ) | ✓ | ✓ | +| double rint ( double x ) | ? | ? | | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm3d ( double a, double b, double c ) | ✓ | ✓ | +| double rnorm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| double rnorm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm ( int dim, const double *a ) | ✓ | ✓ | +| double rnorm ( int dim, const double *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbln ( double x, long int n ) | ✓ | ✓ | +| double scalbln ( double x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincos ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospi ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincospi ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y0f ( double x ) | ✓ | ✓ | +| double y0f ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y1 ( double x ) | ✓ | ✓ | +| double y1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double yn ( int n, double x ) | ✓ | ✓ | +| double yn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. +[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. **Integer Intrinsics** @@ -1038,23 +1038,23 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | float __frsqrt_rn ( float x ) | | | -| Compute 1/√x in round-to-nearest-even mode. | +| Compute 1/?x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rd ( float x ) | | | -| Compute √x in round-down mode. | +| Compute ?x in round-down mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rn ( float x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute ?x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_ru ( float x ) | | | -| Compute √x in round-up mode. | +| Compute ?x in round-up mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rz ( float x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute ?x in round-towards-zero mode. | +----------------------------------------------------------------------------+ | float __log10f ( float x ) | | | @@ -1082,19 +1082,19 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | double __dsqrt_rd ( double x ) | | | -| Compute √x in round-down mode. | +| Compute ?x in round-down mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rn ( double x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute ?x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | double __dsqrt_ru ( double x ) | | | -| Compute √x in round-up mode. | +| Compute ?x in round-up mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rz ( double x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute ?x in round-towards-zero mode. | +----------------------------------------------------------------------------+ Texture Functions @@ -1123,65 +1123,65 @@ HIP supports the following atomic operations. 
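Before the table, here is a minimal sketch of how these atomics are typically used in device code. The kernel and buffer names below are hypothetical illustrations, not part of the API list; only the atomicAdd signature and the coordinate built-ins come from this documentation.

::

    // Hypothetical histogram kernel: each thread atomically increments one bin.
    // Assumes every value in data[] already falls inside the valid bin range.
    __global__ void histogramKernel(const int *data, int *bins, size_t n)
    {
        size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;  // coordinate built-ins
        if (i < n) {
            atomicAdd(&bins[data[i]], 1);   // int atomicAdd(int* address, int val)
        }
    }
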
+-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ | Function | Supported in HIP | Supported in CUDA | +=============================================================================================================================+==================+===================+ -| int atomicAdd(int* address, int val) | ✓ | ✓ | +| int atomicAdd(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicAdd(float* address, float val) | ✓ | ✓ | +| float atomicAdd(float* address, float val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicSub(int* address, int val) | ✓ | ✓ | +| int atomicSub(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicSub(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicExch(int* address, int val) | ✓ | ✓ | +| int atomicExch(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicExch(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicExch(float* address, float val) | ✓ | ✓ | +| float atomicExch(float* address, float val) | ? | ? 
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMin(int* address, int val) | ✓ | ✓ | +| int atomicMin(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMin(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMax(int* address, int val) | ✓ | ✓ | +| int atomicMax(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMax(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicInc(unsigned int* address) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicDec(unsigned int* address) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +| int atomicCAS(int* address, int compare, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ? | ? 
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicAnd(int* address, int val) | ✓ | ✓ | +| int atomicAnd(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicOr(int* address, int val) | ✓ | ✓ | +| int atomicOr(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicOr(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicXor(int* address, int val) | ✓ | ✓ | +| int atomicXor(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicXor(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ | +| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ? | ? 
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ @@ -1197,11 +1197,11 @@ Warp Cross Lane Functions Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory. -Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn’t assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: - +Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn't assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: + cudaDeviceProp props; cudaGetDeviceProperties(&props, deviceID); - int w = props.warpSize; + int w = props.warpSize; // implement portable algorithm based on w (rather than assume 32 or 64) **Warp Vote and Ballot Functions** @@ -1219,14 +1219,14 @@ Threads in a warp are referred to as lanes and are numbered from 0 to warpSize - Applications can test whether the target platform supports the any/all instruction using the hasWarpVote device property or the HIP_ARCH_HAS_WARP_VOTE compiler define. -__ballot provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda’s 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. +__ballot provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda's 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. Warp Shuffle Functions ************************ Half-float shuffles are not supported. The default width is warpSize---see :ref:`WarpCross` . Applications should not assume the warpSize is 32 or 64. - + :: int __shfl (int var, int srcLane, int width=warpSize); @@ -1235,7 +1235,7 @@ Half-float shuffles are not supported. 
The default width is warpSize---see :ref: float __shfl_up (float var, unsigned int delta, int width=warpSize); int __shfl_down (int var, unsigned int delta, int width=warpSize); float __shfl_down (float var, unsigned int delta, int width=warpSize) ; - int __shfl_xor (int var, int laneMask, int width=warpSize) + int __shfl_xor (int var, int laneMask, int width=warpSize) float __shfl_xor (float var, int laneMask, int width=warpSize); Profiler Counter Function @@ -1263,7 +1263,7 @@ hip_launch_bounds allows the application to provide usage hints that influence t :: __global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ... - MyKernel(hipGridLaunch lp, ...) + MyKernel(hipGridLaunch lp, ...) ... launch_bounds supports two parameters: @@ -1295,7 +1295,7 @@ CUDA defines a __launch_bounds which is also designed to control occupancy: :: * The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors ( This conversion is performed automatically by the clang hipify tools.) :: - + MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK)/32 @@ -1320,14 +1320,14 @@ Unroll with a bounds that is known at compile-time is supported. For example:: #pragma unroll 16 /* hint to compiler to unroll next loop by 16 */ for (int i=0; i<16; i++) ... -:: - +:: + #pragma unroll 1 /* tell compiler to never unroll the loop */ for (int i=0; i<16; i++) ... -Unbounded loop unroll is under development on HCC compiler. +Unbounded loop unroll is under development on HCC compiler. :: - + #pragma unroll /* hint to compiler to completely unroll next loop. */ for (int i=0; i<16; i++) ... @@ -1348,12 +1348,12 @@ Kernel Compilation hipcc now supports compiling C++/HIP kernels to binary code objects. The user can specify the target for which the binary can be generated. HIP/HCC does not yet support fat binaries so only a single target may be specified. The file format for binary is .co which means Code Object. The following command builds the code object using hipcc. -:: +:: hipcc --genco --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] :: - + [INPUT FILE] = Name of the file containing kernels [OUTPUT FILE] = Name of the generated code object file diff --git a/Programming_Guides/HIP-porting-guide.rst b/Programming_Guides/HIP-porting-guide.rst index a6315f25..b5c578a3 100644 --- a/Programming_Guides/HIP-porting-guide.rst +++ b/Programming_Guides/HIP-porting-guide.rst @@ -7,7 +7,7 @@ HIP Porting Guide ~~~~~~~~~~~~~~~~~ In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease the porting of existing CUDA code into the HIP environment. This section describes the available tools and provides practical suggestions on how to port CUDA code and work through common issues. - + Porting a New Cuda Project ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -24,8 +24,8 @@ Scanning existing CUDA code to scope the porting effort The hipexamine.sh tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified, -:: - +:: + > cd examples/rodinia_3.0/cuda/kmeans > $HIP_DIR/bin/hipexamine.sh . info: hipify ./kmeans.h =====> @@ -47,10 +47,10 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir * Files with no CUDA code (ie kmeans.h) print one line summary just listing the source file name. 
* Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file: - :: - + :: + info: hipify ./kmeans_cuda_kernel.cu =====> - info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 + info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 * Interesting information in kmeans_cuda_kernel.cu : * How many CUDA calls were converted to HIP (40) @@ -60,7 +60,7 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir * hipexamine also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called. An example from above: -:: +:: info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607 kernels (1 total) : kmeansPoint(1) @@ -68,7 +68,7 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir Converting a project "in-place" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:: +:: > hipify --inplace @@ -82,7 +82,7 @@ This is useful for testing improvements to the hipify toolset. The `hipconvertinplace.sh `_ script will perform inplace conversion for all code files in the specified directory. This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure and filenames - so includes work. After converting in-place, you can review the code to add additional parameters to directory names. :: - + > hipconvertinplace.sh MY_SRC_DIR Distinguishing Compiler Modes @@ -103,27 +103,27 @@ Identifying the Compiler: hcc, hip-clang or nvcc Often, it useful to know whether the underlying compiler is hcc or nvcc. This knowledge can guard platform-specific code (features that only work on the nvcc or hcc path but not both) or aid in platform-specific performance tuning. -:: +:: #ifdef __HCC__ - // Compiled with hcc - + // Compiled with hcc + :: #ifdef __HIP__ - // Compiled with hip-clang + // Compiled with hip-clang :: #ifdef __NVCC__ - // Compiled with nvcc + // Compiled with nvcc // Could be compiling with Cuda language extensions enabled (for example, a ".cu file) // Could be in pass-through mode to an underlying host compile OR (for example, a .cpp file) - -:: + +:: #ifdef __CUDACC__ - // Compiled with nvcc (Cuda language extensions enabled) + // Compiled with nvcc (Cuda language extensions enabled) hcc and hip-clang directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the __CUDA_ACC define. @@ -136,9 +136,9 @@ Identifying Current Compilation Pass: Host or Device Both nvcc and hcc make two passes over the code: one for host code and one for device code. __HIP_DEVICE_COMPILE__ is set to a nonzero value when the compiler (hcc or nvcc) is compiling code for a device inside a __global__ kernel or for a device function. __HIP_DEVICE_COMPILE__ can replace #ifdef checks on the __CUDA_ARCH__ define. :: - - // #ifdef __CUDA_ARCH__ - + + // #ifdef __CUDA_ARCH__ + #if __HIP_DEVICE_COMPILE__ Unlike __CUDA_ARCH__, the __HIP_DEVICE_COMPILE__ value is 1 or undefined, and it doesn't represent the feature capability of the target device. 
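As a minimal sketch of how the defines described above can be combined in portable code (illustrative only; the add_one function is hypothetical, while the macros are the ones documented here):

::

    #include "hip/hip_runtime.h"

    __host__ __device__ float add_one(float x)
    {
    #if __HIP_DEVICE_COMPILE__
        // Device compilation pass (replaces #ifdef __CUDA_ARCH__ checks).
        return x + 1.0f;
    #else
        // Host compilation pass.
        return x + 1.0f;
    #endif
    }

    #ifdef __HIPCC__
        // Compiled through hipcc, on either the hcc/hip-clang or the nvcc path.
    #endif
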
@@ -149,48 +149,48 @@ Compiler Defines: Summary +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | Define | hcc | nvcc | Other (GCC, ICC, Clang, etc.) | +===========================+===============================+=================================+======================================+ - |HIP-related defines: | + |HIP-related defines: | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HIP_PLATFORM_HCC___ | Defined | Undefined | | Defined if targeting hcc platform; | - | | | | | undefined otherwise | + | | | | | undefined otherwise | | | | | | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HIP_PLATFORM_NVCC___ | Undefined | defined | | Defined if targeting NVcc platform;| - | | | | | undefined otherwise | + | | | | | undefined otherwise | | | | | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ - | | | 1 if compiling for device; | | 1 if compiling for device; | | - |__HIP_DEVICE_COMPILE__ | | undefined if compiling | | undefined if compiling | Undefined | - | | | for host | | for host | | + | | | 1 if compiling for device; | | 1 if compiling for device; | | + |__HIP_DEVICE_COMPILE__ | | undefined if compiling | | undefined if compiling | Undefined | + | | | for host | | for host | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HIPCC__ | Defined | Defined | Undefined | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | | | 0 or 1 depending on feature | | 0 or 1 depending on feature | | | __HIP_ARCH_* | | support (see below) | | support (see below) | 0 | | | | | | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ - | nvcc-related defines: | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + | nvcc-related defines: | + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __CUDACC__ | Undefined | | Defined if source code is | | | | | | compiled by nvcc; | Undefined | | | | | undefined otherwise | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __NVCC__ | Undefined | Defined | Undefined | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ - | | | | Unsigned representing compute | | - | __CUDA_ARCH__ | Undefined | | capability (e.g., "130")if in | Undefined | + | | | | Unsigned representing compute | | + | __CUDA_ARCH__ | Undefined | | 
capability (e.g., "130")if in | Undefined | | | | | device code; 0 if in host code| | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | hcc-related defines: | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HCC__ | Defined | Undefined | Undefined | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | | | Nonzero if in device code; | | | - | __HCC_ACCELERATOR__ | | otherwise undefined | Undefined | Undefined | - | | | | | + | __HCC_ACCELERATOR__ | | otherwise undefined | Undefined | Undefined | + | | | | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | | Defined | Undefined | | Defined if using Clang; | - | __clang__ | | | | otherwise undefined | + | __clang__ | | | | otherwise undefined | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ Identifying Architecture Features @@ -201,8 +201,8 @@ HIP_ARCH Defines Some Cuda code tests __CUDA_ARCH__ for a specific value to determine whether the machine supports a certain architectural feature. For instance, :: - - #if (__CUDA_ARCH__ >= 130) + + #if (__CUDA_ARCH__ >= 130) // doubles are supported @@ -212,7 +212,7 @@ This type of code requires special attention, since hcc/AMD and nvcc/Cuda device The __HIP_ARCH_* defines can replace comparisons of __CUDA_ARCH__ values: :: - + //#if (__CUDA_ARCH__ >= 130) // non-portable if __HIP_ARCH_HAS_DOUBLES__ { // portable HIP feature query // doubles are supported @@ -241,32 +241,32 @@ The table below shows the full set of architectural properties that HIP supports |Define (use only in device code) | Device Property (run-time query) | Comment | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | 32-bit atomics: | | - +------------------------------------------+-----------------------------------+----------------------------------------------------+ + +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ | hasGlobalInt32Atomics | 32-bit integer atomics for global memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__| hasGlobalFloatAtomicExch | 32-bit float atomic exchange for global memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ | hasSharedInt32Atomics | 32-bit integer atomics for shared memory | + | __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ | hasSharedInt32Atomics | 32-bit integer atomics for shared memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__| hasSharedFloatAtomicExch | 32-bit float atomic exchange for shared memory | - 
+------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory | + +------------------------------------------+-----------------------------------+----------------------------------------------------+ + | __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | 64-bit atomics: | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ | hasGlobalInt64Atomics | 64-bit integer atomics for global memory | + | __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ | hasGlobalInt64Atomics | 64-bit integer atomics for global memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ | hasSharedInt64Atomics | 64-bit integer atomics for shared memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | Doubles: | + | Doubles: | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_DOUBLES__ | hasDoubles | Double-precision floating point | + | __HIP_ARCH_HAS_DOUBLES__ | hasDoubles | Double-precision floating point | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | Warp cross-lane operations: | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_WARP_VOTE__ | hasWarpVote | Warp vote instructions (any, all) | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_WARP_BALLOT__ | hasWarpBallot | Warp ballot instructions | + | __HIP_ARCH_HAS_WARP_BALLOT__ | hasWarpBallot | Warp ballot instructions | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_WARP_SHUFFLE__ | hasWarpShuffle | Warp shuffle operations (shfl_*) | +------------------------------------------+-----------------------------------+----------------------------------------------------+ @@ -277,15 +277,15 @@ The table below shows the full set of architectural properties that HIP supports | __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ | hasThreadFenceSystem | threadfence_system | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SYNC_THREAD_EXT__ | hasSyncThreadsExt | syncthreads_count, syncthreads_and, syncthreads_or | - +------------------------------------------+-----------------------------------+----------------------------------------------------+ + +------------------------------------------+-----------------------------------+----------------------------------------------------+ | Miscellaneous: | 
+------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SURFACE_FUNCS__ | hasSurfaceFuncs | | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_3DGRID__ | has3dGrid | Grids and groups are 3D | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ | hasDynamicParallelism | | - +------------------------------------------+-----------------------------------+----------------------------------------------------+ + | __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ | hasDynamicParallelism | | + +------------------------------------------+-----------------------------------+----------------------------------------------------+ Finding HIP @@ -315,11 +315,11 @@ While this can be a convenient single-line kernel launch syntax, the macro imple Avoid nesting macro parameters inside parenthesis - here's an alternative that will work: :: - + #define MY_LAUNCH(command, doTrace) \ {\ if (doTrace) printf ("TRACE: %s\n", #command); \ - command;\ + command;\ } MY_LAUNCH (hipLaunchKernelGGL(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); @@ -328,12 +328,12 @@ Compiler Options ~~~~~~~~~~~~~~~~ hipcc is a portable compiler driver that will call nvcc or hcc (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler. The hipconfig script may helpful in making infrastructure that identifies the target platform and sets options appropriately. It returns either "nvcc" or "hcc." The following sample shows the script in a makefile: -:: +:: HIP_PLATFORM=$(shell hipconfig --compiler) ifeq (${HIP_PLATFORM}, nvcc) - HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 + HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif ifeq (${HIP_PLATFORM}, hcc) HIPCC_FLAGS = -Wno-deprecated-register @@ -387,7 +387,7 @@ You can compile hip_runtime_api.h using a standard C or C++ compiler (e.g., gcc :: - > hipconfig --cxx_config + > hipconfig --cxx_config -D__HIP_PLATFORM_HCC__ -I/home/user1/hip/include You can capture the hipconfig output and passed it to the standard compiler; below is a sample makefile syntax: @@ -470,7 +470,7 @@ Device Code: } std::cout<<"Passed"< - + ... - + <hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0 libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0 @@ -64,9 +64,9 @@ ltrace can be easily combined with the HIP_DB switches to visualize the runtime libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0 libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0 <... hsa_amd_memory_unlock resumed> ) = 0 - hip-api tid:1.17 hipMemcpy + hip-api tid:1.17 hipMemcpy ret= 0 (hipSuccess)>> - + Some key information from the trace above. @@ -108,15 +108,15 @@ Debugging HIP Applications * The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread. This can be useful for setting conditional breakpoints. Also, each new HIP thread is mapped to monotonically increasing shortTid ID. 
Both of these fields are displayed in the HIP debug info. - :: + :: (gdb) p tls_tidInfo $32 = {_shortTid = 1, _apiSeqNum = 803} - + * HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc". If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output. This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. The gdb syntax also supports using the variable name (in this case 'dst'): :: - + (gdb) p dst $33 = (void *) 0x5ec7e9000 (gdb) call hc::am_memtracker_print(dst) @@ -125,16 +125,16 @@ Debugging HIP Applications ... -->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) - To debug an explicit address, cast the address to (void*) + To debug an explicit address, cast the address to (void*) :: - + (gdb) call hc::am_memtracker_print((void*)0x508c7f000) * Debugging GPUVM fault. For example: Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. :: - + Program received signal SIGABRT, Aborted. [Switching to Thread 0x7fffdffb5700 (LWP 14893)] 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 @@ -163,9 +163,9 @@ Debugging HIP Applications #3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so #4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so #5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so - #6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 + #6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 ... - + .. _General Debugging Tips: diff --git a/Programming_Guides/Kernel_language.rst b/Programming_Guides/Kernel_language.rst index faf5b330..de8bf4ea 100644 --- a/Programming_Guides/Kernel_language.rst +++ b/Programming_Guides/Kernel_language.rst @@ -63,7 +63,7 @@ HIP provides a C++ syntax that is suitable for compiling most code that commonly * Math functions resembling those in the "math.h" header included with standard C++ compilers * Built-in functions for accessing specific GPU hardware capabilities -This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. +This section describes the built-in variables and functions accessible from the HIP kernel. It's intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. Features are marked with one of the following keywords: @@ -134,14 +134,14 @@ Calling __global__ Functions __global__ MyKernel(float *A, float *B, float *C, size_t N) { ... 
- } - + } + // Replace MyKernel<<<gridDim, groupDim>>> (a,b,c,n); - + hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream*/, a, b, c, n); - -The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See `Coordinate Built-Ins `_. + +The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that's currently executing. See `Coordinate Built-Ins `_. .. _Kernel-Launch-Example: @@ -150,15 +150,15 @@ Kernel-Launch Example :: - // Example showing device function, __device__ __host__ - // <- compile for both device and host - float PlusOne(float x) + // Example showing device function, __device__ __host__ + // <- compile for both device and host + float PlusOne(float x) { return x + 1.0; } - - __global__ - void + + __global__ + void MyKernel (const float *a, const float *b, float *c, unsigned N) { unsigned gid = hipThreadIdx_x; // <- coordinate index function @@ -170,11 +170,11 @@ Kernel-Launch Example { float *a, *b, *c; // initialization not shown... unsigned N = 1000000; - const unsigned blockSize = 256; - + const unsigned blockSize = 256; + hipLaunchKernelGGL(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); } - + .. _Variable-Type-Qualifiers: Variable-Type Qualifiers @@ -260,13 +260,13 @@ Note that these types are defined in hip_runtime.h and are not automatically pro Short Vector Types ++++++++++++++++++++ -Short vector types derive from the basic integer and floating-point types. They are structures defined in hip_vector_types.h. The first, second, third and fourth components of the vector are accessible through the *x, y, z* and *w* fields, respectively. All the short vector types support a constructor function of the form make_(). +Short vector types derive from the basic integer and floating-point types. They are structures defined in hip_vector_types.h. The first, second, third and fourth components of the vector are accessible through the *x, y, z* and *w* fields, respectively. All the short vector types support a constructor function of the form make_(). For example, ``float4 make_float4(float x, float y, float z, float w)`` creates a vector of type float4 and value (x,y,z,w).
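As a rough sketch of how the make_ constructor functions and the *x, y, z, w* fields are typically used together, the short program below scales a float4 array on the device. The kernel name ScaleVec4, the array size, and the fill values are illustrative assumptions for this example only; they are not part of the HIP headers, and error checking is omitted for brevity.

::

    #include <hip/hip_runtime.h>
    #include <vector>

    // Illustrative kernel: scale each float4 element component-wise.
    __global__ void ScaleVec4(const float4 *in, float4 *out, float s, unsigned N)
    {
        unsigned gid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
        if (gid < N) {
            float4 v = in[gid];
            // Components are read and written through the x, y, z and w fields:
            out[gid] = make_float4(v.x * s, v.y * s, v.z * s, v.w * s);
        }
    }

    int main()
    {
        const unsigned N = 256;
        // make_float4 also constructs short vector values on the host:
        std::vector<float4> hIn(N, make_float4(1.0f, 2.0f, 3.0f, 4.0f));

        float4 *dIn, *dOut;
        hipMalloc(&dIn,  N * sizeof(float4));
        hipMalloc(&dOut, N * sizeof(float4));
        hipMemcpy(dIn, hIn.data(), N * sizeof(float4), hipMemcpyHostToDevice);

        // One block of N threads is enough for this small example.
        hipLaunchKernelGGL(ScaleVec4, dim3(1), dim3(N), 0, 0, dIn, dOut, 2.0f, N);
        hipDeviceSynchronize();

        hipFree(dIn);
        hipFree(dOut);
        return 0;
    }

The same pattern applies to the other short vector types listed below.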
HIP supports the following short vector formats: * Signed Integers: - + * char1, char2, char3, char4 * short1, short2, short3, short4 * int1, int2, int3, int4 @@ -274,7 +274,7 @@ HIP supports the following short vector formats: * longlong1, longlong2, longlong3, longlong4 * Unsigned Integers: - + * uchar1, uchar2, uchar3, uchar4 * ushort1, ushort2, ushort3, ushort4 * uint1, uint2, uint3, uint4 @@ -282,7 +282,7 @@ HIP supports the following short vector formats: * ulonglong1, ulonglong2, ulonglong3, ulonglong4 * Floating Points - + * float1, float2, float3, float4 * double1, double2, double3, double4 @@ -295,13 +295,13 @@ dim3 is a three-dimensional integer vector type commonly used to specify grid an :: typedef struct dim3 { - uint32_t x; - uint32_t y; - uint32_t z; - - dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; - }; - + uint32_t x; + uint32_t y; + uint32_t z; + + dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; + }; + .. _Memory-Fence-Instructions: Memory-Fence Instructions @@ -337,351 +337,351 @@ Following is the list of supported single precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| float acosf ( float x ) | ✓ | ✓ | +| float acosf ( float x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float acoshf ( float x ) | ✓ | ✓ | +| float acoshf ( float x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinf ( float x ) | ✓ | ✓ | +| float asinf ( float x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinhf ( float x ) | ✓ | ✓ | +| float asinhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atan2f ( float y, float x ) | ✓ | ✓ | +| float atan2f ( float y, float x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanf ( float x ) | ✓ | ✓ | +| float atanf ( float x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanhf ( float x ) | ✓ | ✓ | +| float atanhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cbrtf ( float x ) | ✓ | ✓ | +| float cbrtf ( float x ) | ? | ? | | | | | | Calculate the cube root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ceilf ( float x ) | ✓ | ✓ | +| float ceilf ( float x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float copysignf ( float x, float y ) | ✓ | ✓ | +| float copysignf ( float x, float y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cosf ( float x ) | ✓ | ✓ | +| float cosf ( float x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float coshf ( float x ) | ✓ | ✓ | +| float coshf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcf ( float x ) | ✓ | ✓ | +| float erfcf ( float x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erff ( float x ) | ✓ | ✓ | +| float erff ( float x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp10f ( float x ) | ✓ | ✓ | +| float exp10f ( float x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp2f ( float x ) | ✓ | ✓ | +| float exp2f ( float x ) | ? | ? | | | | | | Calculate the base 2 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expf ( float x ) | ✓ | ✓ | +| float expf ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expm1f ( float x ) | ✓ | ✓ | +| float expm1f ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fabsf ( float x ) | ✓ | ✓ | +| float fabsf ( float x ) | ? | ? | | | | | | Calculate the absolute value of its argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdimf ( float x, float y ) | ✓ | ✓ | +| float fdimf ( float x, float y ) | ? | ? | | | | | | Compute the positive difference between x and y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float floorf ( float x ) | ✓ | ✓ | +| float floorf ( float x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaf ( float x, float y, float z ) | ✓ | ✓ | +| float fmaf ( float x, float y, float z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaxf ( float x, float y ) | ✓ | ✓ | +| float fmaxf ( float x, float y ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fminf ( float x, float y ) | ✓ | ✓ | +| float fminf ( float x, float y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmodf ( float x, float y ) | ✓ | ✓ | +| float fmodf ( float x, float y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int* nptr ) | ✓ | ✗ | +| float frexpf ( float x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float hypotf ( float x, float y ) | ✓ | ✓ | +| float hypotf ( float x, float y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogbf ( float x ) | ✓ | ✓ | +| int ilogbf ( float x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( float a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( float a ) | ? | ? | | | | | | Determine whether argument is infinite. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( float a ) | ? | ? | | | | | | Determine whether argument is a NaN. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ldexpf ( float x, int exp ) | ✓ | ✓ | +| float ldexpf ( float x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log10f ( float x ) | ✓ | ✓ | +| float log10f ( float x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log1pf ( float x ) | ✓ | ✓ | +| float log1pf ( float x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logbf ( float x ) | ✓ | ✓ | +| float logbf ( float x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log2f ( float x ) | ✓ | ✓ | +| float log2f ( float x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logf ( float x ) | ✓ | ✓ | +| float logf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float* iptr ) | ✓ | ✗ | +| float modff ( float x, float* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nanf ( const char* tagp ) | ✗ | ✓ | +| float nanf ( const char* tagp ) | ? | ? | | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nearbyintf ( float x ) | ✓ | ✓ | +| float nearbyintf ( float x ) | ? | ? | | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float powf ( float x, float y ) | ✓ | ✓ | +| float powf ( float x, float y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remainderf ( float x, float y ) | ✓ | ✓ | +| float remainderf ( float x, float y ) | ? | ? | | | | | | Compute single-precision floating-point remainder. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int* quo ) | ✓ | ✗ | +| float remquof ( float x, float y, int* quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float roundf ( float x ) | ✓ | ✓ | +| float roundf ( float x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalbnf ( float x, int n ) | ✓ | ✓ | +| float scalbnf ( float x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( float a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float* sptr, float* cptr ) | ✓ | ✗ | +| void sincosf ( float x, float* sptr, float* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinf ( float x ) | ✓ | ✓ | +| float sinf ( float x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinhf ( float x ) | ✓ | ✓ | +| float sinhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sqrtf ( float x ) | ✓ | ✓ | +| float sqrtf ( float x ) | ? | ? | | | | | | Calculate the square root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanf ( float x ) | ✓ | ✓ | +| float tanf ( float x ) | ? | ? | | | | | | Calculate the tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanhf ( float x ) | ✓ | ✓ | +| float tanhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float truncf ( float x ) | ✓ | ✓ | +| float truncf ( float x ) | ? | ? 
| | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tgammaf ( float x ) | ✓ | ✓ | +| float tgammaf ( float x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcinvf ( float y ) | ✓ | ✓ | +| float erfcinvf ( float y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcxf ( float x ) | ✓ | ✓ | +| float erfcxf ( float x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfinvf ( float y ) | ✓ | ✓ | +| float erfinvf ( float y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdividef ( float x, float y ) | ✓ | ✓ | +| float fdividef ( float x, float y ) | ? | ? | | | | | | Divide two floating point values. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int *nptr ) | ✓ | ✓ | +| float frexpf ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j0f ( float x ) | ✓ | ✓ | +| float j0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j1f ( float x ) | ✓ | ✓ | +| float j1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float jnf ( int n, float x ) | ✓ | ✓ | +| float jnf ( int n, float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float lgammaf ( float x ) | ✓ | ✓ | +| float lgammaf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrintf ( float x ) | ✓ | ✓ | +| long long int llrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llroundf ( float x ) | ✓ | ✓ | +| long long int llroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrintf ( float x ) | ✓ | ✓ | +| long int lrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lroundf ( float x ) | ✓ | ✓ | +| long int lroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float *iptr ) | ✓ | ✓ | +| float modff ( float x, float *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nextafterf ( float x, float y ) | ✓ | ✓ | +| float nextafterf ( float x, float y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm3df ( float a, float b, float c ) | ✓ | ✓ | +| float norm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float norm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdff ( float y ) | ✓ | ✓ | +| float normcdff ( float y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdfinvf ( float y ) | ✓ | ✓ | +| float normcdfinvf ( float y ) | ? | ? | | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normf ( int dim, const float *a ) | ✓ | ✓ | +| float normf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rcbrtf ( float x ) | ✓ | ✓ | +| float rcbrtf ( float x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int *quo ) | ✓ | ✓ | +| float remquof ( float x, float y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rhypotf ( float x, float y ) | ✓ | ✓ | +| float rhypotf ( float x, float y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rintf ( float x ) | ✓ | ✓ | +| float rintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm3df ( float a, float b, float c ) | ✓ | ✓ | +| float rnorm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float rnorm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnormf ( int dim, const float *a ) | ✓ | ✓ | +| float rnormf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalblnf ( float x, long int n ) | ✓ | ✓ | +| float scalblnf ( float x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincosf ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospif ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincospif ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y0f ( float x ) | ✓ | ✓ | +| float y0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y1f ( float x ) | ✓ | ✓ | +| float y1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ynf ( int n, float x ) | ✓ | ✓ | +| float ynf ( int n, float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ @@ -699,348 +699,348 @@ Following is the list of supported double precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| double acos ( double x ) | ✓ | ✓ | +| double acos ( double x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double acosh ( double x ) | ✓ | ✓ | +| double acosh ( double x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asin ( double x ) | ✓ | ✓ | +| double asin ( double x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asinh ( double x ) | ✓ | ✓ | +| double asinh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan ( double x ) | ✓ | ✓ | +| double atan ( double x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan2 ( double y, double x ) | ✓ | ✓ | +| double atan2 ( double y, double x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atanh ( double x ) | ✓ | ✓ | +| double atanh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cbrt ( double x ) | ✓ | ✓ | +| double cbrt ( double x ) | ? | ? | | | | | | Calculate the cube root of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ceil ( double x ) | ✓ | ✓ | +| double ceil ( double x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double copysign ( double x, double y ) | ✓ | ✓ | +| double copysign ( double x, double y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cos ( double x ) | ✓ | ✓ | +| double cos ( double x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cosh ( double x ) | ✓ | ✓ | +| double cosh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erf ( double x ) | ✓ | ✓ | +| double erf ( double x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfc ( double x ) | ✓ | ✓ | +| double erfc ( double x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp ( double x ) | ✓ | ✓ | +| double exp ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp10 ( double x ) | ✓ | ✓ | +| double exp10 ( double x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp2 ( double x ) | ✓ | ✓ | +| double exp2 ( double x ) | ? | ? | | | | | | Calculate the base 2 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double expm1 ( double x ) | ✓ | ✓ | +| double expm1 ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fabs ( double x ) | ✓ | ✓ | +| double fabs ( double x ) | ? | ? | | | | | | Calculate the absolute value of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fdim ( double x, double y ) | ✓ | ✓ | +| double fdim ( double x, double y ) | ? | ? | | | | | | Compute the positive difference between x and y. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double floor ( double x ) | ✓ | ✓ | +| double floor ( double x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fma ( double x, double y, double z ) | ✓ | ✓ | +| double fma ( double x, double y, double z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmax ( double , double ) | ✓ | ✓ | +| double fmax ( double , double ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmin ( double x, double y ) | ✓ | ✓ | +| double fmin ( double x, double y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmod ( double x, double y ) | ✓ | ✓ | +| double fmod ( double x, double y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( double x, int* nptr ) | ✓ | ✗ | +| double frexp ( double x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double hypot ( double x, double y ) | ✓ | ✓ | +| double hypot ( double x, double y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogb ( double x ) | ✓ | ✓ | +| int ilogb ( double x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( double a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( double a ) | ? | ? | | | | | | Determine whether argument is infinite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( double a ) | ? | ? | | | | | | Determine whether argument is a NaN. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ldexp ( double x, int exp ) | ✓ | ✓ | +| double ldexp ( double x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log ( double x ) | ✓ | ✓ | +| double log ( double x ) | ? | ? | | | | | | Calculate the base e logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log10 ( double x ) | ✓ | ✓ | +| double log10 ( double x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log1p ( double x ) | ✓ | ✓ | +| double log1p ( double x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log2 ( double x ) | ✓ | ✓ | +| double log2 ( double x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double logb ( double x ) | ✓ | ✓ | +| double logb ( double x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double* iptr ) | ✓ | ✗ | +| double modf ( double x, double* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nan ( const char* tagp ) | ✗ | ✓ | +| double nan ( const char* tagp ) | ? | ? | | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nearbyint ( double x ) | ✓ | ✓ | +| double nearbyint ( double x ) | ? | ? | | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double pow ( double x, double y ) | ✓ | ✓ | +| double pow ( double x, double y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remainder ( double x, double y ) | ✓ | ✓ | +| double remainder ( double x, double y ) | ? | ? | | | | | | Compute double-precision floating-point remainder. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int* quo ) | ✓ | ✗ | +| double remquo ( double x, double y, int* quo ) | ? | ? | | | | | | Compute double-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double round ( double x ) | ✓ | ✓ | +| double round ( double x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbn ( double x, int n ) | ✓ | ✓ | +| double scalbn ( double x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( double a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sin ( double x ) | ✓ | ✓ | +| double sin ( double x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double* sptr, double* cptr ) | ✓ | ✗ | +| void sincos ( double x, double* sptr, double* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sinh ( double x ) | ✓ | ✓ | +| double sinh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sqrt ( double x ) | ✓ | ✓ | +| double sqrt ( double x ) | ? | ? | | | | | | Calculate the square root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tan ( double x ) | ✓ | ✓ | +| double tan ( double x ) | ? | ? | | | | | | Calculate the tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tanh ( double x ) | ✓ | ✓ | +| double tanh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tgamma ( double x ) | ✓ | ✓ | +| double tgamma ( double x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double trunc ( double x ) | ✓ | ✓ | +| double trunc ( double x ) | ? | ? 
| | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcinv ( double y ) | ✓ | ✓ | +| double erfcinv ( double y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcx ( double x ) | ✓ | ✓ | +| double erfcx ( double x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfinv ( double y ) | ✓ | ✓ | +| double erfinv ( double y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( float x, int *nptr ) | ✓ | ✓ | +| double frexp ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j0 ( double x ) | ✓ | ✓ | +| double j0 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j1 ( double x ) | ✓ | ✓ | +| double j1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double jn ( int n, double x ) | ✓ | ✓ | +| double jn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double lgamma ( double x ) | ✓ | ✓ | +| double lgamma ( double x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrint ( double x ) | ✓ | ✓ | +| long long int llrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llround ( double x ) | ✓ | ✓ | +| long long int llround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrint ( double x ) | ✓ | ✓ | +| long int lrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lround ( double x ) | ✓ | ✓ | +| long int lround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double *iptr ) | ✓ | ✓ | +| double modf ( double x, double *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nextafter ( double x, double y ) | ✓ | ✓ | +| double nextafter ( double x, double y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double norm3d ( double a, double b, double c ) | ✓ | ✓ | +| double norm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| float norm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdf ( double y ) | ✓ | ✓ | +| double normcdf ( double y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdfinv ( double y ) | ✓ | ✓ | +| double normcdfinv ( double y ) | ? | ? | | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rcbrt ( double x ) | ✓ | ✓ | +| double rcbrt ( double x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int *quo ) | ✓ | ✓ | +| double remquo ( double x, double y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rhypot ( double x, double y ) | ✓ | ✓ | +| double rhypot ( double x, double y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rint ( double x ) | ✓ | ✓ | +| double rint ( double x ) | ? | ? 
| | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm3d ( double a, double b, double c ) | ✓ | ✓ | +| double rnorm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| double rnorm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm ( int dim, const double *a ) | ✓ | ✓ | +| double rnorm ( int dim, const double *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbln ( double x, long int n ) | ✓ | ✓ | +| double scalbln ( double x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincos ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospi ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincospi ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y0f ( double x ) | ✓ | ✓ | +| double y0f ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y1 ( double x ) | ✓ | ✓ | +| double y1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double yn ( int n, double x ) | ✓ | ✓ | +| double yn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. 
+[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. .. _Integer-Intrinsics: @@ -1135,23 +1135,23 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | float __frsqrt_rn ( float x ) | | | -| Compute 1/√x in round-to-nearest-even mode. | +| Compute 1/√x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rd ( float x ) | | | -| Compute √x in round-down mode. | +| Compute √x in round-down mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rn ( float x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute √x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_ru ( float x ) | | | -| Compute √x in round-up mode. | +| Compute √x in round-up mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rz ( float x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute √x in round-towards-zero mode. | +----------------------------------------------------------------------------+ | float __log10f ( float x ) | | | @@ -1179,19 +1179,19 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | double __dsqrt_rd ( double x ) | | | -| Compute √x in round-down mode. | +| Compute √x in round-down mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rn ( double x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute √x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | double __dsqrt_ru ( double x ) | | | -| Compute √x in round-up mode. | +| Compute √x in round-up mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rz ( double x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute √x in round-towards-zero mode. | +----------------------------------------------------------------------------+ .. _Texture-Functions: @@ -1206,7 +1206,7 @@ Texture functions are not supported. Surface Functions ------------------ Surface functions are not supported. - + .. _Timer-Functions: Timer Functions @@ -1217,7 +1217,7 @@ HIP provides the following built-in functions for reading a high-resolution time clock_t clock() long long int clock64() - + Returns the value of counter that is incremented every clock cycle on device. Difference in values returned provides the cycles used. @@ -1232,65 +1232,65 @@ HIP supports the following atomic operations. +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ | Function | Supported in HIP | Supported in CUDA | +=============================================================================================================================+==================+===================+ -| int atomicAdd(int* address, int val) | ✓ | ✓ | +| int atomicAdd(int* address, int val) | ✓ | ✓
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicAdd(float* address, float val) | ✓ | ✓ | +| float atomicAdd(float* address, float val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicSub(int* address, int val) | ✓ | ✓ | +| int atomicSub(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicExch(int* address, int val) | ✓ | ✓ | +| int atomicExch(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicExch(float* address, float val) | ✓ | ✓ | +| float atomicExch(float* address, float val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMin(int* address, int val) | ✓ | ✓ | +| int atomicMin(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMax(int* address, int val) | ✓ | ✓ | +| int atomicMax(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicAnd(int* address, int val) | ✓ | ✓ | +| int atomicAnd(int* address, int val) | ✓ | ✓
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicOr(int* address, int val) | ✓ | ✓ | +| int atomicOr(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicXor(int* address, int val) | ✓ | ✓ | +| int atomicXor(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ | +| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ .. _Caveats-and-Features-Under-Development: @@ -1309,20 +1309,20 @@ Warp Cross-Lane Functions Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory. -Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn’t assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources.
In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: +Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn't assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: cudaDeviceProp props; cudaGetDeviceProperties(&props, deviceID); - int w = props.warpSize; + int w = props.warpSize; // implement portable algorithm based on w (rather than assume 32 or 64) - + .. _Warp-Vote-and-Ballot-Functions: Warp Vote and Ballot Functions ++++++++++++++++++++++++++++++++ :: - + int __all(int predicate) int __any(int predicate) uint64_t __ballot(int predicate) @@ -1334,7 +1334,7 @@ Threads in a warp are referred to as lanes and are numbered from 0 to warpSize - * __all() returns 1 if all other warp lanes contribute nonzero predicates, or 0 otherwise Applications can test whether the target platform supports the any/all instruction using the hasWarpVote device property or the HIP_ARCH_HAS_WARP_VOTE compiler define. -``__ballot`` provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda’s 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. +``__ballot`` provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda's 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. .. _Warp-Shuffle-Functions: @@ -1343,14 +1343,14 @@ Warp Shuffle Functions ++++++++++++++++++++++++ Half-float shuffles are not supported. The default width is warpSize---see :ref:`Warp Cross-Lane Functions`. Applications should not assume the warpSize is 32 or 64. :: - + int __shfl (int var, int srcLane, int width=warpSize); float __shfl (float var, int srcLane, int width=warpSize); int __shfl_up (int var, unsigned int delta, int width=warpSize); float __shfl_up (float var, unsigned int delta, int width=warpSize); int __shfl_down (int var, unsigned int delta, int width=warpSize); float __shfl_down (float var, unsigned int delta, int width=warpSize) ; - int __shfl_xor (int var, int laneMask, int width=warpSize) + int __shfl_xor (int var, int laneMask, int width=warpSize) float __shfl_xor (float var, int laneMask, int width=warpSize); .. _Cooperative Groups Functions: @@ -1365,88 +1365,88 @@ HIP does not support any of the kernel language cooperative groups types or functions. 
+--------------------------------------------------------+------------------------+----------------------------+ -| Function | Supported in HIP | Supported in CUDA | +| Function | Supported in HIP | Supported in CUDA | +--------------------------------------------------------+------------------------+----------------------------+ -|void thread_group.sync() | | y | +|void thread_group.sync() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ -|unsigned thread_group.size() | | y | +|unsigned thread_group.size() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned thread_group.thread_rank() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ -|bool thread_group.is_valid() | | y | +|bool thread_group.is_valid() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |thread_group tiled_partiti0on(thread_group, size) | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |thread_block_tile tiled_partition(thread_group) | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |thread_block this_thread_block() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl_down() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl_up() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl_xor() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.any() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.all() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.ballot() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.match_any() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.match_all() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |coalesced_group coalesced_threads() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |grid_group this_grid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |void grid_group.sync() | | y | -| | | | +| | | | 
+--------------------------------------------------------+------------------------+----------------------------+ |unsigned grid_group.size() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned grid_group.thread_rank() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |bool grid_group.is_valid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |multi_grid_group this_multi_grid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |void multi_grid_group.sync() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned multi_grid_group.size() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned multi_grid_group.thread_rank() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |bool multi_grid_group.is_valid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ @@ -1462,21 +1462,21 @@ HIP does not support any of the kernel language warp matrix types or functions. +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -| Function | Supported in HIP | Supported in CUD | +| Function | Supported in HIP | Supported in CUDA | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda) | | ✓ | +|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout) | | ✓ | +|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout) | | ✓ | +|void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void fill_fragment(fragment<...> &a, const T &value) | | ✓ | +|void fill_fragment(fragment<...> &a, const T &value) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, | | ✓ | +|void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, | | ✓
| |const fragment<...> &c , bool sat) | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ @@ -1526,14 +1526,14 @@ GPU multiprocessors have a fixed pool of resources (primarily registers and shar **hip_launch_bounds** allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. **hip_launch_bounds** is a function attribute that must be attached to a **global** function:: __global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ... - MyKernel(hipGridLaunch lp, ...) + MyKernel(hipGridLaunch lp, ...) ... **launch_bounds** supports two parameters: -* MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than - MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, - MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying +* MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than + MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, + MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time. The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z). * MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp(primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage. @@ -1563,13 +1563,13 @@ Porting from CUDA __launch_bounds CUDA defines a __launch_bounds which is also designed to control occupancy:: __launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR) - + * The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors ( This conversion is performed automatically by the clang hipify tools.) :: - + MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK)/32 - + The key differences in the interface are: @@ -1598,15 +1598,15 @@ Unroll with a bounds that is known at compile-time is supported. For example:: #pragma unroll 16 /* hint to compiler to unroll next loop by 16 */ for (int i=0; i<16; i++) ... - - + + #pragma unroll 1 /* tell compiler to never unroll the loop */ for (int i=0; i<16; i++) ... - + #pragma unroll /* hint to compiler to completely unroll next loop. */ for (int i=0; i<16; i++) ... - + .. _In-Line-Assembly: @@ -1635,12 +1635,12 @@ Kernel Compilation hipcc now supports compiling C++/HIP kernels to binary code objects. The user can specify the target for which the binary can be generated. HIP/HCC does not yet support fat binaries so only a single target may be specified. The file format for binary is ``.co`` which means Code Object. 
The following command builds the code object using **hipcc**. :: - hipcc --genco --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] - + hipcc --genco --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] + [INPUT FILE] = Name of the file containing kernels - [OUTPUT FILE] = Name of the generated code object file``` + [OUTPUT FILE] = Name of the generated code object file``` Note that one important fact to remember when using binary code objects is that the number of arguments to the kernel are different on HCC and NVCC path. Refer to the sample in samples/0_Intro/module_api for differences in the arguments to be passed to the kernel. - + diff --git a/Programming_Guides/LanguageInto.rst b/Programming_Guides/LanguageInto.rst index a3020cb2..2dec61d0 100644 --- a/Programming_Guides/LanguageInto.rst +++ b/Programming_Guides/LanguageInto.rst @@ -14,7 +14,7 @@ problem at hand. Here, we describe some of the options and how to choose among t HCC: Heterogeneous Compute Compiler #################################### -What is the Heterogeneous Compute (HC) API? It’s a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. We’ll be following these developments closely, and you’ll see HC move quickly to include standard C++ capabilities. +What is the Heterogeneous Compute (HC) API? It's a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. We'll be following these developments closely, and you'll see HC move quickly to include standard C++ capabilities. The Heterogeneous Compute Compiler (HCC) provides two important benefits: @@ -29,7 +29,7 @@ The Heterogeneous Compute Compiler (HCC) provides two important benefits: **Full control over the machine** - * Access AMD scratchpad memories (“LDS”) + * Access AMD scratchpad memories ("LDS") * Fully control data movement, prefetch and discard * Fully control asynchronous kernel launch and completion * Get device-side dependency resolution for kernel and data commands (without host involvement) @@ -44,7 +44,7 @@ performance or control of the machine. HIP: Heterogeneous-Computing Interface for Portability ######################################################### -What is Heterogeneous-Computing Interface for Portability (HIP)? It’s a C++ dialect designed to ease conversion of Cuda applications to portable C++ code. It provides a C-style API and a C++ kernel language. The C++ interface can use templates and classes across the +What is Heterogeneous-Computing Interface for Portability (HIP)? It's a C++ dialect designed to ease conversion of Cuda applications to portable C++ code. It provides a C-style API and a C++ kernel language. The C++ interface can use templates and classes across the host/kernel boundary. 
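To give a feel for the dialect, here is a minimal sketch of a HIP program (the kernel, sizes and omitted error checking are illustrative only, not taken from a specific sample)::

    #include <hip/hip_runtime.h>
    #include <vector>

    __global__ void vector_add(const float* a, const float* b, float* c, int n)
    {
        int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if (i < n) c[i] = a[i] + b[i];
    }

    int main()
    {
        const int n = 1 << 20;
        std::vector<float> ha(n, 1.0f), hb(n, 2.0f), hc(n);
        float *da, *db, *dc;
        hipMalloc(&da, n * sizeof(float));
        hipMalloc(&db, n * sizeof(float));
        hipMalloc(&dc, n * sizeof(float));
        hipMemcpy(da, ha.data(), n * sizeof(float), hipMemcpyHostToDevice);
        hipMemcpy(db, hb.data(), n * sizeof(float), hipMemcpyHostToDevice);
        // C-style launch: grid dim, block dim, dynamic shared memory, stream, kernel arguments
        hipLaunchKernelGGL(vector_add, dim3((n + 255) / 256), dim3(256), 0, 0, da, db, dc, n);
        hipMemcpy(hc.data(), dc, n * sizeof(float), hipMemcpyDeviceToHost);
        hipFree(da); hipFree(db); hipFree(dc);
        return 0;
    }

The same source compiles with hipcc for either an AMD or an Nvidia target, which is the portability property described above.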
The Hipify tool automates much of the conversion work by performing a source-to-source transformation from Cuda to HIP. HIP code can run on AMD hardware (through the HCC compiler) or Nvidia hardware (through the NVCC compiler) with no performance loss compared with the original Cuda code. @@ -55,9 +55,9 @@ Programmers familiar with other GPGPU languages will find HIP very easy to learn Use HIP when converting Cuda applications to portable C++ and for new projects that require portability between AMD and Nvidia. HIP provides a C++ development language and access to the best development tools on both platforms. -**OpenCL™: Open Compute Language** +**OpenCL(TM): Open Compute Language** -What is OpenCL? It’s a framework for developing programs that can execute across a wide variety of heterogeneous platforms. AMD, Intel +What is OpenCL? It's a framework for developing programs that can execute across a wide variety of heterogeneous platforms. AMD, Intel and Nvidia GPUs support version 1.2 of the specification, as do x86 CPUs and other devices (including FPGAs and DSPs). OpenCL provides a C run-time API and C99-based kernel language. **When to Use OpenCL** @@ -67,7 +67,7 @@ Windows, Linux and Mac OS, as well as a wide variety of hardware platforms (desc **Anaconda Python With Numba** -What is Anaconda? It’s a modern open-source analytics platform powered by Python. Continuum Analytics, a ROCm platform partner, is the driving force behind it. Anaconda delivers high-performance capabilities including acceleration of HSA APUs, as well as +What is Anaconda? It's a modern open-source analytics platform powered by Python. Continuum Analytics, a ROCm platform partner, is the driving force behind it. Anaconda delivers high-performance capabilities including acceleration of HSA APUs, as well as ROCm-enabled discrete GPUs via Numba. It gives superpowers to the people who are changing the world. **Numba** @@ -81,7 +81,7 @@ Numba works by generating optimized machine code using the LLVM compiler infrast **When to Use Anaconda** -Use Anaconda when you’re handling large-scale data-analytics, +Use Anaconda when you're handling large-scale data-analytics, scientific and engineering problems that require you to manipulate large data arrays. diff --git a/Programming_Guides/Opencl-optimization.rst b/Programming_Guides/Opencl-optimization.rst index 1fa69e53..df03bf98 100644 --- a/Programming_Guides/Opencl-optimization.rst +++ b/Programming_Guides/Opencl-optimization.rst @@ -6,7 +6,7 @@ OPENCL Optimization ======================== -.. Note:: Re-Write in Progress to move this to Vega and FIJI/Polaris optimization guide +.. Note:: Re-Write in Progress to move this to Vega and FIJI/Polaris optimization guide Chapter 1 OpenCL Performance and Optimization ============================================== @@ -17,7 +17,7 @@ This chapter discusses performance and optimization when programming for AMD het -------------- AMD's CodeXL is an OpenCL kernel debugging and memory and performance analysis tool that gathers data from the OpenCL run-time and OpenCL devices during the execution of an OpenCL application. This information is used to discover bottlenecks in the application and find ways to optimize the application's performance for AMD platforms. -CodeXL 1.7, the latest version as of this writing, is available as an extension to Microsoft® Visual Studio®, a stand-alone version for Windows, and a stand-alone version for Linux. 
+CodeXL 1.7, the latest version as of this writing, is available as an extension to Microsoft(R) Visual Studio(R), a stand-alone version for Windows, and a stand-alone version for Linux. For a high-level summary of CodeXL features, see Chapter 4 in the AMD OpenCL User Guide. For information about how to use CodeXL to gather performance data about your OpenCL application, such as application traces and timeline views, see the `CodeXL home page `_. @@ -29,7 +29,7 @@ The Timeline View can be useful for debugging your OpenCL application. Examples For example, the timeline should show that non-dependent kernel executions and data transfer operations occurred simultaneously. -CodeXL also provides information about GPU kernel performance counters. This information can be used to find possible bottlenecks in the kernel execution. You can find the list of performance counters supported by AMD Radeon™ GPUs in the CodeXL documentation. Once the trace data has been used to discover which kernel is most in need of optimization, you can collect the GPU performance counters to drill down into the kernel execution on a GPU device. +CodeXL also provides information about GPU kernel performance counters. This information can be used to find possible bottlenecks in the kernel execution. You can find the list of performance counters supported by AMD Radeon(TM) GPUs in the CodeXL documentation. Once the trace data has been used to discover which kernel is most in need of optimization, you can collect the GPU performance counters to drill down into the kernel execution on a GPU device. The Analyze Mode in CodeXL provides the Statistics View, which can be used to gather useful statistics regarding the GPU usage of kernels. @@ -66,21 +66,21 @@ The sample code below shows how to compute the kernel execution time (End- Start The CodeXL GPU Profiler also can record the execution time for a kernel automatically. The Kernel Time metric reported in the Profiler output uses the built-in OpenCL timing capability and reports the same result as the ``kernelExecTimeNs`` calculation shown above. -Another interesting metric to track is the kernel launch time (Start - Queue). The kernel launch time includes both the time spent in the user application (after enqueuing the command, but before it is submitted to the device), as well as the time spent in the runtime to launch the kernel. For CPU devices, the kernel launch time is fast (tens of 1's), but for discrete GPU devices it can be several hundred μs. Enabling profiling on a command queue adds approximately 10 μs to 40 μs overhead to all clEnqueue calls. Much of the profiling overhead affects the start time; thus, it is visible in the launch time. Be careful when interpreting this metric. To reduce the launch overhead, the AMD OpenCL runtime combines several command submissions into a batch. Commands submitted as batch report similar start times and the same end time. +Another interesting metric to track is the kernel launch time (Start - Queue). The kernel launch time includes both the time spent in the user application (after enqueuing the command, but before it is submitted to the device), as well as the time spent in the runtime to launch the kernel. For CPU devices, the kernel launch time is fast (tens of μs), but for discrete GPU devices it can be several hundred μs. Enabling profiling on a command queue adds approximately 10 μs to 40 μs overhead to all clEnqueue calls.
Much of the profiling overhead affects the start time; thus, it is visible in the launch time. Be careful when interpreting this metric. To reduce the launch overhead, the AMD OpenCL runtime combines several command submissions into a batch. Commands submitted as batch report similar start times and the same end time. Measure performance of your test with CPU counters. Do not use OCL profiling. To determine if an application is executed asynchonically, build a dependent execution with OCL events. This is a "generic" solution; however, there is an exception when you can enable profiling and have overlap transfers. DRMDMA engines do not support timestamps ("GPU counters"). To get OCL profiling data, the runtime must synchronize the main command processor (CP) with the DMA engine; this disables overlap. Note, however, that Southern Islands has two independent main CPs and runtime pairs them with DMA engines. So, the application can still execute kernels on one CP, while another is synced with a DRM engine for profiling; this lets you profile it with APP or OCL profiling. 1.2.2 Using the OpenCL timer with Other System Timers ++++++++++++++++++++++++++++++++++++++++++++++++++++++ The resolution of the timer, given in ns, can be obtained from:: - + clGetDeviceInfo(...,CL_DEVICE_PROFILING_TIMER_RESOLUTION...); AMD CPUs and GPUs report a timer resolution of 1 ns. AMD OpenCL devices are required to correctly track time across changes in frequency and power states. Also, the AMD APP SDK uses the same time-domain for all devices in the platform; thus, the profiling timestamps can be directly compared across the CPU and GPU devices. The sample code below can be used to read the current value of the OpenCL timer clock. The clock is the same routine used by the AMD OpenCL runtime to generate the profiling timestamps. This function is useful for correlating other program events with the OpenCL profiling timestamps. :: - + uint64_t timeNanos() { #ifdef linux @@ -101,7 +101,7 @@ For more information, see section 5.9, "Profiling Operations on Memory Objects a 1.2.3 Estimating Memory Bandwidth ++++++++++++++++++++++++++++++++++ The memory bandwidth required by a kernel is perhaps the most important performance consideration. To calculate this: - + Effective Bandwidth = (Br + Bw)/T where: @@ -119,7 +119,7 @@ Bw = 1 x (1024 x 1024 x 4 bytes) = 4194304 bytes ;; 1 array, 1024x1024, each ele If the elapsed time for this copy as reported by the profiling timers is 1000000 ns (1 million ns, or .001 sec), the effective bandwidth is: (Br+Bw)/T = (8388608+4194304)/1000000 = 12.6GB/s - + The CodeXL GPU Profiler can report the number of dynamic instructions per thread that access global memory through the FetchInsts and WriteInsts counters. The Fetch and Write reports average the per-thread counts; these can be fractions if the threads diverge. The Profiler also reports the dimensions of the global NDRange for the kernel in the GlobalWorkSize field. The total number of threads can be determined by multiplying together the three components of the range. If all (or most) global accesses are the same size, the counts from the Profiler and the approximate size can be used to estimate Br and Bw: Br = Fetch * GlobalWorkitems * Size @@ -156,7 +156,7 @@ OpenCL uses memory objects to pass data to kernels. 
These can be either buffers * how to control which memory kind is used for a memory object; * how the runtime maps memory objects for host access; - + * how the runtime performs memory object reading, writing and copying; * how best to use command queues; and @@ -167,13 +167,13 @@ OpenCL uses memory objects to pass data to kernels. These can be either buffers +++++++++++++++++++++++++++++++++++++++++ Memory is used to store memory objects that are accessed by kernels executing on the device, as well as to hold memory object data when they are mapped for access by the host application. This section describes the different memory kinds used by the runtime. Table 1.1 lists the performance of each memory type given -a PCIe3-capable platform and a high-end AMD Radeon™ 7XXX discrete GPU. In Table 1.1, when host memory is accessed by the GPU shader, it is of type ``CL_MEM_ALLOC_HOST_PTR``. When GPU memory is accessed by the CPU, it is of type ``CL_MEM_PERSISTENT_MEM_AMD``. +a PCIe3-capable platform and a high-end AMD Radeon(TM) 7XXX discrete GPU. In Table 1.1, when host memory is accessed by the GPU shader, it is of type ``CL_MEM_ALLOC_HOST_PTR``. When GPU memory is accessed by the CPU, it is of type ``CL_MEM_PERSISTENT_MEM_AMD``. **Table 1.1 Memory Bandwidth in GB/s (R = read, W = write) in GB/s** **Table 2:** - + +-------------+---------+---------+--------------+--------------+-----------+-------------+ | | CPU R | GPU W | GPU Shader R | GPU Shader W | GPU DMA R | GPU DMA W | +=============+=========+=========+==============+==============+===========+=============+ @@ -204,7 +204,7 @@ If the runtime knows the data is in pinned host memory, it can be transferred to Currently, the runtime recognizes only data that is in pinned host memory for operation arguments that are memory objects it has allocated in pinned host memory. For example, the buffer argument of ``clEnqueueReadBuffer/clEnqueueWriteBuffer`` and ``image`` argument of ``clEnqueueReadImage/clEnqueueWriteImage.`` It does not detect that the ptr arguments of these operations addresses pinned host memory, even if they are the result of ``clEnqueueMapBuffer/clEnqueueMapImage`` on a memory object that is in pinned host memory. -The runtime can make pinned host memory directly accessible from the GPU. Like regular host memory, the CPU uses caching when accessing pinned host memory. For discrete devices, the GPU access to this memory is through the PCIe bus, which also limits bandwidth. For APU devices that do not have the PCIe overhead, GPU access is significantly slower than accessing device-visible host memory (see section 1.3.1.3), which does not use the cache coherency protocol. +The runtime can make pinned host memory directly accessible from the GPU. Like regular host memory, the CPU uses caching when accessing pinned host memory. For discrete devices, the GPU access to this memory is through the PCIe bus, which also limits bandwidth. For APU devices that do not have the PCIe overhead, GPU access is significantly slower than accessing device-visible host memory (see section 1.3.1.3), which does not use the cache coherency protocol. 1.3.1.3 Device-Visible Host Memory ################################### @@ -307,7 +307,7 @@ The host application can use ``clEnqueueMapBuffer/clEnqueueMapImage`` to obtain 1.3.4.1 Zero Copy Memory Objects ################################# ``CL_MEM_USE_PERSISTENT_MEM_AMD``, ``CL_MEM_USE_HOST_PTR,`` and ``CL_MEM_ALLOC_HOST_PTR`` support zero copy memory objects. 
The first provides device-resident zero copy memory objects, the other two provide host-resident zero copy memory objects. - + Zero copy memory objects can be used by an application to optimize data movement. When ``clEnqueueMapBuffer / clEnqueueMapImage / clEnqueueUnmapMemObject`` are used, no runtime transfers are performed, and the operations are very fast; however, the runtime can return a different pointer value each time a zero copy memory object is mapped. Note that only images created with ``CL_MEM_USE_PERSISTENT_MEM_AMD`` can be zero copy. From Southern Island on, devices support zero copy memory objects under Linux; however, only images created with ``CL_MEM_USE_PERSISTENT_MEM_AMD`` can be zero copy. @@ -363,11 +363,11 @@ For Southern Islands and later, devices support at least two hardware compute qu An OpenCL queue is assigned to a hardware queue on creation time. The hardware compute queues are selected according to the creation order within an OpenCL context. If the hardware supports K concurrent hardware queues, the Nth created OpenCL queue within a specific OpenCL context will be assigned to the (N mod K) hardware queue. The number of compute queues can be limited by specifying the ``GPU_NUM_COMPUTE_RINGS`` environment variable. -Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues), so they offer more performance. For example, the AMD Radeon™ R9290X devices, in the VI family contain 8 ACEs and 44 CUs. - +Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues), so they offer more performance. For example, the AMD Radeon(TM) R9290X devices, in the VI family contain 8 ACEs and 44 CUs. + 1.3.6.1 A note on hardware queues ################################# -A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon™ HD 290X compute device can execute up to 112,640 threads concurrently. The GPU can execute two kernels each spawning 56320 threads (assuming fully occupancy) twice as fast if launched concurrently through two hardware queues than serially through a single hardware queue. +A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon(TM) HD 290X compute device can execute up to 112,640 threads concurrently. The GPU can execute two kernels each spawning 56320 threads (assuming fully occupancy) twice as fast if launched concurrently through two hardware queues than serially through a single hardware queue. 1.4 OpenCL Data Transfer Optimization -------------------------------------- @@ -377,7 +377,7 @@ The AMD OpenCL implementation offers several optimized paths for data transfer t ++++++++++++++++++ * *Deferred allocation* - The CL runtime attempts to minimize resource consumption by delaying buffer allocation until first use. As a side effect, the first accesses to a buffer may be more expensive than subsequent accesses. 
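As a rough illustration of the (N mod K) mapping described above, a host program can create several command queues in the same context and spread independent kernel launches across them, so that small kernels may overlap on the device. The context, device and kernel names below are placeholders and error checking is omitted::

    #include <CL/cl.h>

    // Assumes ctx, dev, kernelA and kernelB already exist and have their arguments set.
    void launch_on_two_hardware_queues(cl_context ctx, cl_device_id dev,
                                       cl_kernel kernelA, cl_kernel kernelB,
                                       size_t globalSize)
    {
        cl_int err;
        // Queues created back to back in the same context are assigned to
        // successive hardware queues (N mod K), so the two launches below can overlap.
        cl_command_queue q0 = clCreateCommandQueue(ctx, dev, 0, &err);
        cl_command_queue q1 = clCreateCommandQueue(ctx, dev, 0, &err);

        clEnqueueNDRangeKernel(q0, kernelA, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
        clEnqueueNDRangeKernel(q1, kernelB, 1, NULL, &globalSize, NULL, 0, NULL, NULL);

        clFinish(q0);
        clFinish(q1);
        clReleaseCommandQueue(q0);
        clReleaseCommandQueue(q1);
    }
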
- * *Peak interconnect bandwidth* - As used in the text below, this is the transfer bandwidth between host and device that is available under optimal conditions at the application level. It is dependent on the type of interconnect, the chipset, and the graphics chip. As an example, a high-performance PC with a PCIe 3.0 16x bus and a GCN architecture (AMD Radeon™ HD 7XXX series) graphics card has a nominal interconnect bandwidth of 16 GB/s. + * *Peak interconnect bandwidth* - As used in the text below, this is the transfer bandwidth between host and device that is available under optimal conditions at the application level. It is dependent on the type of interconnect, the chipset, and the graphics chip. As an example, a high-performance PC with a PCIe 3.0 16x bus and a GCN architecture (AMD Radeon(TM) HD 7XXX series) graphics card has a nominal interconnect bandwidth of 16 GB/s. * *Pinning* - When a range of host memory is prepared for transfer to the GPU, its pages are locked into system memory. This operation is called pinning; it can impose a high cost, proportional to the size of the memory range. One of the goals of optimizing data transfer is to use pre-pinned buffers whenever possible. However, if pre-pinned buffers are used excessively, it can reduce the available system memory and result in excessive swapping. Host side zero copy buffers provide easy access to pre- pinned memory. * *WC* - Write Combine is a feature of the CPU write path to a select region of the address space. Multiple adjacent writes are combined into cache lines (for example, 64 bytes) before being sent to the external bus. This path typically provides fast streamed writes, but slower scattered writes. Depending on the chip set, scattered writes across a graphics interconnect can be very slow. Also, some platforms require multi-core CPU writes to saturate the WC path over an interconnect. * *Uncached accesses* - Host memory and I/O regions can be configured as uncached. CPU read accesses are typically very slow; for example: uncached CPU reads of graphics memory over an interconnect. @@ -404,16 +404,16 @@ If a given platform supports the zero copy feature, the following buffer types a * The CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR buffers are: * zero copy buffers that resides on the host. - + * directly accessible by the host at host memory bandwidth. * directly accessible by the device across the interconnect. * a pre-pinned sources or destinations for CL read, write, and copy commands into device memory at peak interconnect bandwidth. - + Note that buffers created with the flag CL_MEM_ALLOC_HOST_PTR together with CL_MEM_READ_ONLY may reside in uncached write-combined memory. As a result, CPU can have high streamed write bandwidth, but low read and potentially low write scatter bandwidth, due to the uncached WC path. - + * The CL_MEM_USE_PERSISTENT_MEM_AMD buffer is * a zero copy buffer that resides on the GPU device. @@ -439,7 +439,7 @@ Zero copy buffers work well on APU devices. SDK 2.5 introduced an optimization t As this memory is not cacheable, CPU read operations are very slow. This type of buffer also exists on discrete platforms, but transfer performance typically is limited by PCIe bandwidth. Zero copy buffers can provide low latency for small transfers, depending on the transfer path. For small buffers, the combined latency of map/CPU memory access/unmap can be smaller than the corresponding DMA latency. 
- + 1.4.2.3 Pre-pinned Buffers ############################ @@ -467,12 +467,12 @@ From an application point of view, two fundamental use cases exist, and they can Note that the OpenCL runtime uses deferred allocation to maximize memory resources. This means that a complete roundtrip chain, including data transfer and kernel compute, might take one or two iterations to reach peak performance. A code sample named BufferBandwidth can be used to investigate and benchmark the various transfer options in combination with different buffer types. - + **Option 1** - clEnqueueWriteBuffer() and clEnqueueReadBuffer(). This option is the easiest to use on the application side. *CL_MEM_USE_HOST_PTR* is an ideal choice if the application wants to transfer a buffer that has already been allocated through ``malloc( )`` or ``mmap( )``. - There are two ways to use this option. The first uses clEnqueueRead/WriteBuffer on a pre-pinned, mapped host-side buffer: - + There are two ways to use this option. The first uses clEnqueueRead/WriteBuffer on a pre-pinned, mapped host-side buffer: + a. pinnedBuffer = clCreateBuffer ( CL_MEM_ALLOC_HOST_PTR or CL_MEM_USE_HOST_PTR ) b. deviceBuffer = clCreateBuffer( ) c. void *pinnedMemory = clEnqueueMapBuffer (pinnedBuffer) @@ -486,7 +486,7 @@ A code sample named BufferBandwidth can be used to investigate and benchmark the **Option 2** - clEnqueueCopyBuffer() on a pre-pinned host buffer (requires pre-pinned buffer support) This is analogous to Option 1. Performing a CL copy of a pre-pinned buffer to a device buffer (or vice versa) runs at peak interconnect bandwidth. - + a. pinnedBuffer = clCreateBuffer( CL_MEM_ALLOC_HOST_PTR or CL_MEM_USE_HOST_PTR ) b. deviceBuffer = clCreateBuffer() *This is followed either by :* c. void *memory = clEnqueueMapBuffer ( pinnedBuffer ) @@ -509,45 +509,45 @@ A code sample named BufferBandwidth can be used to investigate and benchmark the The transfer sequence is as follows: a. Data transfer from host to device buffer. - + 1. ptr = clEnqueueMapBuffer( .., buf, .., CL_MAP_WRITE, ..) Since the buffer is mapped write-only, no data is transferred from device buffer to host. The map operation is very low cost. A pointer to a pinned host buffer is returned. 2. The application fills in the host buffer through memset( ptr ), memcpy ( ptr, srcptr ), fread( ptr ), or direct CPU writes. This happens at host memory bandwidth. 3. clEnqueueUnmapMemObject( .., buf, ptr, .. ) The pre-pinned buffer is transferred to the GPU device, at peak interconnect bandwidth. - + b. Data transfer from device buffer to host. - - 1. ptr = clEnqueueMapBuffer(.., buf, .., CL_MAP_READ, .. ) + + 1. ptr = clEnqueueMapBuffer(.., buf, .., CL_MAP_READ, .. ) This command triggers a transfer from the device to host memory, into a pre-pinned temporary buffer, at peak interconnect bandwidth. A pointer to the pinned memory is returned. 2. The application reads and processes the data, or executes a memcpy( dstptr, ptr ), fwrite (ptr), or similar function. Since the buffer resides in host memory, this happens at host memory bandwidth. 3. clEnqueueUnmapMemObject( .., buf, ptr, .. ) - + Since the buffer was mapped as read-only, no transfer takes place, and the unmap operation is very low cost. **Option 4** - Direct host access to a zero copy device buffer (requires zero copy support) This option allows overlapping of data transfers and GPU compute. It is also useful for sparse write updates under certain constraints. - + a. 
A zero copy buffer on the device is created using the following command: buf = clCreateBuffer ( .., CL_MEM_USE_PERSISTENT_MEM_AMD, ..) This buffer can be directly accessed by the host CPU, using the uncached WC path. This can take place at the same time the GPU executes a compute kernel. A common double buffering scheme has the kernel process data from one buffer while the CPU fills a second buffer. See the TransferOverlap code sample. A zero copy device buffer can also be used to for sparse updates, such as assembling sub-rows of a larger matrix into a smaller, contiguous block for GPU processing. Due to the WC path, it is a good design choice to try to align writes to the cache line size, and to pick the write block size as large as possible. b. Transfer from the host to the device. - + 1.ptr = clEnqueueMapBuffer( .., buf, .., CL_MAP_WRITE, .. ) - + This operation is low cost because the zero copy device buffer is directly mapped into the host address space. 2.The application transfers data via memset( ptr ), memcpy( ptr, srcptr ), or direct CPU writes. The CPU writes directly across the interconnect into the zero copy device buffer. Depending on the chipset, the bandwidth can be of the same order of magnitude as the interconnect bandwidth, although it typically is lower than peak. 3.clEnqueueUnmapMemObject ( .., buf, ptr, .. ) - + As with the preceding map, this operation is low cost because the buffer continues to reside on the device. - + c. If the buffer content must be read back later, use clEnqueueReadBuffer( .., buf, ..) or clEnqueueCopyBuffer( .., buf, zero copy host buffer, .. ) - - This bypasses slow host reads through the uncached path. + + This bypasses slow host reads through the uncached path. **Option 5** - Direct GPU access to a zero copy host buffer (requires zero copy support) @@ -557,14 +557,14 @@ A code sample named BufferBandwidth can be used to investigate and benchmark the buf = clCreateBuffer( .., CL_MEM_ALLOC_HOST_PTR, .. ) b. Next the application modifies or reads the zero copy host buffer. - + 1. ptr = clEnqueueMapBuffer( .., buf, .., CL_MAP_READ | CL_MAP_WRITE, .. ) This operation is very low cost because it is a map of a buffer already residing in host memory. 2. The application modifies the data through ``memset( ptr )``, ``memcpy`` (in either direction), sparse or dense CPU reads or writes. Since the application is modifying a host buffer, these operations take place at host memory bandwidth. 3. clEnqueueUnmapMemObject( .., buf, ptr, .. ) - + As with the preceding map, this operation is very low cost because the buffer continues to reside in host memory. - + c. The application runs clEnqueueNDRangeKernel(), using buffers of this type as input or output. GPU kernel reads and writes go across the interconnect to host memory, and the data transfer becomes part of the kernel execution. The achievable bandwidth depends on the platform and chipset, but can be of the same order of magnitude as the peak interconnect bandwidth. For discrete graphics cards, it is important to note that resulting GPU kernel bandwidth is an order of magnitude lower compared to a kernel accessing a regular device buffer located on the device. @@ -576,17 +576,17 @@ The AMD OpenCL runtime supports both CPU and GPU devices. 
This section introduce 1.5.1 CPU and GPU Devices +++++++++++++++++++++++++++++ -Table 1.1 lists some key performance characteristics of two exemplary CPU and GPU devices: a quad-core AMD Phenom II X4 processor running at 2.8 GHz, and a mid-range AMD Radeon™ HD 7770 GPU running at 1 GHz. The "best" device in each characteristic is highlighted, and the ratio of the best/other device is shown in the final column. +Table 1.1 lists some key performance characteristics of two exemplary CPU and GPU devices: a quad-core AMD Phenom II X4 processor running at 2.8 GHz, and a mid-range AMD Radeon(TM) HD 7770 GPU running at 1 GHz. The "best" device in each characteristic is highlighted, and the ratio of the best/other device is shown in the final column. The GPU excels at high-throughput: the peak execution rate (measured in FLOPS) is 7X higher than the CPU, and the memory bandwidth is 2.5X higher than the CPU. The GPU also consumes approximately 65% the power of the CPU; thus, for this comparison, the power efficiency in flops/watt is 10X higher. While power efficiency can vary significantly with different devices, GPUs generally provide greater power efficiency (flops/watt) than CPUs because they optimize for throughput and eliminate hardware designed to hide latency. - + **Table 1.1 CPU and GPU Performance Characteristics** +------------------------------------+-------------------+---------------------+--------------+ | | CPU | GPU | Winner Ratio | +====================================+===================+=====================+==============+ -| Example Device | AMD Phenom™ II X4 | AMD Radeon™ HD 7770 | | +| Example Device | AMD Phenom(TM) II X4 | AMD Radeon(TM) HD 7770 | | +------------------------------------+-------------------+---------------------+--------------+ | Core Frequency | 2800 MHz | 1 GHz | 3 X | +------------------------------------+-------------------+---------------------+--------------+ @@ -618,11 +618,11 @@ The GPU excels at high-throughput: the peak execution rate (measured in FLOPS) i +------------------------------------+-------------------+---------------------+--------------+ | | | | | +------------------------------------+-------------------+---------------------+--------------+ -| Approx Kernel Launch Latency | 25 μs | 50 μs | 2 X | +| Approx Kernel Launch Latency | 25 us | 50 us | 2 X | +------------------------------------+-------------------+---------------------+--------------+ -.. [1] For the power specifications of the AMD Phenom™ II x4, see http://www.amd.com/us/products/desktop/processors/phenom-ii/Pages/phenom-ii-model-number-comparison.aspx . +.. [1] For the power specifications of the AMD Phenom(TM) II x4, see http://www.amd.com/us/products/desktop/processors/phenom-ii/Pages/phenom-ii-model-number-comparison.aspx . Table 4.5 provides a comparison of the CPU and GPU performance charac- teristics in an AMD A8-4555M "Trinity" APU (19 W, 21 GB/s memory bandwidth). @@ -655,7 +655,7 @@ Table 4.5 provides a comparison of the CPU and GPU performance charac- teristics -Conversely, CPUs excel at latency-sensitive tasks. For example, an integer add is 10X faster on the CPU than on the GPU. This is a product of both the CPUs higher clock rate (2800 MHz vs 1000 MHz for this comparison), as well as the operation latency; the CPU is optimized to perform an integer add in just one cycle, while the GPU requires four cycles. The CPU also has a latency-optimized path to DRAM, while the GPU optimizes for bandwidth and relies on many in- flight threads to hide the latency. 
The AMD Radeon™ HD 7770 GPU, for example, supports more than 25,000 in-flight work-items and can switch to a new wavefront (containing up to 64 work-items) in a single cycle. The CPU supports only four hardware threads, and thread-switching requires saving and restoring the CPU registers from memory. The GPU requires many active threads to both keep the execution resources busy, as well as provide enough threads to hide the long latency of cache misses. +Conversely, CPUs excel at latency-sensitive tasks. For example, an integer add is 10X faster on the CPU than on the GPU. This is a product of both the CPUs higher clock rate (2800 MHz vs 1000 MHz for this comparison), as well as the operation latency; the CPU is optimized to perform an integer add in just one cycle, while the GPU requires four cycles. The CPU also has a latency-optimized path to DRAM, while the GPU optimizes for bandwidth and relies on many in- flight threads to hide the latency. The AMD Radeon(TM) HD 7770 GPU, for example, supports more than 25,000 in-flight work-items and can switch to a new wavefront (containing up to 64 work-items) in a single cycle. The CPU supports only four hardware threads, and thread-switching requires saving and restoring the CPU registers from memory. The GPU requires many active threads to both keep the execution resources busy, as well as provide enough threads to hide the long latency of cache misses. Each GPU wavefront has its own register state, which enables the fast single- cycle switching between threads. Also, GPUs can be very efficient at gather/scatter operations: each work-item can load from any arbitrary address, and the registers are completely decoupled from the other threads. This is substantially more flexible and higher-performing than a classic Vector ALU-style architecture (such as SSE on the CPU), which typically requires that data be accessed from contiguous and aligned memory locations. SSE supports instructions that write parts of a register (for example, MOVLPS and MOVHPS, which write the upper and lower halves, respectively, of an SSE register), but these instructions generate additional microarchitecture dependencies and frequently require additional pack instructions to format the data correctly. @@ -682,7 +682,7 @@ Usually, when the data size is small, it is faster to use the CPU because the st By design, each OpenCL command queue can only schedule work on a single OpenCL device. Thus, using multiple devices requires the developer to create a separate queue for each device, then partition the work between the available command queues. A simple scheme for partitioning work between devices would be to statically determine the relative performance of each device, partition the work so that faster devices received more work, launch all the kernels, and then wait for them to complete. In practice, however, this rarely yields optimal performance. The relative performance of devices can be difficult to determine, in particular for kernels whose performance depends on the data input. Further, the device performance can be affected by dynamic frequency scaling, OS thread scheduling decisions, or contention for shared resources, such as shared caches and DRAM bandwidth. Simple static partitioning algorithms which "guess wrong" at the beginning can result in significantly lower performance, since some devices finish and become idle while the whole system waits for the single, unexpectedly slow device. - + For these reasons, a dynamic scheduling algorithm is recommended. 
In this approach, the workload is partitioned into smaller parts that are periodically scheduled onto the hardware. As each device completes a part of the workload, it requests a new part to execute from the pool of remaining work. Faster devices, or devices which work on easier parts of the workload, request new input faster, resulting in a natural workload balancing across the system. The approach creates some additional scheduling and kernel submission overhead, but dynamic scheduling generally helps avoid the performance cliff from a single bad initial scheduling decision, as well as higher performance in real-world system environments (since it can adapt to system conditions as the algorithm runs). Multi-core runtimes, such as Cilk, have already introduced dynamic scheduling algorithms for multi-core CPUs, and it is natural to consider extending these scheduling algorithms to GPUs as well as CPUs. A GPU introduces several new aspects to the scheduling process: @@ -723,55 +723,55 @@ The AMD OpenCL implementation spawns a new thread to manage each command queue. For low-latency CPU response, it can be more efficient to use a dedicated spin loop and not call clFinish() Calling clFinish() indicates that the application wants to wait for the GPU, putting the thread to sleep. For low latency, the application should use ``clFlush()``, followed by a loop to wait for the event to complete. This is also true for blocking maps. The application should use non- blocking maps followed by a loop waiting on the event. The following provides sample code for this. :: - + if (sleep) - + { // this puts host thread to sleep, useful if power is a consideration or overhead is not a concern ``clFinish`` (cmd_queue_); - + } else - + { - + // this keeps the host thread awake, useful if latency is a concern - + clFlush(cmd_queue_); - + error_ = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); - + while (eventStatus > 0) - + { - + error_ = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); - + to find - + Sleep(0); // be nice to other threads, allow scheduler other work if possible - + // Choose your favorite way to yield, SwitchToThread() for example, in place of Sleep(0) - + } - + } - + 1.5.5 GPU and CPU Kernels +++++++++++++++++++++++++++++ While OpenCL provides functional portability so that the same kernel can run on any device, peak performance for each device is typically obtained by tuning the OpenCL kernel for the target device. -Code optimized for the Tahiti device (the AMD Radeon™ HD 7970 GPU) typically runs well across other members of the Southern Islands family. +Code optimized for the Tahiti device (the AMD Radeon(TM) HD 7970 GPU) typically runs well across other members of the Southern Islands family. CPUs and GPUs have very different performance characteristics, and some of these impact how one writes an optimal kernel. Notable differences include: @@ -788,7 +788,7 @@ Another approach is to leverage a CPU-targeted routine written in a standard hig ++++++++++++++++++++++++++++ The AMD OpenCL program creates at least one context, and each context can contain multiple devices. Thus, developers must choose whether to place all devices in the same context or create a new context for each device. 
Generally, it is easier to extend a context to support additional devices rather than duplicating the context for each device: buffers are allocated at the context level (and automatically across all devices), programs are associated with the context, and kernel compilation (via ``clBuildProgram``) can easily be done for all devices in a context. However, with current OpenCL implementations, creating a separate context for each device provides more flexibility, especially in that buffer allocations can be targeted to occur on specific devices. Generally, placing the devices in the same context is the preferred solution. - + Chapter 2 OpenCL Performance and Optimiza- tion for GCN Devices @@ -802,9 +802,9 @@ The GPU consists of multiple compute units. Each compute unit (CU) contains loca Each compute unit contains 64 kB local memory, 16 kB of read/write L1 cache, four vector units, and one scalar unit. The maximum local memory allocation is 32 kB per work-group. Each vector unit contains 512 scalar registers (SGPRs) for handling branching, constants, and other data constant across a wavefront. Vector units also contain 256 vector registers (VGPRs). VGPRs actually are scalar registers, but they are replicated across the whole wavefront. Vector units contain 16 processing elements (PEs). Each PE is scalar. -Since the L1 cache is 16 kB per compute unit, the total L1 cache size is 16 kB * (# of compute units). For the AMD Radeon™ HD 7970, this means a total of 512 kB L1 cache. L1 bandwidth can be computed as: +Since the L1 cache is 16 kB per compute unit, the total L1 cache size is 16 kB * (# of compute units). For the AMD Radeon(TM) HD 7970, this means a total of 512 kB L1 cache. L1 bandwidth can be computed as: L1 peak bandwidth = Compute Units * (4 threads/clock) * (128 bits per thread) * (1 byte / 8 bits) * Engine Clock -For the AMD Radeon™ HD 7970, this is ~1.9 TB/s. +For the AMD Radeon(TM) HD 7970, this is ~1.9 TB/s. If two memory access requests are directed to the same controller, the hardware serializes the access. This is called a channel conflict. Similarly, if two memory access requests go to the same memory bank, hardware serializes the access. This is called a bank conflict. From a developer's point of view, there is not much difference between channel and bank conflicts. Often, a large power of two stride results in a channel conflict. The size of the power of two stride that causes a specific type of conflict depends on the chip. A stride that results in a channel conflict on a machine with eight channels might result in a bank conflict on a machine with four. @@ -827,7 +827,7 @@ When the application has complete control of the access pattern and address gene In this example: :: - for (ptr=base; ptr> B) & C ==> [u]bit_extract - + where - + | B and C are compile time constants, | A is a 8/16/32bit integer type, and | C is a mask. * Bitfield insert on signed/unsigned integers | ((A & B) << C) | ((D & E) << F ==> ubit_insert - + where - + | B and E have no conflicting bits (B^E == 0), | B, C, E, and F are compile-time constants, and | B and E are masks. @@ -1472,7 +1472,7 @@ Examples for using this loop follow. :: No unrolling example:: - #pragma unroll 1 + #pragma unroll 1 for (int i = 0; i < n; i++) { ... 
} @@ -1543,18 +1543,18 @@ In the second block of code, the ``?:`` operator executes in the vector units, s a[idx] = d[idx]; } -This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what ‘d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. +This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what aEUR~d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. *If the algorithm allows changing the work-group size, it is possible to get better performance by using larger work-groups (more work-items in each work-group) because the workgroup creation overhead is reduced. On the other hand, the OpenCL CPU runtime uses a task-stealing algorithm at the work-group level, so when the kernel execution time differs because it contains conditions and/or loops of varying number of iterations, it might be better to increase the number of work-groups. This gives the runtime more flexibility in scheduling work-groups to idle CPU cores. Experimentation might be needed to reach optimal work-group size. *Since the AMD OpenCL runtime supports only in-order queuing, using clFinish() on a queue and queuing a blocking command gives the same result. The latter saves the overhead of another API command. For example:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_FALSE**, 0, buffSize, input, 0, NULL, NULL); clFinish(myCQ); is equivalent, for the AMD OpenCL runtime, to:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_TRUE**, 0, buffSize, input, 0, NULL, NULL); * GPU ISA: GCN-based GPUs have 32KB of dedicated L1 instruction cache. A single instruction cache instance serves up to 4 CUs (depending upon the architecture family and device), with each CU holding up to 40 wavefronts. As each wavefront includes its own program counter, a single instruction cache unit may serve up to 160 wavefronts with each executing a different instruction in the program. @@ -1567,8 +1567,8 @@ is equivalent, for the AMD OpenCL runtime, to:: * Porting from CUDA to OpenCL is relatively straightforward. Multiple vendors have documents describing how to do this, including AMD:http://developer.amd.com/documentation/articles/pages/OpenCL-and-the-ATI-Stream-v2.0-Beta.aspx#four * Some specific performance recommendations which differ from other GPU architectures: - * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an AMD Radeon™ HD 7970 GPU. - * AMD GPUs have a very high single-precision flops capability (3.788 teraflops in a single AMD Radeon™ HD 7970 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. + * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an AMD Radeon(TM) HD 7970 GPU. + * AMD GPUs have a very high single-precision flops capability (3.788 teraflops in a single AMD Radeon(TM) HD 7970 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. 
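
As a concrete illustration of the work-group sizing recommendation above, the following minimal sketch (buffer setup omitted; the kernel name and dispatch size are hypothetical) requests a local size of 64 rather than 32 when enqueuing a one-dimensional kernel: ::

    size_t globalSize = 4096;  /* assumed to be a multiple of the local size */
    size_t localSize  = 64;    /* one full wavefront per work-group; 32 would leave half of each GCN SIMD idle */
    cl_int err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                        &globalSize, &localSize,
                                        0, NULL, NULL);
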
2.8.5 Guidance for CPU Programmers Using OpenCL to Program GPUs +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1617,7 +1617,7 @@ can be written as a composition of mad instructions which use fused multiple add 2.8.6.4 Avoid Barriers When Possible ##################################### Using barriers in a kernel on the CPU causes a significant performance penalty compared to the same kernel without barriers. Use a barrier only if the kernel requires it for correctness, and consider changing the algorithm to reduce barriers usage. - + 2.8.7 Optimizing Kernels for Southern Island GPUs ++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1625,10 +1625,10 @@ Using barriers in a kernel on the CPU causes a significant performance penalty c ####################################### A conditional of the form "if-then-else" generates branching. Use the ``select()`` function to replace these structures with conditional assignments that do not cause branching. For example: :: - + if(x==1) r=0.5; if(x==2) r=1.0; - + becomes :: @@ -1641,7 +1641,7 @@ Note that if the body of the ``if`` statement contains an I/O, the ``if`` statem ############################### A conditional expression with many terms can compile into nested conditional code due to the C-language requirement that expressions must short circuit. To prevent this, move the expression out of the control flow statement. For example:: - + if(a&&b&&c&&d){...} becomes @@ -1662,7 +1662,7 @@ Because the GPU is a Vector ALU architecture, there is a cost to executing an `` 2.8.7.5 Experiment With do/while/for Loops ########################################### ``for`` loops can generate more conditional code than equivalent ``do`` or ``while`` loops. Experiment with these different loop types to find the one with best performance. - + 2.9 Specific Guidelines for GCN family GPUs -------------------------------------------- @@ -1680,9 +1680,9 @@ Typical scalar instructions execute in four cycles. The scalar engine can accept Typical vector instructions execute in four cycles. SIMDs within a compute unit can overlap vector instruction execution; each SIMD unit is offset by one cycle from the previous one. This allows each SIMD unit to execute one Vector ALU instruction and one scalar ALU instruction every four clocks. -All GCN GPUs have double-precision support. For Tahiti (AMD Radeon™ HD 79XX series), double precision adds run at one-half the single precision add rate. Double-precision multiplies and MAD instructions run at one-quarter the floating- point rate. - -The double-precision rate of Pitcairn (AMD Radeon™ HD 78XX series) and Cape Verde (AMD Radeon™ HD 77XX series) is one quarter that of Tahiti. This also affects the performance of single-precision fused multiple add (FMA). +All GCN GPUs have double-precision support. For Tahiti (AMD Radeon(TM) HD 79XX series), double precision adds run at one-half the single precision add rate. Double-precision multiplies and MAD instructions run at one-quarter the floating- point rate. + +The double-precision rate of Pitcairn (AMD Radeon(TM) HD 78XX series) and Cape Verde (AMD Radeon(TM) HD 77XX series) is one quarter that of Tahiti. This also affects the performance of single-precision fused multiple add (FMA). Similar to previous generations local data share (LDS) is a shared resource within a compute unit. The maximum LDS allocation size for a work-group is still 32 kB, however each compute unit has a total of 64 kB of LDS. 
On SI GPUs, LDS memory has 32 banks; thus, it is important to be aware of LDS bank conflicts on half-wavefront boundaries. The allocation granularity for LDS is 256 bytes; the minimum size is 0 bytes. It is much easier to achieve high LDS bandwidth use on SI hardware. @@ -1701,7 +1701,7 @@ Since there are no more clauses in the instruction set architecture (ISA) for GC * The engine is wider than previous generations; this means larger dispatches are required to keep the all the compute units busy. * A single wavefront can take twice as long to execute compared to previous generations (assuming ALU bound). This is because GPUs with VLIW-4 could execute the four instructions in a VLIW bundle in eight clocks (typical), and SI GPUs can execute one vector instruction in four clocks (typical). * Execution of kernel dispatches can overlap if there are no dependencies between them and if there are resources available in the GPU. This is critical when writing benchmarks it is important that the measurements are accurate and that "false dependencies" do not cause unnecessary slowdowns.An example of false dependency is: - + a. Application creates a kernel "foo". b. Application creates input and output buffers. c. Application binds input and output buffers to kernel "foo". @@ -1728,7 +1728,7 @@ Table 2.4 provides a simplified picture showing the Northern Island compute unit **Figure 2.4 Northern Islands Compute Unit Arrangement** Table 2.5 provides a simplified picture showing the Southern Island compute unit arrangement. - + .. image:: Opencl_optimization_images/2.5.png **Figure 2.5 Southern Island Compute Unit Arrangement** @@ -1744,7 +1744,7 @@ The following table provides device-specific information for some AMD Southern I +-------------------------------+-----------+----------+--------------+-------------+------------+-----------+ | | Verde PRO | Verde XT | Pitcairn PRO | Pitcairn XT | Tahiti PRO | Tahiti XT | +===============================+===========+==========+==============+=============+============+===========+ -| Product Name (AMD Radeon™ HD) | 7750 | 7770 | 7850 | 7870 | 7950 | 7970 | +| Product Name (AMD Radeon(TM) HD) | 7750 | 7770 | 7850 | 7870 | 7950 | 7970 | +-------------------------------+-----------+----------+--------------+-------------+------------+-----------+ | Engine Speed (MHz) | 800 | 1000 | 860 | 1000 | 800 | 925 | +-------------------------------+-----------+----------+--------------+-------------+------------+-----------+ @@ -1822,22 +1822,22 @@ This chapter discusses performance and optimization when programming for AMD GPU ------------------------------- Figure 3.1 is a block diagram of the GPU memory system. The up arrows are read paths, the down arrows are write paths. WC is the write combine cache. -The GPU consists of multiple compute units. Each compute unit contains 32 kB local (on-chip) memory, L1 cache, registers, and 16 processing element (PE). Each processing element contains a five-way (or four-way, depending on the GPU type) VLIW processor. Individual work-items execute on a single processing element; one or more work-groups execute on a single compute unit. On a GPU, hardware schedules the work-items. On the ATI Radeon™ HD 5000 series of GPUs, hardware schedules groups of work-items, called wavefronts, onto stream cores; thus, work-items within a wavefront execute in lock-step; the same instruction is executed on different data. +The GPU consists of multiple compute units. 
Each compute unit contains 32 kB local (on-chip) memory, L1 cache, registers, and 16 processing element (PE). Each processing element contains a five-way (or four-way, depending on the GPU type) VLIW processor. Individual work-items execute on a single processing element; one or more work-groups execute on a single compute unit. On a GPU, hardware schedules the work-items. On the ATI Radeon(TM) HD 5000 series of GPUs, hardware schedules groups of work-items, called wavefronts, onto stream cores; thus, work-items within a wavefront execute in lock-step; the same instruction is executed on different data. -The L1 cache is 8 kB per compute unit. (For the ATI Radeon™ HD 5870 GPU, this means 160 kB for the 20 compute units.) The L1 cache bandwidth on the ATI Radeon™ HD 5870 GPU is one terabyte per second: +The L1 cache is 8 kB per compute unit. (For the ATI Radeon(TM) HD 5870 GPU, this means 160 kB for the 20 compute units.) The L1 cache bandwidth on the ATI Radeon(TM) HD 5870 GPU is one terabyte per second: L1 Bandwidth = Compute Units * Wavefront Size/Compute Unit * EngineClock Multiple compute units share L2 caches. -The L2 cache size on the ATI Radeon™ HD 5870 GPUs is 512 kB:L2 Cache Size = Number or channels * L2 per Channel -The bandwidth between L1 caches and the shared L2 cache is 435 GB/s: +The L2 cache size on the ATI Radeon(TM) HD 5870 GPUs is 512 kB:L2 Cache Size = Number or channels * L2 per Channel +The bandwidth between L1 caches and the shared L2 cache is 435 GB/s: L2 Bandwidth = Number of channels * Wavefront Size * Engine Clock - + .. image:: Opencl_optimization_images/3.1.png **Figure 3.1 Memory System** -The ATI Radeon™ HD 5870 GPU has eight memory controllers ("Memory Channel" in Figure 3.1). The memory controllers are connected to multiple banks of memory. The memory is GDDR5, with a clock speed of 1200 MHz and a data rate of 4800 Mb/pin. Each channel is 32-bits wide, so the peak bandwidth for the ATI Radeon™ HD 5870 GPU is: (8 memory controllers) * (4800 Mb/pin) * (32 bits) * (1 B/8b) = 154 GB/s +The ATI Radeon(TM) HD 5870 GPU has eight memory controllers ("Memory Channel" in Figure 3.1). The memory controllers are connected to multiple banks of memory. The memory is GDDR5, with a clock speed of 1200 MHz and a data rate of 4800 Mb/pin. Each channel is 32-bits wide, so the peak bandwidth for the ATI Radeon(TM) HD 5870 GPU is: (8 memory controllers) * (4800 Mb/pin) * (32 bits) * (1 B/8b) = 154 GB/s If two memory access requests are directed to the same controller, the hardware serializes the access. This is called a channel conflict. Similarly, if two memory access requests go to the same memory bank, hardware serializes the access. This is called a bank conflict. From a developer's point of view, there is not much difference between channel and bank conflicts. A large power of two stride results in a channel conflict; a larger power of two stride results in a bank conflict. The size of the power of two stride that causes a specific type of conflict depends on the chip. A stride that results in a channel conflict on a machine with eight channels might result in a bank conflict on a machine with four. 
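
As a small illustration of the stride problem (the kernels and the padding amount are hypothetical, not taken from this guide), consecutive work-items in the first kernel below each read an address 4 KB apart, a large power-of-two stride that tends to direct every request to the same channel; padding the stride to a non-power-of-two spreads the requests across channels: ::

    __kernel void read_strided(__global const float *in, __global float *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid * 1024];          /* 1024 floats = 4 KB stride: likely channel conflict */
    }

    __kernel void read_padded(__global const float *in, __global float *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid * (1024 + 16)];   /* assumes the buffer was allocated with 16 floats of padding per row */
    }
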
@@ -1846,7 +1846,7 @@ In this document, the term bank conflict is used to refer to either kind of conf 3.1.1 Two Memory Paths ++++++++++++++++++++++++ -ATI Radeon™ HD 5000 series graphics processors have two, independent memory paths between the compute units and the memory: +ATI Radeon(TM) HD 5000 series graphics processors have two, independent memory paths between the compute units and the memory: * FastPath performs only basic operations, such as loads and stores (data sizes must be a multiple of 32 bits). This often is faster and preferred when there are no advanced operations. * CompletePath, supports additional advanced operations, including atomics and sub-32-bit (byte/short) data transfers. @@ -1854,7 +1854,7 @@ ATI Radeon™ HD 5000 series graphics processors have two, independent memory pa 3.1.1.1 Performance Impact of FastPath and CompletePath ######################################################## -There is a large difference in performance on ATI Radeon™ HD 5000 series hardware between FastPath and CompletePath. Figure 3.2 shows two kernels (one FastPath, the other CompletePath) and the delivered DRAM bandwidth for each kernel on the ATI Radeon™ HD 5870 GPU. Note that an atomic add forces CompletePath. +There is a large difference in performance on ATI Radeon(TM) HD 5000 series hardware between FastPath and CompletePath. Figure 3.2 shows two kernels (one FastPath, the other CompletePath) and the delivered DRAM bandwidth for each kernel on the ATI Radeon(TM) HD 5870 GPU. Note that an atomic add forces CompletePath. .. image:: Opencl_optimization_images/3.2.png @@ -1865,7 +1865,7 @@ There is a large difference in performance on ATI Radeon™ HD 5000 series hardw The kernel code follows. Note that the atomic extension must be enabled under OpenCL 1.0. :: - + __kernel void CopyFastPath( global const float * input, global float * output) @@ -1883,7 +1883,7 @@ The kernel code follows. Note that the atomic extension must be enabled under Op } output[gid] = input[gid]; return ; - } + } Table 3.1 lists the effective bandwidth and ratio to maximum bandwidth. @@ -1916,7 +1916,7 @@ There are two ways to find out which path is used. The first method uses the Cod The second method is static and lets you determine the path by looking at a machine-level ISA listing (using the AMD CodeXL Static Kernel Analyzer in OpenCL). :: - + MEM_RAT_CACHELESS -> FastPath MEM_RAT -> CompPath MEM_RAT_NOP_RTN -> Comp_load @@ -1927,8 +1927,8 @@ FastPath operations appear in the listing as:: TEX: ... ... VFETCH ... ... MEM_RAT_CACHELESS_STORE_RAW: ... - ... - + ... + The ``vfetch` Instruction is a load type that in graphics terms is called vertex a fetch (the group control TEX indicates that the load uses the L1 cache.) The instruction ``MEM_RAT_CACHELESS`` indicates that FastPath operations are used. Loads in CompletePath are a split-phase operation. In the first phase, hardware copies the old value of a memory location into a special buffer. This is done by performing atomic operations on the memory location. After the value has reached the buffer, a normal load is used to read the value. Note that RAT stands for random access target, which is the same as an unordered access view (UAV); it allows, on DX11 hardware, writes to, and reads from, any arbitrary location in a buffer. @@ -1950,7 +1950,7 @@ The instruction sequence means the following: **TEX** - Use the L1 cache for the next instruction. **VFETCH** - Do a load instruction to (finally) get the value. 
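
Before looking at stores, a short hedged example of what typically selects each path (kernel names are illustrative): since atomics and sub-32-bit accesses force CompletePath, a byte-wise copy would be expected to compile to MEM_RAT (CompletePath), while the same copy on 32-bit data can compile to MEM_RAT_CACHELESS (FastPath): ::

    /* Sub-32-bit accesses are handled by the CompletePath. */
    __kernel void copy_bytes(__global const char *in, __global char *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid];
    }

    /* The same copy on 32-bit data can use the FastPath. */
    __kernel void copy_uints(__global const uint *in, __global uint *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid];
    }
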
- + Stores appear as: :: @@ -1974,48 +1974,48 @@ When the application has complete control of the access pattern and address gene In this example:: - for (ptr=base; ptr> B) & C ==> [u]bit_extract - + where - + | B and C are compile time constants, | A is a 8/16/32bit integer type, and | C is a mask. - + * Bitfield insert on signed/unsigned integers | ((A & B) << C) | ((D & E) << F ==> ubit_insert - + where - + | B and E have no conflicting bits (B^E == 0), | B, C, E, and F are compile-time constants, and | B and E are masks. | The first bit set in B is greater than the number of bits in E plus the first bit set in E, or the first bit set in E is greater than the number of bits in B plus the first bit set in B. | If B, C, E, or F are equivalent to the value 0, this optimization is also supported. - + 3.9 Clause Boundaries ---------------------- AMD GPUs groups instructions into clauses. These are broken at control-flow boundaries when: * the instruction type changes (for example, from FETCH to ALU), or - * if the clause contains the maximum amount of operations (the maximum size for an ALU clause is 128 operations). + * if the clause contains the maximum amount of operations (the maximum size for an ALU clause is 128 operations). ALU and LDS access instructions are placed in the same clause. FETCH, ALU/LDS, and STORE instructions are placed into separate clauses. @@ -3103,7 +3103,7 @@ ALU dependencies on memory operations are handled at the clause level. Specifica Switching to another clause in the same wavefront requires approximately 40 cycles. The hardware immediately schedules another wavefront if one is available, so developers are encouraged to provide multiple wavefronts/compute unit. The cost to switch clauses is far less than the memory latency; typically, if the program is designed to hide memory latency, it hides the clause latency as well. The address calculations for FETCH and STORE instructions execute on the same hardware in the compute unit as do the ALU clauses. The address calculations for memory operations consumes the same executions resources that are used for floating-point computations. - + * The ISA dump shows the clause boundaries. See the example shown below. For more information on clauses, see the AMD Evergreen-Family ISA Microcode And Instructions (v1.0b) and the AMD R600/R700/Evergreen Assembly Language Format documents. @@ -3111,22 +3111,22 @@ And Instructions (v1.0b) and the AMD R600/R700/Evergreen Assembly Language Forma The following is an example disassembly showing clauses. There are 13 clauses in the kernel. The first clause is an ALU clause and has 6 instructions. :: - + 00 ALU_PUSH_BEFORE: ADDR(32) CNT(13) KCACHE0(CB1:0-15) KCACHE1(CB0:0-15) 0 x: MOV R3.x, KC0[0].x y: MOV R2.y, KC0[0].y z: MOV R2.z, KC0[0].z w: MOV R2.w, KC0[0].w 1 x: MOV R4.x, KC0[2].x - y: MOV R2.y, KC0[2].y + y: MOV R2.y, KC0[2].y z: MOV R2.z, KC0[2].z w: MOV R2.w, KC0[2].w t: SETGT_INT R5.x, PV0.x, 0.0f 2 t: MULLO_INT __, R1.x, KC1[1].x 3 y: ADD_INT __, R0.x, PS2 4 x: ADD_INT R0.x, PV3.y, KC1[6].x - 5 x: PREDNE_INT __, R5.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED - + 5 x: PREDNE_INT __, R5.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED + 01 JUMP POP_CNT(1) ADDR(12) 02 ALU: ADDR(45) CNT(5) KCACHE0(CB1:0-15) 6 z: LSHL __, R0.x, @@ -3135,11 +3135,11 @@ The following is an example disassembly showing clauses. 
There are 13 clauses in 03 LOOP_DX10 i0 FAIL_JUMP_ADDR(11) 04 ALU: ADDR(50) CNT(4) 9 x: ADD_INT R3.x, -1, R3.x - y: LSHR R0.y, R4.x, (0x00000002, 2.802596929e-45f).x + y: LSHR R0.y, R4.x, (0x00000002, 2.802596929e-45f).x t: ADD_INT R4.x, R4.x, (0x00000004, 5.605193857e-45f).y 05 WAIT_ACK: Outstanding_acks <= 0 06 TEX: ADDR(64) CNT(1) - 10 VFETCH R0.x__, R0.y, fc156 MEGA(4) + 10 VFETCH R0.x__, R0.y, fc156 MEGA(4) FETCH_TYPE(NO_INDEX_OFFSET) 07 ALU: ADDR(54) CNT(3) 11 x: MULADD_e R0.x, R0.x, (0x40C00000, 6.0f).y, (0x41880000, 17.0f).x @@ -3150,8 +3150,8 @@ The following is an example disassembly showing clauses. There are 13 clauses in 10 ENDLOOP i0 PASS_JUMP_ADDR(4) 11 POP (1) ADDR(12) 12 NOP NO_BARRIER - END_OF_PROGRAM - + END_OF_PROGRAM + 3.10 Additional Performance Guidance ------------------------------------- @@ -3165,7 +3165,7 @@ The compiler directive ``#pragma unroll `` can be placed immedia Examples for using this loop follow. No unrolling example:: - + #pragma unroll 1 for (int i = 0; i < n; i++) { ... @@ -3176,8 +3176,8 @@ Partial unrolling example:: #pragma unroll 4 for (int i = 0; i < 128; i++) { ... - } - + } + Currently, the unroll pragma requires that the loop boundaries can be determined at compile time. Both loop bounds must be known at compile time. If n is not given, it is equivalent to the number of iterations of the loop when both loop bounds are known. If the unroll-factor is not specified, and the compiler can determine the loop count, the compiler fully unrolls the loop. If the unroll-factor is not specified, and the compiler cannot determine the loop count, the compiler does no unrolling. @@ -3185,7 +3185,7 @@ Currently, the unroll pragma requires that the loop boundaries can be determined +++++++++++++++++++++ There are many possible physical memory layouts for images. AMD devices can access memory in a tiled or in a linear arrangement. - + * Linear - A linear layout format arranges the data linearly in memory such that element addresses are sequential. This is the layout that is familiar to CPU programmers. This format must be used for OpenCL buffers; it can be used for images. * Tiled - A tiled layout format has a pre-defined sequence of element blocks arranged in sequential memory addresses (see Figure 3.11 for a conceptual illustration). A microtile consists of ABIJ; a macrotile consists of the top-left 16 squares for which the arrows are red. Only images can use this format. Translating from user address space to the tiled arrangement is transparent to the user. Tiled memory layouts provide an optimized memory access pattern to make more efficient use of the RAM attached to the GPU compute device. This can contribute to lower latency. @@ -3206,20 +3206,20 @@ Memory access patterns in compute kernels are usually different from those in th * Avoid declaring global arrays on the kernel's stack frame as these typically cannot be allocated in registers and require expensive global memory operations. * Use predication rather than control-flow. The predication allows the GPU to execute both paths of execution in parallel, which can be faster than attempting to minimize the work through clever control-flow. The reason for this is that if no memory operation exists in a ``?:`` operator (also called a ternary operator), this operation is translated into a single ``cmov_logical`` instruction, which is executed in a single cycle. An example of this is : :: - + If (A>B) { C += D; } else { C -= D; - } + } Replace this with:: - + int factor = (A>B) ? 
1:-1; C += factor*D; In the first block of code, this translates into an IF/ELSE/ENDIF sequence of CF clauses, each taking ~40 cycles. The math inside the control flow adds two cycles if the control flow is divergent, and one cycle if it is not. This code executes in ~120 cycles. -In the second block of code, the ``?:`` operator executes in an ALU clause, so no extra CF instructions are generated. Since the instructions are sequentially dependent, this block of code executes in three cycles, for a ~40x speed improvement. To see this, the first cycle is the (A>B) comparison, the result of which is input to the second cycle, which is the ``cmov_logical`` factor, bool, 1, -1. The final cycle is a MAD instruction that: mad C, factor, D, C. If the ratio between CF clauses and ALU instructions is low, this is a good pattern to remove the control flow. +In the second block of code, the ``?:`` operator executes in an ALU clause, so no extra CF instructions are generated. Since the instructions are sequentially dependent, this block of code executes in three cycles, for a ~40x speed improvement. To see this, the first cycle is the (A>B) comparison, the result of which is input to the second cycle, which is the ``cmov_logical`` factor, bool, 1, -1. The final cycle is a MAD instruction that: mad C, factor, D, C. If the ratio between CF clauses and ALU instructions is low, this is a good pattern to remove the control flow. * Loop Unrolling * OpenCL kernels typically are high instruction-per-clock applications. Thus, the overhead to evaluate control-flow and execute branch instructions can consume a significant part of resource that otherwise can be used for high-throughput compute operations. @@ -3228,10 +3228,10 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n * When tuning an algorithm, it is often beneficial to code a simple but accurate algorithm that is retained and used for functional comparison. GPU tuning can be an iterative process, so success requires frequent experimentation, verification, and performance measurement. * The profiler and analysis tools report statistics on a per-kernel granularity. To narrow the problem further, it might be useful to remove or comment-out sections of code, then re-run the timing and profiling tool. * Writing code with dynamic pointer assignment should be avoided on the GPU. For example:: - + kernel void dyn_assign(global int* a, global int* b, global int* c) { - global int* d; + global int* d; size_t idx = get_global_id(0); if (idx & 1) { d = b; @@ -3239,20 +3239,20 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n d = c; } a[idx] = d[idx]; - } + } + + This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what aEUR~d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. - This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what ‘d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. - * If the algorithm allows changing the work-group size, it is possible to get better performance by using larger work-groups (more work-items in each work-group) because the workgroup creation overhead is reduced. 
On the other hand, the OpenCL CPU runtime uses a task-stealing algorithm at the work-group level, so when the kernel execution time differs because it contains conditions and/or loops of varying number of iterations, it might be better to increase the number of work-groups. This gives the runtime more flexibility in scheduling work-groups to idle CPU cores. Experimentation might be needed to reach optimal work-group size. * Since the AMD OpenCL runtime supports only in-order queuing, using ``clFinish`` () on a queue and queuing a blocking command gives the same result. The latter saves the overhead of another API command. For example:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_FALSE**, 0, buffSize, input, 0, NULL, NULL);`` clFinish(myCQ); - + is equivalent, for the AMD OpenCL runtime, to:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_TRUE**, 0, buffSize, input, 0, NULL, NULL);`` @@ -3262,24 +3262,24 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n * Porting from CUDA to OpenCL is relatively straightforward. Multiple vendors have documents describing how to do this, including AMD : http://developer.amd.com/tools-and-sdks/opencl-zone/ * Some specific performance recommendations which differ from other GPU architectures: - * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an ATI Radeon™ HD 5870 GPU. + * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an ATI Radeon(TM) HD 5870 GPU. * Vectorization can lead to substantially greater efficiency. The ``ALUPacking`` counter provided by the Profiler can track how well the kernel code is using the five-wide (or four-wide, depending on the GPU type) VLIW unit. Values below 70 percent may indicate that dependencies are preventing the full use of the processor. For some kernels, vectorization can be used to increase efficiency and improve kernel performance. - * AMD GPUs have a very high single-precision flops capability (2.72 teraflops in a single ATI Radeon™ HD 5870 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. + * AMD GPUs have a very high single-precision flops capability (2.72 teraflops in a single ATI Radeon(TM) HD 5870 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. 3.10.5 Guidance for CPU Programmers Using OpenCL to Program GPUs ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ OpenCL is the industry-standard toolchain for programming GPUs and parallel devices from many vendors. It is expected that many programmers skilled in CPU programming will program GPUs for the first time using OpenCL. This section provides some guidance for experienced programmers who are programming a GPU for the first time. It specifically highlights the key differences in optimization strategy. - + * Study the local memory (LDS) optimizations. These greatly affect the GPU performance. Note the difference in the organization of local memory on the GPU as compared to the CPU cache. Local memory is shared by many work-items (64 on Cypress). This contrasts with a CPU cache that normally is dedicated to a single work-item. GPU kernels run well when they collaboratively load the shared memory. * GPUs have a large amount of raw compute horsepower, compared to memory bandwidth and to "control flow" bandwidth. 
This leads to some high- level differences in GPU programming strategy. * A CPU-optimized algorithm may test branching conditions to minimize the workload. On a GPU, it is frequently faster simply to execute the workload. * A CPU-optimized version can use memory to store and later load pre- computed values. On a GPU, it frequently is faster to recompute values rather than saving them in registers. Per-thread registers are a scarce resource on the CPU; in contrast, GPUs have many available per-thread register resources. - + * Use ``float4`` and the OpenCL built-ins for vector types `` (vload, vstore,`` etc.). These enable the AMD OpenCL implementation to generate efficient, packed SSE instructions when running on the CPU. Vectorization is an optimization that benefits both the AMD CPU and GPU. - + 3.10.6 Optimizing Kernel Code ++++++++++++++++++++++++++++++ @@ -3305,12 +3305,12 @@ The Bulldozer family of CPUs supports FMA4 instructions, exchanging instructions There also is hardware support for OpenCL functions that give the new hardware implementation of rotating. For example:: - + sum.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x; can be written as a composition of mad instructions which use fused multiple add (FMA):: - + sum.x += mad(tempA0.x, tempB0.x, mad(tempA0.y, tempB1.x, mad(tempA0.z, tempB2.x, tempA0.w*tempB3.x))); @@ -3332,7 +3332,7 @@ The AMD CodeXL Static Kernel Analyzer assembler listing lets you view clauses. T 3.10.7.2 Remove Conditional Assignments ######################################## A conditional of the form "if-then-else" generates branching and thus generates one or more clauses. Use the ``select()`` function to replace these structures with conditional assignments that do not cause branching. For example:: - + if(x==1) r=0.5; if(x==2) r=1.0; @@ -3346,14 +3346,14 @@ Note that if the body of the ``if`` statement contains an I/O, the if statement 3.10.7.3 Bypass Short-Circuiting ################################## A conditional expression with many terms can compile into a number of clauses due to the C-language requirement that expressions must short circuit. To prevent this, move the expression out of the control flow statement. For example:: - + if(a&&b&&c&&d){...} becomes :: bool cond = a&&b&&c&&d; if(cond){...} - + The same applies to conditional expressions used in loop constructs `` (do, while, for)``. 3.10.7.4 Unroll Small Loops @@ -3364,7 +3364,7 @@ If the loop bounds are known, and the loop is small (less than 16 or 32 instruct 3.10.7.5 Avoid Nested ifs ########################## Because the GPU is a Vector ALU architecture, there is a cost to executing an if-then-else block because both sides of the branch are evaluated, then one result is retained while the other is discarded. When if blocks are nested, the results are twice as bad; in general, if blocks are nested k levels deep, there 2^k clauses are generated. In this situation, restructure the code to eliminate nesting. - + 3.10.7.6 Experiment With do/while/for Loops ############################################# ``for`` loops can generate more clauses than equivalent ``do`` or ``while`` loops. Experiment with these different loop types to find the one with best performance. 
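
To make the conditional-assignment guidance of Section 3.10.7.2 concrete, a minimal sketch (variable names are illustrative) of the branch-free form using ``select()`` follows: ::

    /* Branching form: each if contributes control-flow clauses. */
    if (x == 1) r = 0.5f;
    if (x == 2) r = 1.0f;

    /* Branch-free form: conditional moves evaluated in the ALU. */
    r = select(r, 0.5f, x == 1);
    r = select(r, 1.0f, x == 2);
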
diff --git a/Programming_Guides/Opencl-programming-guide.rst b/Programming_Guides/Opencl-programming-guide.rst index 63b27374..1c57861f 100644 --- a/Programming_Guides/Opencl-programming-guide.rst +++ b/Programming_Guides/Opencl-programming-guide.rst @@ -11,13 +11,13 @@ OpenCL Programming Guide * :ref:`Synchronization` * :ref:`Memory-Arch` * :ref:`Example` - - * :ref:`AMD_Implementation` + + * :ref:`AMD_Implementation` * :ref:`AMD-ROCm-Implementation` - * :ref:`Hardware-Overview-GCNDevices` + * :ref:`Hardware-Overview-GCNDevices` * :ref:`Communication-Host-GPU` * :ref:`Wavefront-Scheduling` - + * :ref:`Build_Run_Opencl` * :ref:`Compilin-Host-Program` * :ref:`Compiling-device-programs` @@ -28,36 +28,36 @@ OpenCL Programming Guide * :ref:`Running-Program` * :ref:`note-on-thread-safety` * :ref:`Toolchain-considerations` - + * :ref:`Profiling_OpenCL` * :ref:`AMD-CodeXL-GPU` - - * :ref:`OpenCL_static` + + * :ref:`OpenCL_static` * :ref:`Overview` - * :ref:`OpenCL-C-Runtime` + * :ref:`OpenCL-C-Runtime` * :ref:`C-Programming-Language` * :ref:`Examples` - - * :ref:`OpenCL_2.0` + + * :ref:`OpenCL_2.0` * :ref:`Introduction` * :ref:`Shared-virtual-Memory` * :ref:`Generi` - * :ref:`Device-side-enqueue` + * :ref:`Device-side-enqueue` * :ref:`Atomics` * :ref:`Pipes` * :ref:`Program-scope-global-Variables` * :ref:`Image-Enhancements` * :ref:`Non-uniform-work-group-size` * :ref:`Portability-considerations` - + * :ref:`OpenCL_Extentions` * :ref:`ICD` * :ref:`BIF` * :ref:`pre_GCN_Devices` * :ref:`OpenCL_OpenGL` * :ref:`Functions_OpenCL` - - + + .. _OpenCL Architecture: OpenCL Architecture and AMD Accelerated Parallel Processing Technology ======================================================================= @@ -66,13 +66,13 @@ OpenCL Architecture and AMD Accelerated Parallel Processing Technology Terminology ############ -**compute kernel :** +**compute kernel :** To define a compute kernel, it is first necessary to define a kernel. A kernel is a small unit of execution that performs a clearly defined function and that can be executed in parallel. Such a kernel can be executed on each element of an input stream (called an NDRange), or simply at each point in an arbitrary index space. A kernel is analogous and, on some devices identical, to what graphics programmers call a shader program. This kernel is not to be confused with an OS kernel, which controls hardware. The most basic form of an NDRange is simply mapped over input data and produces one output item for each input tuple. Subsequent extensions of the basic model provide random-access functionality, variable output counts, and reduction/accumulation operations. Kernels are specified using the kernel keyword. A compute kernel is a specific type of kernel that is not part of the traditional graphics pipeline. The compute kernel type can be used for graphics, but its strength lies in using it for non-graphics fields such as physics, AI, modeling, HPC, and various other computationally intensive applications. -In a compute kernel, the work-item spawn order is sequential. This means that on a chip with N work-items per wavefront, the first N work- items go to wavefront 1, the second N work-items go to wavefront 2, etc. Thus, the work-item IDs for wavefront K are in the range (K•N) to ((K+1)•N)-1. +In a compute kernel, the work-item spawn order is sequential. This means that on a chip with N work-items per wavefront, the first N work- items go to wavefront 1, the second N work-items go to wavefront 2, etc. 
Thus, the work-item IDs for wavefront K are in the range (K*N) to ((K+1)*N)-1.

**wavefronts and work-groups :**

@@ -84,7 +84,7 @@ Work-groups are composed of wavefronts. Best performance is attained when the gr

**local data store(LDS) :**

-The LDS is a high-speed, low-latency memory private to each compute unit. It is a full gather/scatter model: a work-group can write anywhere in its allocated space. This model is unchanged for the AMD Radeon™ HD 7XXX series. The constraints of the current LDS model are:
+The LDS is a high-speed, low-latency memory private to each compute unit. It is a full gather/scatter model: a work-group can write anywhere in its allocated space. This model is unchanged for the AMD Radeon(TM) HD 7XXX series. The constraints of the current LDS model are:

* The LDS size is allocated per work-group. Each work-group specifies how much of the LDS it requires. The hardware scheduler uses this information to determine which work groups can share a compute unit.
* Data can only be shared within work-items in a work-group.
@@ -114,14 +114,14 @@ executing kernels for specific devices.

.. image:: images/img1.png
   :align: center
- 
+
The devices are capable of running data- and task-parallel work. A kernel can be executed as a function of multi-dimensional domains of indices. Each element is called a work-item; the total number of indices is defined as the global work-size. The global work-size can be divided into sub-domains, called work-groups, and individual work-items within a group can communicate through global or locally shared memory. Work-items are synchronized through barrier or fence operations. Figure 1.1 is a representation of the host/device architecture with a single platform, consisting of a GPU and a CPU.

-An OpenCL application is built by first querying the runtime to determine which platforms are present. There can be any number of different OpenCL implementations installed on a single system. The desired OpenCL platform can be selected by matching the platform vendor string to the desired vendor name, such as “Advanced Micro Devices, Inc.” The next step is to create a context. As shown in Figure 1.1, an OpenCL context has associated with it a number of compute devices (for example, CPU or GPU devices),. Within a context, OpenCL guarantees a relaxed consistency between these devices. This means that memory objects, such as buffers or images, are allocated per context; but changes made by one device are only guaranteed to be visible by another device at well-defined synchronization points. For this, OpenCL provides events, with the ability to synchronize on a given event to enforce the correct order of execution.
+An OpenCL application is built by first querying the runtime to determine which platforms are present. There can be any number of different OpenCL implementations installed on a single system. The desired OpenCL platform can be selected by matching the platform vendor string to the desired vendor name, such as "Advanced Micro Devices, Inc." The next step is to create a context. As shown in Figure 1.1, an OpenCL context has associated with it a number of compute devices (for example, CPU or GPU devices),. Within a context, OpenCL guarantees a relaxed consistency between these devices. This means that memory objects, such as buffers or images, are allocated per context; but changes made by one device are only guaranteed to be visible by another device at well-defined synchronization points.
For this, OpenCL provides events, with the ability to synchronize on a given event to enforce the correct order of execution. -Many operations are performed with respect to a given context; there also are many operations that are specific to a device. For example, program compilation and kernel execution are done on a per-device basis. Performing work with a device, such as executing kernels or moving data to and from the device’s local memory, is done using a corresponding command queue. A command queue is associated with a single device and a given context; all work for a specific device is done through this interface. Note that while a single command queue can be associated with only a single device, there is no limit to the number of command queues that can point to the same device. For example, it is possible to have one command queue for executing kernels and a command queue for managing data transfers between the host and the device. +Many operations are performed with respect to a given context; there also are many operations that are specific to a device. For example, program compilation and kernel execution are done on a per-device basis. Performing work with a device, such as executing kernels or moving data to and from the device's local memory, is done using a corresponding command queue. A command queue is associated with a single device and a given context; all work for a specific device is done through this interface. Note that while a single command queue can be associated with only a single device, there is no limit to the number of command queues that can point to the same device. For example, it is possible to have one command queue for executing kernels and a command queue for managing data transfers between the host and the device. Most OpenCL programs follow the same pattern. Given a specific platform, select a device or devices to create a context, allocate memory, create device-specific command queues, and perform data transfers and computations. Generally, the platform is the gateway to accessing specific devices, given these devices and a corresponding context, the application is independent of the platform. Given a context, the application can: @@ -147,14 +147,14 @@ There are two types of synchronization between commands in a command- queue: * command-queue barrier - enforces ordering within a single queue. Any resulting changes to memory are available to the following commands in the queue. * events - enforces ordering between, or within, queues. Enqueued commands in OpenCL return an event identifying the command as well as the memory object updated by it. This ensures that following commands waiting on that event see the updated memory objects before they execute. -OpenCL 2.0 provides additional synchronization options. For an overview, see “Atomics and synchronization.”. +OpenCL 2.0 provides additional synchronization options. For an overview, see "Atomics and synchronization.". .. _Memory-Arch: Memory Architecture and Access ################################### -OpenCL has four memory domains: private, local, global, and constant; the AMD Compute Technology system also recognizes host (CPU) and PCI Express® (PCIe® ) memory. +OpenCL has four memory domains: private, local, global, and constant; the AMD Compute Technology system also recognizes host (CPU) and PCI Express(R) (PCIe(R) ) memory. 
============= ==================================================================================================================== Memory Type Description @@ -167,11 +167,11 @@ global Accessible to all work-items executing in a context, as well as to constant Read-only region for host-allocated and -initialized objects that are not changed during kernel execution. -host (CPU) Host-accessible region for an application’s data structures and program data. +host (CPU) Host-accessible region for an application's data structures and program data. PCIe Part of host (CPU) memory accessible from, and modifiable by, the host program and the GPU compute device. Modifying this memory requires synchronization between the GPU compute device and the CPU. ============= ==================================================================================================================== - + **Table: illustrates the interrelationship of the memories.** .. image:: images/img2.png @@ -222,7 +222,7 @@ Dataflow in Memory Hierarchy .. image:: images/img5.png :align: center -To load data into LDS from global memory, it is read from global memory and placed into the work-item’s registers; then, a store is performed to LDS. Similarly, to store data into global memory, data is read from LDS and placed into the work- item’s registers, then placed into global memory. To make effective use of the LDS, an algorithm must perform many operations on what is transferred between global memory and LDS. It also is possible to load data from a memory buffer directly into LDS, bypassing VGPRs. +To load data into LDS from global memory, it is read from global memory and placed into the work-item's registers; then, a store is performed to LDS. Similarly, to store data into global memory, data is read from LDS and placed into the work- item's registers, then placed into global memory. To make effective use of the LDS, an algorithm must perform many operations on what is transferred between global memory and LDS. It also is possible to load data from a memory buffer directly into LDS, bypassing VGPRs. LDS atomics are performed in the LDS hardware. (Thus, although ALUs are not directly used for these operations, latency is incurred by the LDS executing this function.) If the algorithm does not require write-to-read reuse (the data is read only), it usually is better to use the image dataflow (see right side of Figure 1.5) because of the cache hierarchy. @@ -256,7 +256,7 @@ Image reads are done by addressing the desired location in the input memory usin Image reads are cached through the texture system (corresponding to the L2 and L1 caches). -.. _Example: +.. _Example: Example Programs ################### @@ -282,7 +282,7 @@ This sample shows a minimalist OpenCL C program that sets a given buffer to some 7. The data is mapped to the host for examination. Calling clEnqueueMapBuffer ensures the visibility of the buffer on the host, which in this case probably includes a physical transfer. Alternatively, we could use ``clEnqueueWriteBuffer()``, which requires a pre-allocated host-side buffer. -**Example Code 1** +**Example Code 1** :: @@ -381,22 +381,22 @@ Example: SAXPY Function This section provides an introductory sample for beginner-level OpenCL programmers using C++ bindings. -The sample implements the SAXPY function (Y = aX + Y, where X and Y are vectors, and a is a scalar). The full code is reproduced at the end of this section. It uses C++ bindings for OpenCL. 
These bindings are available in the CL/cl.hpp file in the AMD Compute SDK; they also are downloadable from the Khronos website: http://www.khronos.org/registry/cl +The sample implements the SAXPY function (Y = aX + Y, where X and Y are vectors, and a is a scalar). The full code is reproduced at the end of this section. It uses C++ bindings for OpenCL. These bindings are available in the CL/cl.hpp file in the AMD Compute SDK; they also are downloadable from the Khronos website: http://www.khronos.org/registry/cl The following steps guide you through this example. 1. Enable error checking through the exception handling mechanism in the C++ bindings by using the following define. :: - + #define CL ENABLE_EXCEPTIONS This removes the need to error check after each OpenCL call. If there is an error, the C++ bindings code throw an exception that is caught at the end of the try block, where we can clean up the host memory allocations. In this example, the C++ object representing OpenCL resources (cl::Context, cl::CommandQueue, etc.) are declared as automatic variables, so they do not need to be released. If an OpenCL call returns an error, the error code is defined in the CL/cl.h file. 2. The kernel is very simple: each work-item, i, does the SAXPY calculation for its corresponding elements ``Y[i] = aX[i] + Y[i]``. Both X and Y vectors are stored in global memory; X is read-only, Y is read-write. - :: - + :: + kernel void saxpy(const __global float * X, __global float * Y, const float a) @@ -414,31 +414,31 @@ The following steps guide you through this example. 4. Create an OpenCL context on that platform. :: - + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter)(), 0 }; context = cl::Context(CL_DEVICE_TYPE_GPU, cps); 5. Get OpenCL devices from the context. :: - + devices = context.getInfo(); 6. Create an OpenCL command queue. :: - + queue = cl::CommandQueue(context, devices[0]); 7. Create two buffers, corresponding to the X and Y vectors. Ensure the host- side buffers, pX and pY, are allocated and initialized. The CL_MEM_COPY_HOST_PTR flag instructs the runtime to copy over the contents of the host pointer pX in order to initialize the buffer bufX. The bufX buffer uses the CL_MEM_READ_ONLY flag, while bufY requires the CL_MEM_READ_WRITE flag. :: - + bufX = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * length, pX); 8. Create a program object from the kernel source string, build the program for our devices, and create a kernel object corresponding to the SAXPY kernel. (At this point, it is possible to create multiple kernel objects if there are more than one.) :: - + cl::Program::Sources sources(1, std::make_pair(kernelStr.c_str(), kernelStr.length())); program = cl::Program(context, sources); program.build(devices); @@ -448,17 +448,17 @@ The following steps guide you through this example. Set each argument individually in separate kernel.setArg() calls. The arguments, do not need to be set again for subsequent kernelenqueue calls. Reset only those arguments that are to pass a new value to the kernel. Then, enqueue the kernel to the command queue with the appropriate global and local work sizes. :: - + kernel.setArg(0,bufX); kernel.setArg(1,bufY); kernel.setArg(2,a); queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(length), cl::NDRange(64)); 10. Read back the results from bufY to the host pointer pY. 
We will make this a blocking call (using the CL_TRUE argument) since we do not want to proceed before the kernel has finished execution and we have our results back. :: - + queue.enqueueReadBuffer(bufY, CL_TRUE, 0, length * sizeof(cl_float), pY); 11. Clean up the host resources (pX and pY). OpenCL resources is cleaned up by the C++ bindings support code. - + The catch(cl::Error err) block handles exceptions thrown by the C++ bindings code. If there is an OpenCL call error, it prints out the name of the call and the error code (codes are defined in CL/cl.h). If there is a kernel compilation error, the error code is CL_BUILD_PROGRAM_FAILURE, in which case it is necessary to print out the build log. **Example Code 2** @@ -490,7 +490,7 @@ The following steps guide you through this example. cout << arrayData[i] << " "; cout << endl; } - + ///////////////////////////////////////////////////////////////// // Globals ///////////////////////////////////////////////////////////////// @@ -520,7 +520,7 @@ The following steps guide you through this example. " uint gid = get_global_id(0);\n" " y[gid] = a* x[gid] + y[gid];\n" "}\n"; - + ///////////////////////////////////////////////////////////////// // Allocate and initialize memory on the host ///////////////////////////////////////////////////////////////// @@ -583,7 +583,7 @@ The following steps guide you through this example. break; } } - + ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// @@ -678,7 +678,7 @@ The code is written so that it performs very well on either CPU or GPU. The numb The sample includes a number of programming techniques useful for simple tests. Only minimal error checking and resource tear-down is used. -Runtime Code – +Runtime Code - 1. The source memory buffer is allocated, and initialized with a random pattern. Also, the actual min() value for this data set is serially computed, in order to later verify the parallel result. @@ -694,8 +694,8 @@ Runtime Code – 5. After the kernels are built, the code prints errors that occurred during kernel compilation and linking. 6. The main loop is set up so that the measured timing reflects the actual kernel performance. If a sufficiently large NLOOPS is chosen, effects from kernel launch time and delayed buffer copies to the device by the CL runtime are minimized. Note that while only a single clFinish() is executed at the end of the timing run, the two kernels are always linked using an event to ensure serial execution. - - The bandwidth is expressed as “number of input bytes processed.” For high- end graphics cards, the bandwidth of this algorithm is about an order of magnitude higher than that of the CPU, due to the parallelized memory subsystem of the graphics card. + + The bandwidth is expressed as "number of input bytes processed." For high- end graphics cards, the bandwidth of this algorithm is about an order of magnitude higher than that of the CPU, due to the parallelized memory subsystem of the graphics card. 7. The results then are checked against the comparison value. This also establishes that the result is the same on both CPU and GPU, which can serve as the first verification test for newly written kernel code. @@ -703,7 +703,7 @@ Runtime Code – 9. You can use the Timer.cpp and Timer.h files from the TransferOverlap sample, which is in the SDK samples. -Kernel Code – +Kernel Code - 10. 
The code uses four-component vectors (uint4) so the compiler can identify concurrent execution paths as often as possible. On the GPU, this can be used to further optimize memory accesses and distribution across ALUs. On the CPU, it can be used to enable SSE like execution. @@ -755,7 +755,7 @@ Kernel Code – " uint idx = (dev == 0) ? get_global_id(0) * count \n" " : get_global_id(0); \n" " uint stride = (dev == 0) ? 1 : get_global_size(0); \n" - " uint pmin = (uint) -1; \n" + " uint pmin = (uint) -1; \n" " // 11. First, compute private min, for this work-item. \n" " for( int n=0; n < count; n++, idx += stride ) \n" " { \n" @@ -779,7 +779,7 @@ Kernel Code – " { \n" " dbg[0] = get_num_groups(0); \n" " dbg[1] = get_global_size(0); \n" - " dbg[2] = count; \n" + " dbg[2] = count; \n" " dbg[3] = stride; \n" " } \n" "} \n" @@ -790,7 +790,7 @@ Kernel Code – "{ \n" " (void) atom_min( gmin, gmin[get_global_id(0)] ); \n" "}; \n"; - + int main(int argc, char ** argv) { cl_platform_id platform; @@ -819,7 +819,7 @@ Kernel Code – // Get a platform. clGetPlatformIDs( 1, &platform, NULL ); - + // 3. Iterate over devices. for(dev=0; dev < NDEVS; dev++) { @@ -1038,8 +1038,8 @@ The AMD ROCm software stack provides end-users and developers with a complete, f The software includes the following components: * OpenCL compiler and runtime - * Debugging and Performance Profiling Tools – AMD CodeXL. - * Performance Libraries – clMath and other OpenCL accelerated libraries for optimized NDRange-specific algorithms. + * Debugging and Performance Profiling Tools - AMD CodeXL. + * Performance Libraries - clMath and other OpenCL accelerated libraries for optimized NDRange-specific algorithms. The latest generations of AMD GPUs use unified shader architectures capable of running different kernel types interleaved on the same hardware.Programmable GPU compute devices execute various user-developed programs,known to graphics programmers as shaders and to compute programmers as kernels. These GPU compute devices can execute non-graphics functions using a data-parallel programming model that maps executions onto compute units. Each compute unit contains one (pre-GCN devices) or more (GCN devices) vector (SIMD) units. In this programming model, known as AMD Accelerated Parallel Processing Technology, arrays of input data elements stored in memory are accessed by a number of compute units. @@ -1065,11 +1065,11 @@ OpenCL maps the total number of work-items to be launched onto an n- dimensional Work-Item Processing ***************************** -All processing elements within a vector unit execute the same instruction in each cycle. For a typical instruction, 16 processing elements execute one instruction for 64 work items over 4 cycles. The block of work-items that are executed together is called a wavefront. For example, on the AMD Radeon™ HD 290X +All processing elements within a vector unit execute the same instruction in each cycle. For a typical instruction, 16 processing elements execute one instruction for 64 work items over 4 cycles. The block of work-items that are executed together is called a wavefront. For example, on the AMD Radeon(TM) HD 290X compute device, the 16 processing elements within each vector unit execute the same instruction for four cycles, which effectively appears as a 64-wide compute unit in execution width. -The size of wavefronts can differ on different GPU compute devices. 
For example, some of the low-end and older GPUs, such as the AMD Radeon™ HD 54XX series graphics cards, have a wavefront size of 32 work-items. Higher-end and newer AMD GPUs have a wavefront size of 64 work-items. +The size of wavefronts can differ on different GPU compute devices. For example, some of the low-end and older GPUs, such as the AMD Radeon(TM) HD 54XX series graphics cards, have a wavefront size of 32 work-items. Higher-end and newer AMD GPUs have a wavefront size of 64 work-items. Compute units operate independently of each other, so it is possible for different compute units to execute different instructions. It is also possible for different vector units within a compute unit to execute different instructions. @@ -1123,8 +1123,8 @@ executes on an ALU, as shown in Figure 2.4). In GCN devices, each CU includes one Scalar Unit and four Vector (SIMD) units, each of which contains an array of 16 processing elements (PEs). Each PE contains one ALU. Each SIMD unit simultaneously executes a single operation across 16 work items, but each can be working on a separate wavefront. -For example, for the AMD Radeon™ HD 79XX devices each of the 32 CUs has one Scalar Unit and four Vector Units. Figure 2.5 shows only two compute engines/command processors of the array that comprises the compute device of -the AMD Radeon™ HD 79XX family. +For example, for the AMD Radeon(TM) HD 79XX devices each of the 32 CUs has one Scalar Unit and four Vector Units. Figure 2.5 shows only two compute engines/command processors of the array that comprises the compute device of +the AMD Radeon(TM) HD 79XX family. .. image:: images/2.5.png @@ -1138,7 +1138,7 @@ The Asynchronous Compute Engines (ACEs) manage the CUs; a graphics command proce Key differences between pre-GCN and GCN devices *********************************************** -In pre-GCN devices (for a hardware overview, see Appendix D, “Hardware overview of pre-GCN devices.”), each compute unit consists of a single vector unit, each containing up to 16 processing elements. Each processing element, which contains 4 or 5 ALUs, could execute bundles of 4 or 5 independent instructions co-issued in a VLIW (Very Long Instruction Word) format. All the processing elements within a vector unit execute a single wavefront (a group of +In pre-GCN devices (for a hardware overview, see Appendix D, "Hardware overview of pre-GCN devices."), each compute unit consists of a single vector unit, each containing up to 16 processing elements. Each processing element, which contains 4 or 5 ALUs, could execute bundles of 4 or 5 independent instructions co-issued in a VLIW (Very Long Instruction Word) format. All the processing elements within a vector unit execute a single wavefront (a group of 64 work items). If operations within a wavefront contain dependencies, they cannot be scheduled in the same clock cycle, leaving some ALUs un-utilized. In such cases, some processing elements (and hence, vector units) remain under- utilized. In GCN devices, the CUs are arranged in four vector unit arrays consisting of 16 processing elements each. Each of these arrays executes a single instruction across each lane for each block of 16 work-items. That instruction is repeated over four cycles to make the 64-element vector called a wavefront. @@ -1161,13 +1161,13 @@ Each ACE contains up to eight hardware queues and, together with the graphics co Devices in the Southern Islands families typically have two ACEs. 
The ACE engines on the Southern Islands families are single-threaded, which means that they contain two hardware queues.

-Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues) so they offer more performance. For example, the AMD Radeon™ R9
+Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues), so they offer more performance. For example, the AMD Radeon(TM) R9
290X devices in the VI family contain 8 ACEs and 44 CUs.

A note on hardware queues
**************************

-A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon™ HD 290X compute device can execute up to
+A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon(TM) HD 290X compute device can execute up to
112,640 threads concurrently. The GPU can execute two kernels, each spawning 56,320 threads (assuming full occupancy), twice as fast when they are launched concurrently through two hardware queues as when they are launched serially through a single hardware queue.

@@ -1232,7 +1232,7 @@ Wavefront Scheduling
#####################

GPU compute devices are very efficient at parallelizing large numbers of work-items in a manner transparent to the application. Each GPU compute device uses the large number of wavefronts to hide memory access latencies by having the resource scheduler switch the active wavefront in a given compute unit whenever the current wavefront is waiting for a memory access to complete. Hiding memory access latencies requires that each work-item contain a large number of ALU operations per memory load/store.

-Figure 2.6 shows the timing of a simplified execution of wavefronts in a single compute unit. At time 0, the wavefronts are queued and waiting for execution. In this example, only four wavefronts (T0…T3) are scheduled for the compute unit. The hardware limit for the number of active wavefront is dependent on the resource usage (such as the number of active registers used) of the program being executed. An optimally programmed GPU compute device typically has many of active wavefronts.
+Figure 2.6 shows the timing of a simplified execution of wavefronts in a single compute unit. At time 0, the wavefronts are queued and waiting for execution. In this example, only four wavefronts (T0...T3) are scheduled for the compute unit. The hardware limit on the number of active wavefronts depends on the resource usage (such as the number of active registers used) of the program being executed. An optimally programmed GPU compute device typically has many active wavefronts.

.. image:: images/2.6.png

@@ -1242,7 +1242,7 @@ At runtime, wavefront T0 executes until cycle 20; at this time, a stall occurs d

If the data that wavefront T0 is waiting for has returned from memory, T0 continues execution.
Since there were enough wavefronts and processing element operations to cover the long memory latencies, the compute unit does not idle. This method of memory latency hiding helps the GPU compute device achieve maximum performance.

-If none of T0 – T3 are runnable, the compute unit waits (stalls) until one of T0 – T3 is ready to execute. In the example shown in Figure 2.7, T0 is the first to continue execution.
+If none of T0 - T3 are runnable, the compute unit waits (stalls) until one of T0 - T3 is ready to execute. In the example shown in Figure 2.7, T0 is the first to continue execution.

.. image:: images/2.7.png

@@ -1260,7 +1260,7 @@ An OpenCL application consists of a host program (C/C++) and an optional kernel

Compiling the Host Program
###########################

-In order to compile the host program, users must install the OpenCL Compiler and language runtime on the ROCm, On Ubuntu is rocm-opencl-dev which provides all the necessary OpenCL runtime headers and libraries required by the host compiler. If wish to support application build with the historical APPS SDK sets an environmental variable named AMDAPPSDKROOT to the path of the directory in which the ROCm OpenCL is installed. It should be /opt/rocm/opencl. The runtime headers and libraries are placed in the install directory under the “include” and “lib” sub-folders, respectively.
+To compile the host program, users must install the OpenCL compiler and language runtime provided by ROCm; on Ubuntu, the rocm-opencl-dev package provides all the necessary OpenCL runtime headers and libraries required by the host compiler. To support applications built with the historical APP SDK, set an environment variable named AMDAPPSDKROOT to the path of the directory in which the ROCm OpenCL is installed; it should be /opt/rocm/opencl. The runtime headers and libraries are placed in the install directory under the "include" and "lib" sub-folders, respectively.

While building the host program, these headers and libraries must be included in the project by choosing the appropriate options for the targeted operating system, IDE, and compiler.

@@ -1273,13 +1273,13 @@ To compile OpenCL applications on Linux, gcc or the Intel C compiler must be ins

1. Compile all the C++ files (Template.cpp), and get the object files.

64-bit object files on 64-bit system::
-
+
	g++ -o Template.o -c Template.cpp -I$ROCMOPENCL/include

2. Link all the object files generated in the previous step to the OpenCL library and create an executable.

For linking to a 64-bit library::
-
+
	g++ -o Template Template.o -lOpenCL -L$ROCMOPENCL/lib/x86_64

@@ -1326,7 +1326,7 @@ Note: Most of the examples in this chapter are shown using runtime C APIs. In or

**Example creation of program objects from an external file :**

-::
+::

	std::ifstream f("my_kernel.cl");
	std::stringstream st;

@@ -1366,40 +1366,40 @@ Suppose a program object has been created as follows:

Next, the program object can be built for all the devices in the context or for a list of selected devices.
-* To build the program for all the devices, “NULL” must be passed against the target device list argument, as shown below: +* To build the program for all the devices, "NULL" must be passed against the target device list argument, as shown below: + +:: -:: - clBuildProgram(program, 0, NULL, NULL, NULL, NULL); * To build for any particular GPU device or a list of devices : - -:: - - int nDevices = 0; + +:: + + int nDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevices); cl_device_id * devices = malloc(nDevices * sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, nDevices * sizeof(cl_device_id), devices, NULL); * To build for the nth GPU device in a list of devices: -:: - +:: + clBuildProgram(program, 1, &devices[n], NULL, NULL, NULL); * To build for the first n number of GPU devices -:: +:: clBuildProgram(program, n, devices, NULL, NULL, NULL); **Build Options:** -A list of options can be passed during program build to control each stage of the building process. The full list includes various categories of options, such as preprocessor, compiler, optimization, linker, and debugger. Some of them are standard (specified by Khronos); others are vendor-specific. For details about the standard options, see the clBuildProgram API’s description in the OpenCL specification. +A list of options can be passed during program build to control each stage of the building process. The full list includes various categories of options, such as preprocessor, compiler, optimization, linker, and debugger. Some of them are standard (specified by Khronos); others are vendor-specific. For details about the standard options, see the clBuildProgram API's description in the OpenCL specification. -For information about the frequently used standard build options, see “Supported Standard OpenCL Compiler Options”. +For information about the frequently used standard build options, see "Supported Standard OpenCL Compiler Options". -For information about AMD-developed supplemental options and environment variables, see “AMD-Developed Supplemental Compiler Options”. +For information about AMD-developed supplemental options and environment variables, see "AMD-Developed Supplemental Compiler Options". **Special note for building OpenCL 2.0 programs:** @@ -1415,7 +1415,7 @@ OpenCL provides a way to check and query the compilation/linking errors that occ **Example:** -:: +:: cl_int err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (err != CL_SUCCESS) @@ -1440,8 +1440,8 @@ The user must compile each program object separately. This step may be a little **Example (derived from the OpenCL specification):** Consider the following program source: - -:: + +:: #include #include @@ -1453,7 +1453,7 @@ Consider the following program source: This kernel includes two headers, foo.h and mydir/myinc.h. 
So first create the program objects corresponding to each header as follows: -:: +:: cl_program foo_pg = clCreateProgramWithSource(context, 1, &foo_header_src, NULL, &err); @@ -1466,10 +1466,10 @@ Suppose the program source described above is given by program_A and is loaded v Now, these headers can be passed as embedded headers along with the program object -:: +:: - cl_program input_headers[2] = { foo_pg, myinc_pg }; - char * input_header_names[2] = { “foo.h”, “mydir/myinc.h” }; + cl_program input_headers[2] = { foo_pg, myinc_pg }; + char * input_header_names[2] = { "foo.h", "mydir/myinc.h" }; clCompileProgram(program_A, 0, NULL, // num_devices & device_list NULL, // compile_options @@ -1508,8 +1508,8 @@ Supported Standard OpenCL Compiler Options ########################################### The frequently-used build options are: - * -I dir — Add the directory dir to the list of directories to be searched for header files. When parsing #include directives, the OpenCL compiler resolves relative paths using the current working directory of the application. - * -D name — Predefine name as a macro, with definition = 1. For -D name=definition, the contents of definition are tokenized and processed as if they appeared during the translation phase three in a #define directive. In particular, the definition is truncated by embedded newline characters. + * -I dir -- Add the directory dir to the list of directories to be searched for header files. When parsing #include directives, the OpenCL compiler resolves relative paths using the current working directory of the application. + * -D name -- Predefine name as a macro, with definition = 1. For -D name=definition, the contents of definition are tokenized and processed as if they appeared during the translation phase three in a #define directive. In particular, the definition is truncated by embedded newline characters. -D options are processed in the order they are given in the options argument to ``clBuildProgram``. For additional build options, see the :ref:OpenCL specification. @@ -1521,16 +1521,16 @@ AMD-Developed Supplemental Compiler Options The following supported options are not part of the OpenCL specification: - * -g — This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or - cygwin/minGW under Windows. For more details, see Chapter 4, “Debugging and Profiling OpenCL.” This option does not affect the default optimization of the OpenCL code. - * -O0 — Specifies to the compiler not to optimize. This is equivalent to the OpenCL standard option -cl-opt-disable. - * -f[no-]bin-source — Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, “OpenCL BinaryImage Format (BIF) v2.0.” by default, this option does NOT generate the source. - * -f[no-]bin-llvmir — Does [not] generate LLVM IR in the .llvmir section. - For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, this option GENERATES the LLVM IR. - * -f[no-]bin-amdil — Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, this option does NOT generate the AMD IL. - * -f[no-]bin-exe — Does [not] generate the executable (ISA) in the .text section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, this option GENERATES the ISA. - * -f[no-]bin-hsail — Does [not] generate HSAIL/BRIG in the binary. 
By default, this option does NOT generate HSA IL/BRIG in the binary. - * -save-temps[=] — This option dumps intermediate temporary files, such as IL and ISA code, for each OpenCL kernel. If is not given, temporary files are saved in the default temporary directory (the current directory for Linux, C:\Users \\AppData\Local for Windows). If is given, those temporary files are saved with the given + * -g -- This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or + cygwin/minGW under Windows. For more details, see Chapter 4, "Debugging and Profiling OpenCL." This option does not affect the default optimization of the OpenCL code. + * -O0 -- Specifies to the compiler not to optimize. This is equivalent to the OpenCL standard option -cl-opt-disable. + * -f[no-]bin-source -- Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, "OpenCL BinaryImage Format (BIF) v2.0." by default, this option does NOT generate the source. + * -f[no-]bin-llvmir -- Does [not] generate LLVM IR in the .llvmir section. + For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, this option GENERATES the LLVM IR. + * -f[no-]bin-amdil -- Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, this option does NOT generate the AMD IL. + * -f[no-]bin-exe -- Does [not] generate the executable (ISA) in the .text section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, this option GENERATES the ISA. + * -f[no-]bin-hsail -- Does [not] generate HSAIL/BRIG in the binary. By default, this option does NOT generate HSA IL/BRIG in the binary. + * -save-temps[=] -- This option dumps intermediate temporary files, such as IL and ISA code, for each OpenCL kernel. If is not given, temporary files are saved in the default temporary directory (the current directory for Linux, C:\Users \\AppData\Local for Windows). If is given, those temporary files are saved with the given . If is an absolute path prefix, such as C:\your\work\dir\mydumpprefix, those temporaries are saved under C:\your\work\dir, with mydumpprefix as prefix to all temporary names. For example, @@ -1540,13 +1540,13 @@ The following supported options are not part of the OpenCL specification: | _temp_nn_xxx_yyy.il, _temp_nn_xxx_yyy.isa | - + | -save-temps=aaa | under the default directory | aaa_nn_xxx_yyy.il, aaa_nn_xxx_yyy.isa | - + | -save-temps=C:\you\dir\bbb | under C:\you\dir | bbb_nn_xxx_yyy.il, bbb_nn_xxx_yyy.isa @@ -1556,8 +1556,8 @@ where xxx and yyy are the device name and kernel name for this build, respective To avoid source changes, there are two environment variables that can be used to change CL options during the runtime. -* AMD_OCL_BUILD_OPTIONS — Overrides the CL options specified in clBuildProgram(). -* AMD_OCL_BUILD_OPTIONS_APPEND — Appends options to those specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS -- Overrides the CL options specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS_APPEND -- Appends options to those specified in clBuildProgram(). .. _Creating-device-specific-binaries: @@ -1567,7 +1567,7 @@ To generate pre-built device-specific binaries from the OpenCL C source or from 1. Create the program object from OpenCL C source using clCreateProgramWithSource(). -2. Build (i.e. 
compile and link) the program object (for details, see the “Generating program executable” section). +2. Build (i.e. compile and link) the program object (for details, see the "Generating program executable" section). 3. Read the device-specific binaries from the program object using clGetProgramInfo() as shown below: @@ -1576,20 +1576,20 @@ To generate pre-built device-specific binaries from the OpenCL C source or from //Get the number of devices attached with program object cl_uint nDevices = 0; clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &nDevices, NULL); - + //Get the Id of all the attached devices cl_device_id *devices = new cl_device_id[nDevices]; clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * nDevices, devices, NULL); - + // Get the sizes of all the binary objects size_t *pgBinarySizes = new size_t[nDevices]; lGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * nDevices, pgBinarySizes, NULL); - + // Allocate storage for each binary objects unsigned char **pgBinaries = new unsigned char*[nDevices]; for (cl_uint i = 0; i < nDevices; i++) { pgBinaries[i] = new unsigned char[pgBinarySizes[i]]; } - + // Get all the binary objects clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) * nDevices, pgBinaries, NULL); @@ -1604,25 +1604,25 @@ The runtime system assigns the work in the command queues to the underlying devi ============================= ====================================================== OpenCL API Function Description ============================= ====================================================== -clCreateCommandQueueWith Create a command queue for a specific device +clCreateCommandQueueWith Create a command queue for a specific device Properties (in OpenCL 2.0) (CPU,GPU.) -clCreateCommandQueue() -(in OpenCL 1.x; deprecated -in OpenCL 2.0) +clCreateCommandQueue() +(in OpenCL 1.x; deprecated +in OpenCL 2.0) -clCreateKernel() Creates a kernel object from the program object. +clCreateKernel() Creates a kernel object from the program object. clCreateBuffer() Creates a buffer object for use via OpenCL kernels. -clSetKernelArg() Set the kernel arguments, and enqueue the kernel in a +clSetKernelArg() Set the kernel arguments, and enqueue the kernel in a clEnqueueNDRangeKernel() command queue. -clEnqueueReadBuffer(), Enqueue a command in a command queue to read from a -clEnqueueWriteBuffer() buffer object to host memory, or write to the buffer +clEnqueueReadBuffer(), Enqueue a command in a command queue to read from a +clEnqueueWriteBuffer() buffer object to host memory, or write to the buffer object from host memory clEnqueueWaitForEvents() Wait for the specified events to complete. -============================= ====================================================== +============================= ====================================================== The commands can be broadly classified into three categories. @@ -1644,7 +1644,7 @@ Running the Program Creating Kernel Objects *********************** -After a program is created and built, the next step is to run the kernel code on the devices. Running the kernel code requires the creation of one or more kernel objects for each kernel function (declared as “ kernel” or “kernel”). Kernel objects are run-time objects that bind the specific kernel function with the argument values to be used while executing it. +After a program is created and built, the next step is to run the kernel code on the devices. 
Running the kernel code requires the creation of one or more kernel objects for each kernel function (declared as " kernel" or "kernel"). Kernel objects are run-time objects that bind the specific kernel function with the argument values to be used while executing it. The clCreateKernel API creates a kernel object from a program object by using the name of the kernel function passed with program object. The arguments to kernel objects are set by the following APIs: @@ -1657,10 +1657,10 @@ SVM pointers as the argument value. A sample kernel definition is shown below. :: - + kernel void sample_kernel( global const uchar *normalPtr, global uchar *svmPtr) - { - … + { + ... } To create a kernel object for the above kernel, you must pass the program object corresponding to the kernel to the clCreateKernel function. Assuming that the program object containing the above kernel function has been created and built as program, a kernel object for the above kernel would be created as follows: @@ -1700,7 +1700,7 @@ A command queue (host or device) is created by using the clCreateCommandQueueWit **Example: To create a default device-side out-of-order command queue with a specific size** :: - + cl_queue_properties prop[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, CL_QUEUE_SIZE, maxQueueSize, 0 }; cl_command_queue commandQueue = clCreateCommandQueueWithProperties(context, deviceId, props, &status); @@ -1709,7 +1709,7 @@ Running a Kernel (from the host) ********************************* After a command queue has been created, the queue can be used to en-queue the commands to the associated device. The clEnqueueNDRangeKernel API en-queues a command to execute a kernel to a device. During the kernel en- queue, one must specify the total number of kernel instances or work-items to be executed by the device and the size of each work-group or block. This information is set by the work_dim, global_work_size, local_work_size and global_work_offset arguments. Like any other command en-queuing API, the clEnqueueNDRangeKernel returns an event object that conveys information about the en-queued kernel and can be used to synchronization other commands dependent on this kernel. In this API, a list of events that need to complete before this particular command can be executed can be specified. -For example, suppose a kernel object and command queue, named “kernel” and “commandQueue” respectively, have already been created. Suppose you want to launch the kernel over a 2-D dimensional space having total work-items +For example, suppose a kernel object and command queue, named "kernel" and "commandQueue" respectively, have already been created. Suppose you want to launch the kernel over a 2-D dimensional space having total work-items {1024x1024} and each block/group size {16x16}. To do this, the kernel can be en-queued into the command queue as follows: | cl_uint workDim = 2; @@ -1740,7 +1740,7 @@ For GPU processing, the OpenCL compiler generates an intermediate representation Profiling OpenCL ============================== -This chapter discusses how to profile OpenCL programs running on AMD GPU and CPU compute devices. The preferred method is to debug with AMD CodeXL, as described in “AMD CodeXL GPU Debugger.” The second method, described in “Debugging CPU Kernels with GDB,” is to use experimental features provided by ROCm (GNU project debugger, GDB) to debug kernels on x86 CPUs running Linux. 
+This chapter discusses how to profile OpenCL programs running on AMD GPU and CPU compute devices. The preferred method is to debug with AMD CodeXL, as described in "AMD CodeXL GPU Debugger." The second method, described in "Debugging CPU Kernels with GDB," is to use experimental features provided by ROCm (GNU project debugger, GDB) to debug kernels on x86 CPUs running Linux. .. _AMD-CodeXL-GPU: @@ -1749,7 +1749,7 @@ Downloading and installing CodeXL and Radeon Compute Profiler Download the latest version of CodeXL from the CodeXL home page: http://developer.amd.com/tools-and-sdks/opencl-zone/codexl/ -Radeon Compute Profiler is a performance analysis tool that gathers data from the API run-time and GPU for OpenCL™ and ROCm/HSA applications +Radeon Compute Profiler is a performance analysis tool that gathers data from the API run-time and GPU for OpenCL(TM) and ROCm/HSA applications RCP is installed when you you use rocm-dev upon instal of the driver. You can access the source code at https://github.com/GPUOpen-Tools/RCP @@ -1759,20 +1759,20 @@ Either install the tar archive, or install the .deb package. **Tar archive:** -1. Download the AMD_CodeXL_Linux*.tar.gz 64-bit Linux tar package at https://github.com/GPUOpen-Tools/CodeXL/releases +1. Download the AMD_CodeXL_Linux*.tar.gz 64-bit Linux tar package at https://github.com/GPUOpen-Tools/CodeXL/releases 2. Run: - $ tar –xvzf CodeXL_Linux*.tar.gz + $ tar -xvzf CodeXL_Linux*.tar.gz **Debian package :** 1. Download the ``amdcodexl-*.deb 64-bit Linux Debian package.`` 2. Run: ``$ sudo dpkg -i amdcodexl_x.x.x-1_amd64.deb `` - + 3. Run: ``$ sudo apt-get -f install`` -Or build the project from source code https://github.com/GPUOpen-Tools/CodeXL +Or build the project from source code https://github.com/GPUOpen-Tools/CodeXL Using CodeXL for profiling ########################### @@ -1784,7 +1784,7 @@ Two modes in CodeXL are particularly useful for profiling: GPU Profile Mode ***************** -The GPU Profile Mode helps developers analyze and profile OpenCL™ host and device code. Developers can profile the entire application or only the kernels by using one of the following modes: +The GPU Profile Mode helps developers analyze and profile OpenCL(TM) host and device code. Developers can profile the entire application or only the kernels by using one of the following modes: * Entire application profile: Collect application trace mode * Kernel profile: Collect GPU performance counter mode @@ -1803,13 +1803,13 @@ While running your application in the GPU Profile mode, CodeXL collects valuable * **Timeline visualization:** Visualize host and device execution in a timeline chart - View number of OpenCL™ contexts and command queues created and the relationships between these items + View number of OpenCL(TM) contexts and command queues created and the relationships between these items - View data transfer operations and kernel executions on the device + View data transfer operations and kernel executions on the device Determine proper synchronization and load balancing - - + + .. image:: images/4.3.png :align: center @@ -1818,30 +1818,30 @@ While running your application in the GPU Profile mode, CodeXL collects valuable Includes a helpful list of best practices Includes recommendations to improve program performance - + * **Summary pages:** Find top bottlenecks I/O bound Compute bound - - + + .. 
image:: images/4.4.png :align: center - * **Kernel occupancy:** Estimate OpenCL™ kernel occupancy for AMD APUs and GPUs + * **Kernel occupancy:** Estimate OpenCL(TM) kernel occupancy for AMD APUs and GPUs Visual indication of the limiting kernel resources for number of wavefronts in flight View the maximum number of wavefronts in flight limited by - –Work group size + -Work group size - –Number of allocated scalar or vector registers + -Number of allocated scalar or vector registers - –Amount of allocated LDS + -Amount of allocated LDS - –View the maximum resource limit for the GPU device + -View the maximum resource limit for the GPU device .. image:: images/4.5.png @@ -1859,9 +1859,9 @@ The Analyze Mode provides a nice way to begin writing your kernel and to compile The Analyze Mode allows a user to do the following: -* **Edit your OpenCL™ kernel inside CodeXL editor** +* **Edit your OpenCL(TM) kernel inside CodeXL editor** Create a new file - Drag and drop an existing OpenCL™ kernel file + Drag and drop an existing OpenCL(TM) kernel file * **Highlight keywords** The CodeXL editor highlights keywords for easier editing @@ -1874,9 +1874,9 @@ The Analyze Mode allows a user to do the following: * Choose your target device The Analyze Mode enables to compile to any supported device target, without the need to install the device -* Fix OpenCL™ compiler errors and warnings in which the kernel file is the only input +* Fix OpenCL(TM) compiler errors and warnings in which the kernel file is the only input View OpenCL compilation errors and fix immediately. -* Edit OpenCL™ Compiler options with an easy options tab +* Edit OpenCL(TM) Compiler options with an easy options tab CodeXL summarizes all the OpenCL options so that it is easy to use them. @@ -1916,15 +1916,15 @@ The following list contains the major static C++ features supported by this exte * Kernel and function overloading. * Inheritance: - | – Strict inheritance. - | – Friend classes. - | – Multiple inheritance. + | - Strict inheritance. + | - Friend classes. + | - Multiple inheritance. * Templates: - | –Kernel templates. - | –Member templates. - | –Template default argument. - | –Limited class templates (the virtual. keyword is not exposed). - | –Partial template specialization + | -Kernel templates. + | -Member templates. + | -Template default argument. + | -Limited class templates (the virtual. keyword is not exposed). + | -Partial template specialization * Namespaces. * References. * this operator. @@ -1946,7 +1946,7 @@ Static C++ features not supported by this extension are: * The language specified in this extension can be easily expanded to support these features. Relations with ISO/IEC C++ -*************************** +*************************** This extension focuses on documenting the differences between the OpenCL Static C++ kernel language and the ISO/IEC Programming languages C++ specification. Where possible, this extension leaves technical definitions to the ISO/IEC specification. @@ -1983,7 +1983,7 @@ To compile a program that contains static C++ kernels and functions, the applica where language is defined as one of the following: - * clc – the source language is considered to be OpenCL C, as defined in the + * clc - the source language is considered to be OpenCL C, as defined in the The OpenCL Programming Language version 1.21. * clc++ - the source language is considered to be OpenCL C++, as defined in the following sections of the this document. 
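For illustration, a host-side call that selects the C++ kernel language at build time might look like the following sketch; it assumes ``program`` and ``device`` have already been created with ``clCreateProgramWithSource()`` and ``clGetDeviceIDs()``, as shown earlier in this chapter::

	// Build the program object as OpenCL Static C++ rather than OpenCL C.
	// "program" and "device" are assumed to exist already (hypothetical names).
	const char * options = "-x clc++";
	cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);
	if (err != CL_SUCCESS)
	{
	    // On failure, retrieve the build log with clGetProgramBuildInfo(),
	    // as in the error-checking example shown earlier.
	}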
@@ -2036,7 +2036,7 @@ As per of the static C++ language specification, a number of restrictions limit Also, the rules for well-formed programs as defined by Section 13 of the static C++ language specification are lifted to apply to both kernel and function declarations. -The overloading resolution is per Section 13.1 of the static C++ language specification, but extended to account for vector types. The algorithm for “best viable function”, Section 13.3.3 of the static C++ language specification, is extended for vector types by inducing a partial-ordering as a function of the partial-ordering of its elements. Following the existing rules for vector types in the OpenCL 1.2 specification, explicit conversion between vectors is not allowed. (This reduces the number of possible overloaded functions with respect to vectors, but this is not expected to be a particular burden to developers because explicit conversion can always be applied at the point of function evocation.) +The overloading resolution is per Section 13.1 of the static C++ language specification, but extended to account for vector types. The algorithm for "best viable function", Section 13.3.3 of the static C++ language specification, is extended for vector types by inducing a partial-ordering as a function of the partial-ordering of its elements. Following the existing rules for vector types in the OpenCL 1.2 specification, explicit conversion between vectors is not allowed. (This reduces the number of possible overloaded functions with respect to vectors, but this is not expected to be a particular burden to developers because explicit conversion can always be applied at the point of function evocation.) For overloaded kernels, the following syntax is used as part of the kernel name: @@ -2103,7 +2103,7 @@ Examples Passing a Class from the Host to the Device and Back ****************************************************** -The class definition must be the same on the host code and the device code, besides the members’ type in the case of vectors. If the class includes vector data types, the definition must conform to the table that appears on Section 6.1.2 +The class definition must be the same on the host code and the device code, besides the members' type in the case of vectors. If the class includes vector data types, the definition must conform to the table that appears on Section 6.1.2 of the OpenCL Programming Specification 1.2, Corresponding API type for @@ -2136,10 +2136,10 @@ OpenCL Language types. int x; } - MyFunc () + MyFunc () { tempClass = new(Test); - ... // Some OpenCL startup code – create context, queue, etc. + ... // Some OpenCL startup code - create context, queue, etc. cl_mem classObj = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Test), &tempClass, event); clEnqueueMapBuffer(...,classObj,...); tempClass.setX(10); @@ -2147,17 +2147,17 @@ OpenCL Language types. clEnqueueNDRange(..., fooKernel, ...); clEnqueueMapBuffer(...,classObj,...); //class is passed back to the Host } - + Kernel Overloading ******************* This example shows how to define and use mangled_name for kernel overloading, and how to choose the right kernel from the host code. 
Assume the following kernels are defined: -:: +:: __attribute__((mangled_name(testAddFloat4))) kernel void testAdd(global float4 * src1, global float4 * src2, global float4 * dst) - { + { int tid = get_global_id(0); dst[tid] = src1[tid] + src2[tid]; } @@ -2215,7 +2215,7 @@ OpenCL 2.0 and 2.1 features are provided with the ROCm 2.4 OpenCL Language Runti For guidelines on how to migrate from OpenCL 1.2 to OpenCL 2.1 and for information about querying for image- and device-specific extensions, see Portability considerations. -For a list of the new and deprecated functions, “New and deprecated functions in OpenCL 2.0.” +For a list of the new and deprecated functions, "New and deprecated functions in OpenCL 2.0." .. _Shared-virtual-Memory: @@ -2234,7 +2234,7 @@ Support for SVM does not imply or require that the host and the OpenCL devices i A caveat, however, concerns situations in which the host and the OpenCL devices access the same region of memory at the same time. It would be highly inefficient for the host and the OpenCL devices to have a consistent view of the memory for each load/store from any device/host. In general, the memory model of the language or architecture implementation determines how or when a memory location written by one thread or agent is visible to another. The memory model also determines to what extent the programmer can control the scope of such accesses. -OpenCL 2.0 adopts the memory model defined in C++11 with some extensions. The memory orders taken from C++11 are: "relaxed", "acquire", "release", “acquire-release”, and "sequential consistent". +OpenCL 2.0 adopts the memory model defined in C++11 with some extensions. The memory orders taken from C++11 are: "relaxed", "acquire", "release", "acquire-release", and "sequential consistent". OpenCL 2.0 introduces a new (C++11-based) set of atomic operations with specific memory-model based semantics. Atomic operations are indivisible: a thread or agent cannot see partial results. The atomic operations supported are: @@ -2255,7 +2255,7 @@ OpenCL 2.0 introduces the concept of "memory scope", which limits the extent to OpenCL 2.0 further differentiates between coarse-grained SVM buffer sharing and fine-grained SVM (buffer and system) sharing mechanisms. These mechanisms define the granularity at which the SVM buffers are shared. Updates to coarse-grained or fine-grained SVM are visible to other devices at synchronization points: - + * For coarse-grained SVM, the synchronization points are: the mapping or un- mapping of the SVM memory and kernel launch or completion. This means that any updates are visible only at the end of the kernel or at the point of un-mapping the region of memory. Coarse-grained buffer memory has a fixed virtual address for all the devices it is allocated on. In the AMD implementation, the physical memory is allocated on Device Memory. @@ -2292,7 +2292,7 @@ Some applications do not require fine-grained atomics to ensure that the SVM is For example, while searching in parallel on a binary search tree , coarse-grain buffers are usually sufficient. In general, coarse-grain buffers provide faster access compared to fine grain buffers as the memory is not required to be consistent across devices. 
-:: +:: for (i = 0; i < keys_per_wi; i++) { key = search_keys[init_id + i]; tmp_node = root; @@ -2316,10 +2316,10 @@ The host creates two buffers, svmTreeBuf and svmSearchBuf, to hold the given tre The next task is to create the tree and populate the svmTreeBuf using ``clSVMEnqueueMap`` and ``clSVMEnqueueUnmap``. The host-code method, cpuCreateBinaryTree, illustrates this mechanism; note the calls to these map/unmap APIs. The host then creates the keys to be searched in svmSearchBuf, as the cpuInitSearchKeys method illustrates. Next, it enqueues the kernel to search the binary tree for the given keys in the svmSearchBuf, and it sets the parameters to the kernel using clSetKernelArgSVMPointer: -:: +:: int status = clSetKernelArgSVMPointer(sample_kernel, 0, (void *)(svmTreeBuf)); - + status = clSetKernelArgSVMPointer(sample_kernel, 1, (void *)(svmSearchBuf)); Note that the routine passes both svmTreeBuf and svmSearchBuf to the kernel as parameters. The following node structure demonstrates how to create the tree on the host using pointers to the left and right children: @@ -2363,7 +2363,7 @@ Updates to the tree occur on the host (CPU) or on the GPU, but not on both simul Because the tree is created on the host, and because OpenCL 1.2 disallows SVM, implementing these steps is difficult in OpenCL 1.2. In OpenCL 1.2, you must store the tree as arrays, copy the arrays to the GPU memory (specifying the appropriate offsets), and then copy the arrays back to the host. -The “data” is the tree created by the host as a coarse-grain buffer and is passed to the kernel as an input pointer. +The "data" is the tree created by the host as a coarse-grain buffer and is passed to the kernel as an input pointer. .. image:: images/6.1.png :align: center @@ -2392,15 +2392,15 @@ Generic example **************** In OpenCL 1.2, the developer needed to write three functions for a pointer p that can reference the local, private, or global address space:: - - void fooL (local int *p) { … } - void fooP (private int *p) { … } - void fooG (global int *p) { … } - + + void fooL (local int *p) { ... } + void fooP (private int *p) { ... } + void fooG (global int *p) { ... } + In OpenCL 2.0, the developer needs to write only one function:: - + void foo (int *p) As foo is a generic function, the compiler will accept calls to it with pointers to any address space except the constant address space. @@ -2421,7 +2421,7 @@ OpenCL sample, addMul2d is a generic function that uses generic address spaces f :: float4 addMul2D (uchar4 *src, float *filter, int2 filterDim, int width) - { + { int i, j; float4 sum = (float4)(0); for(i = 0; i < (filterDim.y); i++) @@ -2450,7 +2450,7 @@ OpenCL 2.0 allows kernels to enqueue other kernels. It provides a new construct, kernels. In addition, OpenCL 2.0 deprecates the run-time API call ``clCreateCommandQueue``, in favor of a new call, ``clCreateCommandQueueWithProperties``, that can create device-side command queues. -Because it eliminates the overhead of returning kernel-launch control to the host, device-side enqueue can in many cases improve application performance. Some platforms (such as AMD’s) provide a standard way of enqueuing work to the hardware, which can further improve the performance. Device-side enqueue has been observed to reduce by the overhead of enqueuing by more than 3x in some cases. +Because it eliminates the overhead of returning kernel-launch control to the host, device-side enqueue can in many cases improve application performance. 
Some platforms (such as AMD's) provide a standard way of enqueuing work to the hardware, which can further improve the performance. Device-side enqueue has been observed to reduce by the overhead of enqueuing by more than 3x in some cases. Applications that are inherently recursive or that require additional processing can derive particular benefit. A classic example of the latter case is a tree search that discovers new nodes when traversing from the root to the leaves. @@ -2458,19 +2458,19 @@ Device enqueue is also useful in determining when all the workgroups of the pare Workgroup/subgroup-level functions *********************************** -OpenCL 2.0 introduces new built-in functions that operate at the workgroup or subgroup level. (A workgroup comprises one or more subgroups; the vendor handles the exact subgroup implementation.) For example, on AMD platforms, a subgroup maps to a “wavefront”. (For details, see the AMD OpenCL User Guide.) +OpenCL 2.0 introduces new built-in functions that operate at the workgroup or subgroup level. (A workgroup comprises one or more subgroups; the vendor handles the exact subgroup implementation.) For example, on AMD platforms, a subgroup maps to a "wavefront". (For details, see the AMD OpenCL User Guide.) Basically, a wavefront is an execution unit on the GPU. The OpenCL specification requires that all work items in a workgroup/subgroup executing the kernel handle these new functions; otherwise, their results may be undefined. OpenCL 2.0 defines the following new built-in functions. Note that it also defines similar functions for subgroups under the cl_khr_subgroups extensions in CL_DEVICE_EXTENSIONS. -1. work_group_all and work_group_any: These functions test a given predicate on all work items in the workgroup. The “all” version effectively performs an AND operation on all predicates and returns the result to all work items; similarly, the “any” operation performs an OR operation. Thus, using the “all” function returns true if the predicate is true for all work items; “any” returns true if it is true for at least one work item. +1. work_group_all and work_group_any: These functions test a given predicate on all work items in the workgroup. The "all" version effectively performs an AND operation on all predicates and returns the result to all work items; similarly, the "any" operation performs an OR operation. Thus, using the "all" function returns true if the predicate is true for all work items; "any" returns true if it is true for at least one work item. 2. work_group_broadcast: This function broadcasts a local value from each work item to all the others in the workgroup. 3. work_group_reduce: Given an operation, work_group_reduce performs the reduction operation on all work items and returns the result. The operation can be min, max or add. For example, when called for an array using the add operation, the function returns the sum of the array elements. -4. work_group_inclusive/exclusive_scan: The “scan” operation is a prefix operation, which performs a reduction up to the work-item ID. If it includes the current ID, the function applies an inclusive scan; otherwise, if it covers everything up to but not including the current work item, it applies an exclusive scan. Again, the operation can be min, max or add. +4. work_group_inclusive/exclusive_scan: The "scan" operation is a prefix operation, which performs a reduction up to the work-item ID. 
If it includes the current ID, the function applies an inclusive scan; otherwise, if it covers everything up to but not including the current work item, it applies an exclusive scan. Again, the operation can be min, max or add. OpenCL 2.0 introduces a Khronos sub-group extension. Sub-groups are a logical abstraction of the hardware SIMD execution model akin to wavefronts, warps, or vectors and permit programming closer to the hardware in a vendor-independent manner. This extension includes a set of cross-sub-group built-in functions that match the set of the cross-work-group built-in functions specified above. @@ -2547,7 +2547,7 @@ The kernel is rewritten in OpenCL 2.0 to enqueue itself. (For full details, see Finally, the kernel launches itself again using device enqueue, but with new bounds: -:: +:: void (^binarySearch_device_enqueue_wrapper_blk)(void) = ^{binarySearch_device_enqueue_multiKeys_child(outputArray, @@ -2561,12 +2561,12 @@ Finally, the kernel launches itself again using device enqueue, but with new bou int err_ret = enqueue_kernel(defQ,CLK_ENQUEUE_FLAGS_WAIT_KERNEL,ndrange1,binarySe arch_device_enqueue_wrapper_blk); It also checks for missing keys; absent any such keys, the search stops by forgoing further enqueues:: - - /**** Search continues only if at least one key is found in previous search ****/ + + /**** Search continues only if at least one key is found in previous search ****/ int Flag = atomic_load_explicit(&,memory_order_seq_cst); if(Flag == 0) - return; - + return; + The advantage is that when the input array is large, the OpenCL 2.0 version divides the input array into 1024-sized chunks. The chunk in which the given key falls is found and another kernel is enqueued which further divides it into 1024- sized chunks, and so on. In OpenCL 1.2, as the whole array is taken as the NDRange, a huge number of work groups require processing. @@ -2604,9 +2604,9 @@ Atomic Loads/Stores This sample illustrates atomic loads/stores with the use of memory orders. The first step is to create this memory on the host:: - + buffer = (int * ) clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER, (N+1)*sizeof(int), 4); - + atomicBuffer = (int * ) clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, (N+1)*sizeof(int), 4); @@ -2633,7 +2633,7 @@ The kernel next stores (100+i), where i is the ID of the work-item into atomicBu After the atomic operation, the updates on fine-grain variables (such as buffer) will also be available at the host. The CPU checks for the following to ensure that the results are OK: :: - + for (i=0;i*)&atomicBuffer[i], std::memory_order_acquire) != (100+i)); /* check the results now */ @@ -2728,12 +2728,12 @@ Pipe. The memory allocated in the above function can be passed to kernels as read- only or write-only pipes. The pipe objects can only be passed as kernel arguments or kernel functions and cannot be declared inside a kernel or as program-scoped objects. Also, a set of built-in functions have been added to operate on the pipes. The important ones are: - + read_pipe (pipe p, gentype * ptr: for reading packet from pipe p into ptr. - + write_pipe (pipe p, gentype * ptr: for writing packet pointed to by ptr to pipe p. -To ensure you have enough space in the pipe structure for reading and writing (before you actually do it), you can use built-in functions to “reserve” enough space. For example, you could reserve room by calling reserve_read_pipe or reserve_write_pipe. 
These functions return a reservation ID, which can be used when the actual operations are performed. Similarly, the standard has built-in functions for workgroup level reservations, such as work_group_reserve_read_pipe and work_group_reserve_write_pipe and for the workgroup order (in the program). These workgroup built-in functions operate at the workgroup level. Ordering across workgroups is undefined. Calls to commit_read_pipe and commit_write_pipe, as the names suggest, commit the actual operations (read/write). +To ensure you have enough space in the pipe structure for reading and writing (before you actually do it), you can use built-in functions to "reserve" enough space. For example, you could reserve room by calling reserve_read_pipe or reserve_write_pipe. These functions return a reservation ID, which can be used when the actual operations are performed. Similarly, the standard has built-in functions for workgroup level reservations, such as work_group_reserve_read_pipe and work_group_reserve_write_pipe and for the workgroup order (in the program). These workgroup built-in functions operate at the workgroup level. Ordering across workgroups is undefined. Calls to commit_read_pipe and commit_write_pipe, as the names suggest, commit the actual operations (read/write). Usage ****** @@ -2748,8 +2748,8 @@ The host creates the pipe, which both kernels will use, as follows: &status); This code makes a pipe that the program kernels can access (read/write). The host creates two kernels, producer_kernel and consumer_kernel. The producer kernel first reserves enough space for the write pipe:: - - //reserve space in pipe for writing random numbers. + + //reserve space in pipe for writing random numbers. reserve_id_t rid = work_group_reserve_write_pipe(rng_pipe, szgr); Next, the kernel writes and commits to the pipe by invoking the following functions: @@ -2760,14 +2760,14 @@ Next, the kernel writes and commits to the pipe by invoking the following functi //reserve pipe for reading reserve_id_t rid = work_group_reserve_read_pipe(rng_pipe, szgr); if(is_valid_reserve_id(rid)) { - //read random number from the pipe. read_pipe(rng_pipe,rid,lid, &rn); work_group_commit_read_pipe(rng_pipe, rid); + //read random number from the pipe. read_pipe(rng_pipe,rid,lid, &rn); work_group_commit_read_pipe(rng_pipe, rid); } The consumer_kernel then uses this set of random number and constructs the histogram. The CPU creates the same histogram and verifies whether the histogram created by the kernel is correct. Here, lid is the local id of the work item, obtained by get_local_id(0). The example code demonstrates how you can use a pipe as a convenient data structure that allows two kernels to communicate. -In OpenCL 1.2, this kind of communication typically involves the host – although kernels can communicate without returning control to the host. Pipes, however, ease programming by reducing the amount of code that some applications require. +In OpenCL 1.2, this kind of communication typically involves the host - although kernels can communicate without returning control to the host. Pipes, however, ease programming by reducing the amount of code that some applications require. .. 
_Program-scope-global-Variables: @@ -2815,7 +2815,7 @@ Creating sRGB image objects is similar to creating an image object of existing s :: - cl_image_format imageFormat; + cl_image_format imageFormat; imageFormat.image_channel_data_type = CL_UNORM_INT8; imageFormat.image_channel_order = CL_sRGBA cl_mem imageObj = clCreateImage( @@ -2836,7 +2836,7 @@ The following is a kernel sample that illustrates how to read an sRGB image obje :: - // Read sRGBA image object (input) and convert it to linear RGB + // Read sRGBA image object (input) and convert it to linear RGB values(results) kernel void sample_kernel( read_only image2d_t input, sampler_t imageSampler, global float *xOffsets, global float *yOffsets, global float4 *results ) // input: sRGBA image object @@ -2939,7 +2939,7 @@ The name of extension is standardized and must contain the following elements wi * cl_khr_ - for extensions approved by Khronos Group. For example: ``cl_khr_fp64`` * cl_ext_ - for extensions provided collectively by multiple vendors. For example: ``cl_ext_device_fission`` - * cl__ – for extension provided by a specific vendor. For example: ``cl_amd_media_ops`` + * cl__ - for extension provided by a specific vendor. For example: ``cl_amd_media_ops`` The OpenCL Specification states that all API functions of the extension must have names in the form of clKHR, clEXT, or cl. All enumerated values must be in the form of CL__KHR, CL__EXT, or CL__. @@ -2967,17 +2967,17 @@ There are special directives for the OpenCL compiler to enable or disable availa #pragma OPENCL EXTENSION all: -The is described in Section A.1, “Extension Name -Convention.”. The second form allows to address all extensions at once. The token can be either: +The is described in Section A.1, "Extension Name +Convention.". The second form allows to address all extensions at once. The token can be either: -* **enable** - the extension is enabled if it is supported, or the error is reported if the specified extension is not supported or token “all” is used. +* **enable** - the extension is enabled if it is supported, or the error is reported if the specified extension is not supported or token "all" is used. * **disable** - the OpenCL implementation/compiler behaves as if the specified extension does not exist. * **all** - only core functionality of OpenCL is used and supported, all extensions are ignored. If the specified extension is not supported then a warning is issued by the compiler. The order of directives in #pragma OPENCL EXTENSION is important: a later directive with the same extension name overrides any previous one. The initial state of the compiler is set to ignore all extensions as if it was explicitly set with the following directive:: - + #pragma OPENCL EXTENSION all : disable This means that the extensions must be explicitly enabled to be used in kernel programs. @@ -2998,7 +2998,7 @@ Use the following function to get an extension function pointer. This returns the address of the extension function specified by the FunctionName string. The returned value must be appropriately cast to a function pointer type, specified in the extension spec and header file. -A return value of NULL means that the specified function does not exist in the CL implementation. A non-NULL return value does not guarantee that the extension function actually exists – queries described in sec. 2 or 3 must be done to ensure the extension is supported. +A return value of NULL means that the specified function does not exist in the CL implementation. 
A non-NULL return value does not guarantee that the extension function actually exists - queries described in sec. 2 or 3 must be done to ensure the extension is supported. The ``clGetExtensionFunctionAddress()`` function cannot be used to get core API function addresses. @@ -3007,16 +3007,16 @@ List of Supported Extensions that are Khronos-Approved For a complete list of the supported extensions, see the OpenCL 1.2 and OpenCL 2.0 specification documents. The typical extensions in OpenCL 1.2 are: -* cl_khr_global_int32_base_atomics – basic atomic operations on 32-bit integers in global memory. -* cl_khr_global_int32_extended_atomics – extended atomic operations on 32-bit integers in global memory. -* cl_khr_local_int32_base_atomics – basic atomic operations on 32-bit integers in local memory. -* cl_khr_local_int32_extended_atomics – extended atomic operations on 32-bit integers in local memory. -* cl_khr_int64_base_atomics – basic atomic operations on 64-bit integers in both global and local memory. -* cl_khr_int64_extended_atomics – extended atomic operations on 64-bit integers in both global and local memory. -* cl_khr_3d_image_writes – supports kernel writes to 3D images. -* cl_khr_byte_addressable_store – this eliminates the restriction of not allowing writes to a pointer (or array elements) of types less than 32-bit wide in kernel program. -* cl_khr_gl_sharing – allows association of OpenGL context or share group with CL context for interoperability. -* cl_khr_icd – the OpenCL Installable Client Driver (ICD) that lets developers select from multiple OpenCL runtimes which may be installed on a system. +* cl_khr_global_int32_base_atomics - basic atomic operations on 32-bit integers in global memory. +* cl_khr_global_int32_extended_atomics - extended atomic operations on 32-bit integers in global memory. +* cl_khr_local_int32_base_atomics - basic atomic operations on 32-bit integers in local memory. +* cl_khr_local_int32_extended_atomics - extended atomic operations on 32-bit integers in local memory. +* cl_khr_int64_base_atomics - basic atomic operations on 64-bit integers in both global and local memory. +* cl_khr_int64_extended_atomics - extended atomic operations on 64-bit integers in both global and local memory. +* cl_khr_3d_image_writes - supports kernel writes to 3D images. +* cl_khr_byte_addressable_store - this eliminates the restriction of not allowing writes to a pointer (or array elements) of types less than 32-bit wide in kernel program. +* cl_khr_gl_sharing - allows association of OpenGL context or share group with CL context for interoperability. +* cl_khr_icd - the OpenCL Installable Client Driver (ICD) that lets developers select from multiple OpenCL runtimes which may be installed on a system. * cl_khr_d3d10_sharing - allows association of D3D10 context or share group with CL context for interoperability. * cl_dx9_media_sharing * Cl_khr_fp16 @@ -3048,7 +3048,7 @@ The typical extensions in OpenCL 2.0 are: cl_ext Extensions ********************** -* cl_ext_device_fission - Support for device fission in OpenCL™. For more information about this extension, see: http://www.khronos.org/registry/cl/extensions/ext/cl_ext_device_fission.txt +* cl_ext_device_fission - Support for device fission in OpenCL(TM). For more information about this extension, see: http://www.khronos.org/registry/cl/extensions/ext/cl_ext_device_fission.txt * cl_ext_atomic_counters_32 - Support for 32-bit atomic counters. 
For more information about this extension, see: https://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt @@ -3060,7 +3060,7 @@ This section describes the AMD vendor-specific extensions. cl_amd_fp64 *************** -Before using double data types, double-precision floating point operators, and/or double-precision floating point routines in OpenCL™ C kernels, include the +Before using double data types, double-precision floating point operators, and/or double-precision floating point routines in OpenCL(TM) C kernels, include the #pragma OPENCL EXTENSION cl_amd_fp64 : enable directive. See Table A.1 for a list of supported routines. cl_amd_vec3 @@ -3109,23 +3109,23 @@ cl_amd_compile_options *********************** This extension adds the following options, which are not part of the OpenCL specification. -* -g — This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or cygwin/minGW under Windows. For more details, see Chapter 4, “Debugging and Profiling OpenCL.” This option does not affect the default optimization of the OpenCL code. -* -O0 — Specifies to the compiler not to optimize. This is equivalent to the +* -g -- This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or cygwin/minGW under Windows. For more details, see Chapter 4, "Debugging and Profiling OpenCL." This option does not affect the default optimization of the OpenCL code. +* -O0 -- Specifies to the compiler not to optimize. This is equivalent to the OpenCL standard option -cl-opt-disable. -* -f[no-]bin-source — Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, the source is NOT generated. -* -f[no-]bin-llvmir — Does [not] generate LLVM IR in the .llvmir section. - For more information, see Appendix C, “OpenCL Binary Image Format (BIF) - v2.0.” By default, LLVM IR IS generated. -* -f[no-]bin-amdil — Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By Default, AMD IL is NOT generated. -* -f[no-]bin-exe — Does [not] generate the executable (ISA) in .text section. - For more information, see Appendix C, “OpenCL Binary Image Format (BIF) - v2.0.” By default, the executable IS generated. +* -f[no-]bin-source -- Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, the source is NOT generated. +* -f[no-]bin-llvmir -- Does [not] generate LLVM IR in the .llvmir section. + For more information, see Appendix C, "OpenCL Binary Image Format (BIF) + v2.0." By default, LLVM IR IS generated. +* -f[no-]bin-amdil -- Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By Default, AMD IL is NOT generated. +* -f[no-]bin-exe -- Does [not] generate the executable (ISA) in .text section. + For more information, see Appendix C, "OpenCL Binary Image Format (BIF) + v2.0." By default, the executable IS generated. * -f[no-]bin-hsail Does [not] generate HSAIL/BRIG in the binary. By default, HSA IL/BRIG is NOT generated. To avoid source changes, there are two environment variables that can be used to change CL options during the runtime. -* AMD_OCL_BUILD_OPTIONS — Overrides the CL options specified in clBuildProgram(). 
-* AMD_OCL_BUILD_OPTIONS_APPEND — Appends options to the options specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS -- Overrides the CL options specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS_APPEND -- Appends options to the options specified in clBuildProgram(). cl_amd_offline_devices *********************** @@ -3137,7 +3137,7 @@ This extension provides the ability to register event callbacks for states other cl_amd_popcnt ************** -This extension introduces a “population count” function called popcnt. This extension was taken into core OpenCL 1.2, and the function was renamed popcount. The core 1.2 popcount function (documented in section 6.12.3 of the OpenCL Specification) is identical to the AMD extension popcnt function. +This extension introduces a "population count" function called popcnt. This extension was taken into core OpenCL 1.2, and the function was renamed popcount. The core 1.2 popcount function (documented in section 6.12.3 of the OpenCL Specification) is identical to the AMD extension popcnt function. cl_amd_media_ops ****************** @@ -3148,14 +3148,14 @@ This extension adds the following built-in functions to the OpenCL language. Not | uint amd_pack(float4 src) | Return value | ((((uint)src[0]) & 0xFF) << 0) + ((((uint)src[1]) & 0xFF) << 8) + ((((uint)src[2]) & 0xFF) << 16) + ((((uint)src[3]) & 0xFF) << 24) - + | | Built-in function: amd_unpack0 | floatn amd_unpack0 (uintn src) | Return value for each vector component | (float)(src[i] & 0xFF) | - + | Built-in function: amd_unpack1 | floatn amd_unpack1 (uintn src) | Return value for each vector component @@ -3170,7 +3170,7 @@ This extension adds the following built-in functions to the OpenCL language. Not | floatn amd_unpack3(uintn src) | Return value for each vector component | (float)((src[i] >> 24) & 0xFF) - + | | Built-in function: amd_bitalign | uintn amd_bitalign (uintn src0, uintn src1, uintn src2) @@ -3185,8 +3185,8 @@ This extension adds the following built-in functions to the OpenCL language. Not | Built-in function: amd_lerp | uintn amd_lerp (uintn src0, uintn src1, uintn src2) | Return value for each vector component - | (((((src0[i] >> 0) & 0xFF) + ((src1[i] >> 0) & 0xFF) + ((src2[i] >> 0) & 1)) >> 1) << 0) + (((((src0[i] >> 8) & 0xFF) + ((src1[i] - | >> 8) & 0xFF) + ((src2[i] >> 8) & 1)) >> 1) << 8) + (((((src0[i] >> 16) & 0xFF) + ((src1[i] >> 16) & 0xFF) + ((src2[i] >> 16) & + | (((((src0[i] >> 0) & 0xFF) + ((src1[i] >> 0) & 0xFF) + ((src2[i] >> 0) & 1)) >> 1) << 0) + (((((src0[i] >> 8) & 0xFF) + ((src1[i] + | >> 8) & 0xFF) + ((src2[i] >> 8) & 1)) >> 1) << 8) + (((((src0[i] >> 16) & 0xFF) + ((src1[i] >> 16) & 0xFF) + ((src2[i] >> 16) & |1)) >> 1) << 16) + (((((src0[i] >> 24) & 0xFF) + ((src1[i] >> 24) & 0xFF) + ((src2[i] >> 24) & 1)) >> 1) << 24) ; | | Built-in function: amd_sad @@ -3207,7 +3207,7 @@ This extension adds the following built-in functions to the OpenCL language. Not | abs(((src0[i] >> 16) & 0xFF) - ((src1[i] >> 16) & 0xFF)) + | abs(((src0[i] >> 24) & 0xFF) - ((src1[i] >> 24) & 0xFF)); | - + | Built-in function: amd_sadhi | uintn amd_sadhi (uintn src0, uintn src1, uintn src2) | Return value for each vector component @@ -3220,21 +3220,21 @@ For more information, see: http://www.khronos.org/registry/cl/extensions/amd/cl_ cl_amd_printf **************** -The OpenCL™ Specification 1.1 and 1.2 support the optional AMD extension cl_amd_printf, which provides printf capabilities to OpenCL C programs. 
To use this extension, an application first must include:: - +The OpenCL(TM) Specification 1.1 and 1.2 support the optional AMD extension cl_amd_printf, which provides printf capabilities to OpenCL C programs. To use this extension, an application first must include:: + #pragma OPENCL EXTENSION cl_amd_printf : enable. Built-in function:: - - printf( constant char * restrict format, …); + + printf( constant char * restrict format, ...); This function writes output to the stdout stream associated with the host application. The format string is a character sequence that: -–is null-terminated and composed of zero and more directives, +-is null-terminated and composed of zero and more directives, -–ordinary characters (i.e. not %), which are copied directly to the output stream unchanged, and +-ordinary characters (i.e. not %), which are copied directly to the output stream unchanged, and -–conversion specifications, each of which can result in fetching zero or more arguments, converting them, and then writing the final result to the output stream. +-conversion specifications, each of which can result in fetching zero or more arguments, converting them, and then writing the final result to the output stream. The format string must be resolvable at compile time; thus, it cannot be dynamically created by the executing program. (Note that the use of variadic arguments in the built-in printf does not imply its use in other built- ins; more importantly, it is not valid to use printf in user-defined functions or kernels.) @@ -3243,55 +3243,55 @@ The OpenCL C printf closely matches the definition found as part of the C99 stan * A 32-bit floating point argument is not converted to a 64-bit double, unless the extension cl_khr_fp64 is supported and enabled, as defined in section 9.3 of the OpenCL Specification 1.1. This includes the double variants if cl_khr_fp64 is supported and defined in the corresponding compilation unit. * 64-bit integer types can be printed using %ld / %lx / %lu . * %lld / %llx / %llu are not supported and reserved for 128-bit integer types (long long). -* All OpenCL vector types (section 6.1.2 of the OpenCL Specification 1.1) can be explicitly passed and printed using the modifier vn, where n can be 2, 3, 4, 8, or 16. This modifier appears before the original conversion specifier for the vector’s component type (for example, to print a float4 %v4f). Since vn is a conversion specifier, it is valid to apply optional flags, such as field width and precision, just as it is when printing the component types. Since a vector is an aggregate type, the comma separator is used between the components: 0:1, … , n-2:n-1. +* All OpenCL vector types (section 6.1.2 of the OpenCL Specification 1.1) can be explicitly passed and printed using the modifier vn, where n can be 2, 3, 4, 8, or 16. This modifier appears before the original conversion specifier for the vector's component type (for example, to print a float4 %v4f). Since vn is a conversion specifier, it is valid to apply optional flags, such as field width and precision, just as it is when printing the component types. Since a vector is an aggregate type, the comma separator is used between the components: 0:1, ... , n-2:n-1. cl_amd_predefined_macros ************************* -The following macros are predefined when compiling OpenCL™ C kernels. These macros are defined automatically based on the device for which the code is being compiled. +The following macros are predefined when compiling OpenCL(TM) C kernels. 
These macros are defined automatically based on the device for which the code is being compiled. GPU devices: - | __Barts__ - | __Bheem__ - | __Bonaire__ - | __Caicos__ - | __Capeverde__ - | __Carrizo__ - | __Cayman__ - | __Cedar__ - | __Cypress__ + | __Barts__ + | __Bheem__ + | __Bonaire__ + | __Caicos__ + | __Capeverde__ + | __Carrizo__ + | __Cayman__ + | __Cedar__ + | __Cypress__ | __Devastator__ - | __Hainan__ - | __Iceland__ - | __Juniper__ - | __Kalindi__ - | __Kauai__ - | __Lombok__ - | __Loveland__ - | __Mullins__ - | __Oland__ - | __Pitcairn__ - | __RV710__ - | __RV730__ - | __RV740__ - | __RV770__ - | __RV790__ - | __Redwood__ - | __Scrapper__ - | __Spectre__ - | __Spooky__ - | __Tahiti__ - | __Tonga__ - | __Turks__ + | __Hainan__ + | __Iceland__ + | __Juniper__ + | __Kalindi__ + | __Kauai__ + | __Lombok__ + | __Loveland__ + | __Mullins__ + | __Oland__ + | __Pitcairn__ + | __RV710__ + | __RV730__ + | __RV740__ + | __RV770__ + | __RV790__ + | __Redwood__ + | __Scrapper__ + | __Spectre__ + | __Spooky__ + | __Tahiti__ + | __Tonga__ + | __Turks__ | __WinterPark__ - | __GPU__ + | __GPU__ CPU devices: - | __CPU__ - | __X86__ - | __X86_64__ + | __CPU__ + | __X86__ + | __X86_64__ Note that GPU or CPU are predefined whenever a GPU or CPU device is the compilation target. @@ -3300,11 +3300,11 @@ An example kernel is provided below. :: #pragma OPENCL EXTENSION cl_amd_printf : enable const char* getDeviceName() { - #ifdef Cayman + #ifdef Cayman return "Cayman"; - #elif Barts + #elif Barts return "Barts"; - #elif Cypress + #elif Cypress return "Cypress"; #elif defined( Juniper ) return "Juniper"; @@ -3334,12 +3334,12 @@ An example kernel is provided below. return "UnknownDevice"; kernel void test_pf(global int* a) { - printf("Device Name: %s\n", getDeviceName()); + printf("Device Name: %s\n", getDeviceName()); } cl_amd_bus_addressable_memory ****************************** -This extension defines an API for peer-to-peer transfers between AMD GPUs and other PCIe device, such as third-party SDI I/O devices. Peer-to-peer transfers have extremely low latencies by not having to use the host’s main memory or the CPU (see Figure A.1). This extension allows sharing a memory allocated by the graphics driver to be used by other devices on the PCIe bus (peer-to-peer transfers) by exposing a write-only bus address. It also allows memory allocated on other PCIe devices (non-AMD GPU) to be directly accessed by AMD GPUs. One possible use of this is for a video capture device to directly write into the GPU memory using its DMA.This extension is supported only on AMD FirePro™ professional graphics cards. +This extension defines an API for peer-to-peer transfers between AMD GPUs and other PCIe device, such as third-party SDI I/O devices. Peer-to-peer transfers have extremely low latencies by not having to use the host's main memory or the CPU (see Figure A.1). This extension allows sharing a memory allocated by the graphics driver to be used by other devices on the PCIe bus (peer-to-peer transfers) by exposing a write-only bus address. It also allows memory allocated on other PCIe devices (non-AMD GPU) to be directly accessed by AMD GPUs. One possible use of this is for a video capture device to directly write into the GPU memory using its DMA.This extension is supported only on AMD FirePro(TM) professional graphics cards. .. 
image:: images/a.1.png @@ -3367,11 +3367,11 @@ Extensions Brazos Llano Trinity Cape Verde3 Turks4 Caym cl_khr_byte_addressable_store Yes Yes Yes Yes Yes Yes Yes Yes cl_ext_device_fission onlyCPU only CPU onlyCPU No No No No No cl_amd_device_attribute_query Yes Yes Yes Yes Yes Yes Yes Yes - cl_khr_fp64 onlyCPU only CPU onlyCPU Yes Yes Yes No Yes + cl_khr_fp64 onlyCPU only CPU onlyCPU Yes Yes Yes No Yes cl_amd_fp64 onlyCPU only CPU onlyCPU Yes Yes Yes No Yes cl_amd_vec3 Yes Yes Yes Yes Yes Yes Yes Yes cl_khr_d3d10_sharing Yes Yes Yes Yes Yes Yes Yes Yes - cl_amd_media_ops Yes Yes Yes Yes Yes Yes Yes Yes + cl_amd_media_ops Yes Yes Yes Yes Yes Yes Yes Yes cl_amd_printf Yes Yes Yes Yes Yes Yes Yes Yes cl_amd_popcnt Yes Yes Yes Yes Yes Yes Yes Yes cl_khr_3d_image_writes Yes Yes Yes Yes Yes Yes Yes Yes @@ -3380,13 +3380,13 @@ Extensions Brazos Llano Trinity Cape Verde3 Turks4 Caym **Table A.1 Extension Support for AMD GPU Devices 1** -1. AMD Radeon™ HD 79XX series. -2. AMD Radeon™ HD 78XX series. -3. AMD Radeon™ HD 77XX series. -4. AMD Radeon™ HD 75XX series and AMD Radeon™ HD 76XX series. -5. AMD Radeon™ HD 69XX series. -6. AMD Radeon™ HD 68XX series. -7. ATI Radeon™ HD 59XX series and 58XX series, AMD FirePro™ V88XX series and V87XX series. +1. AMD Radeon(TM) HD 79XX series. +2. AMD Radeon(TM) HD 78XX series. +3. AMD Radeon(TM) HD 77XX series. +4. AMD Radeon(TM) HD 75XX series and AMD Radeon(TM) HD 76XX series. +5. AMD Radeon(TM) HD 69XX series. +6. AMD Radeon(TM) HD 68XX series. +7. ATI Radeon(TM) HD 59XX series and 58XX series, AMD FirePro(TM) V88XX series and V87XX series. Note that an atomic counter is a device-level counter that can be added / decremented by different work-items, where the atomicity of the operation is guaranteed. The access to the counter is done only through add/dec built-in functions; thus, no two work-items have the same value returned in the case that a given kernel only increments or decrements the counter. (Also see http://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt.) @@ -3435,13 +3435,13 @@ Note that an atomic counter is a device-level counter that can be added / decrem +---------------------------------+------------+-----------+----------+-----------------------------+ | cl_amd_offline_devices | Yes |Yes | Yes | No | +---------------------------------+------------+-----------+----------+-----------------------------+ - + **Table A.2 Extension Support for Older AMD GPUs and CPUs** -1. ATI Radeon™ HD 5700 series, AMD Mobility Radeon™ HD 5800 series, AMD FirePro™ V5800 series, AMD Mobility FirePro™ M7820. -2. ATI Radeon™ HD 5600 Series, ATI Radeon™ HD 5600 Series, ATI Radeon™ HD 5500 Series, AMD Mobility Radeon™ HD 5700 Series, AMD Mobility Radeon™ HD 5600 Series, AMD FirePro™ V4800 Series, AMD FirePro™ V3800 Series, AMD Mobility FirePro™ M5800 -3. ATI Radeon™ HD 5400 Series, AMD Mobility Radeon™ HD 5400 Series +1. ATI Radeon(TM) HD 5700 series, AMD Mobility Radeon(TM) HD 5800 series, AMD FirePro(TM) V5800 series, AMD Mobility FirePro(TM) M7820. +2. ATI Radeon(TM) HD 5600 Series, ATI Radeon(TM) HD 5600 Series, ATI Radeon(TM) HD 5500 Series, AMD Mobility Radeon(TM) HD 5700 Series, AMD Mobility Radeon(TM) HD 5600 Series, AMD FirePro(TM) V4800 Series, AMD FirePro(TM) V3800 Series, AMD Mobility FirePro(TM) M5800 +3. ATI Radeon(TM) HD 5400 Series, AMD Mobility Radeon(TM) HD 5400 Series 4. Available on all devices that have double-precision, including all Southern Island devices. 5. Environment variable CPU_IMAGE_SUPPORT must be set. 
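To make the atomic-counter note above concrete, here is a minimal kernel sketch using the ``cl_ext_atomic_counters_32`` extension (the kernel and buffer names are hypothetical; see the extension specification linked above for the authoritative interface). The counter is touched only through the extension's increment built-in, so every work-item that keeps an element receives a unique output slot.

::

    #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable

    /* Hypothetical stream-compaction kernel: non-zero elements are packed
       into 'out', with the device-level counter handing out unique slots. */
    kernel void compact_nonzero(global const int *in,
                                global int *out,
                                counter32_t next_slot)
    {
        int v = in[get_global_id(0)];
        if (v != 0)
            out[atomic_inc(next_slot)] = v;   /* returns the pre-increment value */
    }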
@@ -3468,7 +3468,7 @@ Using ICD Sample code that is part of the SDK contains examples showing how to query the platform API and call the functions that require a valid platform parameter. This is a pre-ICD code snippet. :: - + context = clCreateContextFromType(0, dType, NULL, @@ -3479,12 +3479,12 @@ This is a pre-ICD code snippet. :: The ICD-compliant version of this code follows. :: - + /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ - + cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); @@ -3523,27 +3523,27 @@ The ICD-compliant version of this code follows. get whatever the * implementation thinks we should be using. */ - + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; /* Use NULL for backward compatibility */ cl_context_properties* cprops = (NULL == platform) ? NULL : cps; - + context = clCreateContextFromType(cprops, dType, NULL, NULL, &status); Another example of a pre-ICD code snippet follows. :: - + status = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &numDevices); - + The ICD-compliant version of the code snippet is:: - + status= clGetDeviceiDs(platform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &nurnDevices); - + .. Note:::: It is recommended that the host code look at the platform vendor string when searching for the desired OpenCL platform, instead of using the platform name string. The platform name string might change, whereas the platform vendor string remains constant for a particular vendor's implementation. .. _BIF: @@ -3565,7 +3565,7 @@ The BIF can have other special sections for debugging, etc. It also contains sev * .rodata for storing the OpenCL runtime control data. * other ELF special sections required for forming an ELF (for example: ``.strtab, .symtab, .shstrtab`` ). -By default, OpenCL generates a binary that has LLVM IR, and the executable for the GPU (,.llvmir, .amdil, and .text sections), as well as LLVM IR and the executable for the CPU (.llvmir and .text sections). The BIF binary always contains a .comment section, which is a readable C string. The default behavior can be changed with the BIF options described in Section C.2, “BIF Options,” page C-3. +By default, OpenCL generates a binary that has LLVM IR, and the executable for the GPU (,.llvmir, .amdil, and .text sections), as well as LLVM IR and the executable for the CPU (.llvmir and .text sections). The BIF binary always contains a .comment section, which is a readable C string. The default behavior can be changed with the BIF options described in Section C.2, "BIF Options," page C-3. The LLVM IR enables recompilation from LLVM IR to the target. When a binary is used to run on a device for which the original program was not generated and the original device is feature-compatible with the current device, OpenCL recompiles the LLVM IR to generate a new code for the device. Note that the LLVM IR is only universal within devices that are feature-compatible in the same device type, not across different device types. This means that the LLVM IR for the CPU is not compatible with the LLVM IR for the GPU. The LLVM IR for a GPU works only for GPU devices that have equivalent feature sets. @@ -3646,11 +3646,11 @@ BIF Options ************* OpenCL provides the following options to control what is contained in the binary. --f[no-]bin-source — [not] generate OpenCL source in .source section. 
+-f[no-]bin-source -- [not] generate OpenCL source in .source section. --f[no-]bin-llvmir — [not] generate LLVM IR in .llvmir section. +-f[no-]bin-llvmir -- [not] generate LLVM IR in .llvmir section. --f[no-]bin-exe — [not] generate the executable (ISA) in .text section. The option syntax follows the GCC option syntax. +-f[no-]bin-exe -- [not] generate the executable (ISA) in .text section. The option syntax follows the GCC option syntax. By default, OpenCL generates the .llvmir section, .amdil section, and .text section. The following are examples for using these options: Example 1: Generate executable for execution: @@ -3700,7 +3700,7 @@ A processing element is arranged as a five-way or four-way (depending on the GPU type) very long instruction word (VLIW) processor (see bottom of Figure D.2). Up to five scalar operations (or four, depending on the GPU type) can be co-issued in a VLIW instruction, each of which are executed on one of the corresponding five ALUs. ALUs can execute single-precision floating point or integer operations. One of the five ALUs also can perform transcendental operations (sine, cosine, logarithm, etc.). Double-precision floating point operations are processed (where supported) by connecting two or four of the ALUs (excluding the transcendental core) to perform a single double-precision operation. The processing element also contains one branch execution unit to handle branch instructions. -Different GPU compute devices have different numbers of processing elements. For example, the ATI Radeon™ HD 5870 GPU has 20 compute units, each with +Different GPU compute devices have different numbers of processing elements. For example, the ATI Radeon(TM) HD 5870 GPU has 20 compute units, each with 16 processing elements, and each processing elements contains five ALUs; this yields 1600 physical ALUs. @@ -4004,7 +4004,7 @@ The following code segment shows how to create an OpenCL-OpenGL interoperability glXDestroyContext(glXGetCurrentDisplay(), gGlCtx); continue; } - else + else { //Interoperable device found std::cout<<"Interoperable device found "<`_ + * `Anaconda(R) with Numba acceleration `_ When to Use Anaconda ********************* -Use Anaconda when you’re handling large-scale data-analytics, +Use Anaconda when you're handling large-scale data-analytics, scientific and engineering problems that require you to manipulate large data arrays. @@ -193,7 +193,7 @@ HC Programming Guide **What is the Heterogeneous Compute (HC) API ?** -It’s a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. We’ll be following these developments closely, and you’ll see HC move quickly to include standard C++ capabilities. +It's a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. 
We'll be following these developments closely, and you'll see HC move quickly to include standard C++ capabilities. The Heterogeneous Compute Compiler (HCC) provides two important benefits: @@ -209,7 +209,7 @@ Ease of development Full control over the machine - * Access AMD scratchpad memories (“LDS”) + * Access AMD scratchpad memories ("LDS") * Fully control data movement, prefetch and discard * Fully control asynchronous kernel launch and completion * Get device-side dependency resolution for kernel and data commands (without host involvement) @@ -360,7 +360,7 @@ HIP provides a C++ syntax that is suitable for compiling most code that commonly * Math functions resembling those in the "math.h" header included with standard C++ compilers * Built-in functions for accessing specific GPU hardware capabilities -This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. +This section describes the built-in variables and functions accessible from the HIP kernel. It's intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. * :ref:`HIP-GUIDE` @@ -408,7 +408,7 @@ hipLaunchKernelGGL(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ C_d, A_d, N); /* arguments to the compute kernel */ -hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); +hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, atomics, and timer functions. It also specifies additional defines and keywords for function types, address spaces, and optimization controls. (See the :ref:`Kernel_language` for a full description). Here's an example of defining a simple 'vector_square' kernel. @@ -485,6 +485,6 @@ OpenCL Best Practices * :ref:`Optimization-Opencl` - - + + diff --git a/Programming_Guides/hcc-guide.rst b/Programming_Guides/hcc-guide.rst index da7d1343..b0a87f8c 100644 --- a/Programming_Guides/hcc-guide.rst +++ b/Programming_Guides/hcc-guide.rst @@ -22,7 +22,7 @@ The Heterogeneous Compute Compiler (HCC) provides two important benefits: **Full control over the machine** - * Access AMD scratchpad memories (“LDS”) + * Access AMD scratchpad memories ("LDS") * Fully control data movement, prefetch and discard * Fully control asynchronous kernel launch and completion * Get device-side dependency resolution for kernel and data commands (without host involvement) diff --git a/Programming_Guides/hcc-profile.rst b/Programming_Guides/hcc-profile.rst index 7e195fc0..80ec2ef0 100644 --- a/Programming_Guides/hcc-profile.rst +++ b/Programming_Guides/hcc-profile.rst @@ -27,7 +27,7 @@ Kernel Commands ++++++++++++++++ This shows the simplest trace output for kernel commands with no additional verbosity flags:: - + $ HCC_PROFILE=2 ./my-hcc-app ... profile: kernel; Im2Col; 17.8 us; profile: kernel; tg_betac_alphaab; 32.6 us; @@ -36,7 +36,7 @@ This shows the simplest trace output for kernel commands with no additional verb :: PROFILE: TYPE; KERNEL_NAME ; DURATION; - + This example shows profiled kernel commands with full verbose output:: $ HCC_PROFILE=2 HCC_PROFILE_VERBOSE=0xf ./my-hcc-app ... @@ -77,7 +77,7 @@ This example shows memory copy commands with full verbose output: * Sync or Async. Synchronous copies indicate the host waits for the completion for the copy. 
Asynchronous copies are launched by the host without waiting for the copy to complete. * Fast or Slow. Fast copies use the GPUs optimized copy routines from the hsa_amd_memory_copy routine. Slow copies typically involve unpinned host memory and can't take the fast path. * For example `HostToDevice_async_fast. - + * DURATION: command duration measured in us. This is measured using the GPU timestamps and represents the command execution on the acclerator device. * START: command start time in ns. (if HCC_PROFILE_VERBOSE & 0x2) * STOP: command stop time in ns. (if HCC_PROFILE_VERBOSE & 0x2) @@ -94,7 +94,7 @@ Barrier commands are only enabled if HCC_PROFILE_VERBOSE 0x An example barrier command with full vebosity:: profile: barrier; deps:0_acq:none_rel:sys; 5.3 us; 94858731419410; 94858731424690; #0.0.2; - PROFILE: TYPE; BARRIER_NAME ; DURATION; START ; STOP ; ID ; + PROFILE: TYPE; BARRIER_NAME ; DURATION; START ; STOP ; ID ; * PROFILE: always "profile:" to distinguish it from other output. * TYPE: the command type: either kernel, copy, copyslo, or barrier. The examples and descriptions in this section are all copy commands. Copy indicates that the runtime used a call to the fast hsa memory copy routine while copyslo indicates that the copy was implemented with staging buffers or another less optimal path. copy computes the commands using device-side timestamps while copyslo computes the bandwidth based on host timestamps. diff --git a/Programming_Guides/hip-programming-guide.rst b/Programming_Guides/hip-programming-guide.rst index 5114b8ca..5762cc35 100644 --- a/Programming_Guides/hip-programming-guide.rst +++ b/Programming_Guides/hip-programming-guide.rst @@ -56,7 +56,7 @@ hipEventSynchronize Developers can control the release scope for hipEvents: -* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. +* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. A stronger system-level fence can be specified when the event is created with hipEventCreateWithFlags: @@ -103,4 +103,4 @@ By default staging buffers are used for unpinned memory transfers. Environment v * HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. * HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. * HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. - + diff --git a/Programming_Guides/hip-programming.rst b/Programming_Guides/hip-programming.rst index 487670d1..3f05b811 100644 --- a/Programming_Guides/hip-programming.rst +++ b/Programming_Guides/hip-programming.rst @@ -1,6 +1,6 @@ .. _hip-pro: - + ##################### HIP Programming Guide ##################### @@ -10,7 +10,7 @@ Host Memory Introduction ------------- - + hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system. 
There are two use cases for this host memory: * Faster HostToDevice and DeviceToHost Data Transfers: The runtime tracks the hipHostMalloc allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with --unpinned and --pinned switches for the hipBusBandwidth tool. @@ -55,7 +55,7 @@ hipEventSynchronize Developers can control the release scope for hipEvents: - * By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. + * By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. A stronger system-level fence can be specified when the event is created with hipEventCreateWithFlags: diff --git a/Programming_Guides/hip_install.rst b/Programming_Guides/hip_install.rst index 05ddcb54..350dae9d 100644 --- a/Programming_Guides/hip_install.rst +++ b/Programming_Guides/hip_install.rst @@ -96,7 +96,7 @@ By default, HIP uses HCC to compile programs. To use HIP-Clang, add -DHIP_COMPIL cd HIP mkdir build cd build - cmake .. + cmake .. make make install @@ -111,7 +111,7 @@ Here's a richer command-line that overrides the default paths: cd HIP mkdir build - cd build + cd build cmake -DHSA_PATH=/path/to/hsa -DHCC_HOME=/path/to/hcc -DCMAKE_INSTALL_PREFIX=/where/to/install/hip -DCMAKE_BUILD_TYPE=Release .. make make install diff --git a/Programming_Guides/hip_port.rst b/Programming_Guides/hip_port.rst index 59d5c498..75f8a786 100644 --- a/Programming_Guides/hip_port.rst +++ b/Programming_Guides/hip_port.rst @@ -44,7 +44,7 @@ Like the CUDA Driver API, the Module API provides additional control over how co ============ ================================= ================== ================= =========== Format APIs NVCC HCC HIP-CLANG -============ ================================= ================== ================= =========== +============ ================================= ================== ================= =========== Code Object hipModuleLoad, hipModuleLoadData .cubin or PTX text .hsaco .hsaco Fat Binary hipModuleLoadFatBin .fatbin Under Development .hip_fatbin ============ ================================= ================== ================= =========== @@ -115,10 +115,10 @@ CUDA applications may want to mix CUDA driver code with HIP code (see example be ============== =============== =================== HIP Type CU Driver Type CUDA Runtime Type ============== =============== =================== -hipModule_t CUmodule -hipFunction_t CUfunction -hipCtx_t CUcontext -hipDevice_t CUdevice +hipModule_t CUmodule +hipFunction_t CUfunction +hipCtx_t CUcontext +hipDevice_t CUdevice hipStream_t CUstream cudaStream_t hipEvent_t CUevent cudaEvent_t hipArray CUarray cudaArray @@ -227,7 +227,7 @@ The below sample shows how to use hipModuleGetFunction. std::vectorargBuffer(2); memcpy(&argBuffer[0], &Ad, sizeof(void*)); memcpy(&argBuffer[1], &Bd, sizeof(void*)); - + size_t size = argBuffer.size()*sizeof(void*); void *config[] = { @@ -274,7 +274,7 @@ HIP supports texture driver APIs however texture reference should be declared in texture tex; - void myFunc () + void myFunc () { // ... 
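To make the event-scope discussion in the HIP programming guide hunks above concrete, the sketch below creates an event that requests a system-scope release when it is recorded (a minimal illustration that assumes the ROCm-specific ``hipEventReleaseToSystem`` flag; error checking is omitted).

::

    #include <hip/hip_runtime.h>

    void publish_results(float *dst, const float *src, size_t nbytes, hipStream_t stream)
    {
        hipEvent_t done;
        // Request a system-scope release (instead of the default device scope)
        // so the preceding writes become visible to the host and peer devices.
        hipEventCreateWithFlags(&done, hipEventReleaseToSystem);

        hipMemcpyAsync(dst, src, nbytes, hipMemcpyDeviceToHost, stream);
        hipEventRecord(done, stream);
        hipEventSynchronize(done);   // host waits; the copy result is now visible system-wide

        hipEventDestroy(done);
    }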
diff --git a/Programming_Guides/hip_profiling.rst b/Programming_Guides/hip_profiling.rst index 8c167d3e..9cea76ec 100644 --- a/Programming_Guides/hip_profiling.rst +++ b/Programming_Guides/hip_profiling.rst @@ -1,5 +1,5 @@ -.. _hip_profiling: +.. _hip_profiling: ################### Profiling HIP Code @@ -25,12 +25,12 @@ Profiling information can viewed in the CodeXL visualization tool or printed dir * :ref:`How to enable profiling at HIP build time` * :ref:`Tracing and Debug` - + * :ref:`Tracing HIP APIs` - * :ref:`Color` - - - + * :ref:`Color` + + + .. _CodeXL Profiling: CodeXL Profiling @@ -57,7 +57,7 @@ Using rocm-profiler performance counter collection rocm-profiler can record performance counter information to provide greater insight inside a kernel, such as the memory bandwidth, ALU busy percentage, and cache statistics. Collecting the common set of useful counters requires passing the counter configuration files for two passes: :: - + $ /opt/rocm/bin/rocm-profiler -C -O --counterfile /opt/rocm/profiler/counterfiles/counters_HSA_Fiji_pass1 --counterfile /opt/rocm/profiler/counterfiles/counters_HSA_Fiji_pass2 .. _Using CodeXL to view profiling results: @@ -109,7 +109,7 @@ HIP can generate markers at function beginning and end which are displayed on th # Use profile to generate timeline view: export HIP_PROFILE_API=1 $ /opt/rocm/bin/rocm-profiler -A -T - + Or $ /opt/rocm/bin/rocm-profiler -e HIP_PROFILE_API=1 -A -T @@ -130,32 +130,32 @@ Markers can be used to define application-specific events that will be recorded Markers have a specific begin and end time, and can be nested. Nested calls are displayed hierarchically in the CodeXL GUI, with each level of the hierarchy occupying a different row. The HIP APis are defined in "hip_profile.h":: - - #include - + + #include + HIP_BEGIN_MARKER(const char *markerName, const char *groupName); - HIP_END_MARKER(); - + HIP_END_MARKER(); + HIP_BEGIN_MARKER("Setup", "MyAppGroup"); // ... // application code for setup // ... HIP_END_MARKER(); - + For C++ codes, HIP also provides a scoped marker which records the start time when constructed and the end time when the scoped marker is destructed at the end of the scope. This provides a convenient, single-line mechanism to record an event that neatly corresponds to a region of code. :: - void FunctionFoo(...) + void FunctionFoo(...) { - HIP_SCOPED_MARKER("FunctionFoo", "MyAppGroup"); // Marker starts recording here. - + HIP_SCOPED_MARKER("FunctionFoo", "MyAppGroup"); // Marker starts recording here. + // ... // Function implementation - // ... - + // ... + // Marker destroyed here and records end time stamp. }; - + The HIP marker API is only supported on ROCm platform. The marker macros are defined on CUDA platforms and will compile, but are silently ignored at runtime. This `HIP sample `_ shows the profiler marker API used in a small application. @@ -177,21 +177,21 @@ Demangling C++ Kernel Names HIP includes the ``hipdemangleatp`` tool which can post-process an ATP file to "demangle" C++ names. Mangled kernel names encode the C++ arguments and other information, and are guaranteed to be unique even for cases such as operator overloading. However, the mangled names can be quite verbose. 
For example: :: - + ZZ39gemm_NoTransA_MICRO_NBK_M_N_K_TS16XMTS4RN2hc16accelerator_viewEPKflS3_lPfliiiiiiffEN3_EC__719__cxxamp_trampolineElililiiiiiiS3_iS3_S4_ff **hipdemangleatp** will convert this into the more readable:: - + gemm_NoTransA_MICRO_NBK_M_N_K_TS16XMTS4 The hipdemangleatp tool operates on the ATP file "in-place" and thus replaces the input file with the demangled version. :: - + $ hipdemangleatp myfile.atp The kernel name is also shown in some of the summary htlm files (Top10 kernels). These can be regenerated from the demangled ATP file by re-running rocm-profiler: :: - + $ rocm-profiler -T --atpfile myfile.atp A future version of CodeXL may directly integrate demangle functionality. @@ -230,7 +230,7 @@ Reducing timeline trace output file size If the application is already recording the HIP APIs, the HSA APIs are somewhat redundant and the ATP file size can be substantially reduced by not recording these APIs. HIP includes a text file that lists all of the HSA APIs and can assist in this filtering: :: - $ rocm-profiler -F hip/bin/hsa-api-filter-cxl.txt + $ rocm-profiler -F hip/bin/hsa-api-filter-cxl.txt This file can be copied and edited to provide more selective HSA event recording. @@ -246,8 +246,8 @@ Recent pre-built packages of HIP are always built with profiling support enabled $ mkdir build && cd build $ cmake .. -DCOMPILE_HIP_ATP_MARKER $ make install - - + + 2. Install ROCm-Profiler Installing HIP from the `rocm `_ pre-built packages, installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions here. 3. Recompile the target application @@ -277,7 +277,7 @@ The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API Heres a specific example showing the output of the square program running on HIP:: - $ HIP_TRACE_API=1 ./square.hip.out + $ HIP_TRACE_API=1 ./square.hip.out hip-api tid:1:HIP initialized short_tid#1 (maps to full_tid: 0x7f6183b097c0) <> @@ -300,7 +300,7 @@ Heres a specific example showing the output of the square program running on HIP PASSED! HIP_TRACE_API supports multiple levels of debug information: - + * 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. * 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernelGGL, hipLaunchModuleKernel * 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. diff --git a/Programming_Guides/hipporting-driver-api.rst b/Programming_Guides/hipporting-driver-api.rst index 9de8c622..42892fa7 100644 --- a/Programming_Guides/hipporting-driver-api.rst +++ b/Programming_Guides/hipporting-driver-api.rst @@ -11,7 +11,7 @@ CUDA provides a separate CUDA Driver and Runtime APIs. The two APIs have signifi * Both APIs support events, streams, memory management, memory copy, and error handling. * Both APIs deliver similar performance. * Driver APIs calls begin with the prefix **cu** while Runtime APIs begin with the prefix cuda. For example, the Driver API API contains 'cuEventCreate' while the Runtime API contains 'cudaEventCreate', with similar functionality. -* The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines ``CUDA_ERROR_INVALID_VALUE`` while the Runtime API defines ``cudaErrorInvalidValue`` +* The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. 
For example, Driver API defines ``CUDA_ERROR_INVALID_VALUE`` while the Runtime API defines ``cudaErrorInvalidValue`` The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs. @@ -142,78 +142,78 @@ The ``hipModule_t`` interface does not support ``cuModuleLoadDataEx`` function, For example (CUDA):: CUmodule module; - void *imagePtr = ...; // Somehow populate data pointer with code object - + void *imagePtr = ...; // Somehow populate data pointer with code object + const int numOptions = 1; CUJit_option options[numOptions]; - void * optionValues[numOptions]; - + void * optionValues[numOptions]; + options[0] = CU_JIT_MAX_REGISTERS; unsigned maxRegs = 15; - optionValues[0] = (void*)(&maxRegs); - - cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); - + optionValues[0] = (void*)(&maxRegs); + + cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); + CUfunction k; cuModuleGetFunction(&k, module, "myKernel"); - -HIP:: + +HIP:: hipModule_t module; - void *imagePtr = ...; // Somehow populate data pointer with code object - + void *imagePtr = ...; // Somehow populate data pointer with code object + const int numOptions = 1; hipJitOption options[numOptions]; - void * optionValues[numOptions]; - + void * optionValues[numOptions]; + options[0] = hipJitOptionMaxRegisters; unsigned maxRegs = 15; - optionValues[0] = (void*)(&maxRegs); - + optionValues[0] = (void*)(&maxRegs); + // hipModuleLoadData(module, imagePtr) will be called on HCC path, JIT options will not be used, and // cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); - + hipFunction_t k; hipModuleGetFunction(&k, module, "myKernel"); - -The below sample shows how to use hipModuleGetFunction. + +The below sample shows how to use hipModuleGetFunction. :: - + #include #include #include #include #include - + #define LEN 64 - #define SIZE LEN<<2 - + #define SIZE LEN<<2 + #ifdef __HIP_PLATFORM_HCC__ #define fileName "vcpy_isa.co" #endif - + #ifdef __HIP_PLATFORM_NVCC__ #define fileName "vcpy_isa.ptx" - #endif - + #endif + #define kernel_name "hello_world" - + int main(){ float *A, *B; hipDeviceptr_t Ad, Bd; A = new float[LEN]; B = new float[LEN]; - + for(uint32_t i=0;iargBuffer(2); memcpy(&argBuffer[0], &Ad, sizeof(void*)); memcpy(&argBuffer[1], &Bd, sizeof(void*)); - + size_t size = argBuffer.size()*sizeof(void*); - + void *config[] = { HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END }; - + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); - + hipMemcpyDtoH(B, Bd, SIZE); for(uint32_t i=0;i tex; - void myFunc () + void myFunc () { // ... 
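The ``hipModuleGetFunction`` sample above follows the standard HIP module API. As a minimal, self-contained sketch of the same load-and-launch pattern, assuming the ``vcpy_isa.co`` code object and ``hello_world`` kernel named in that sample (context initialization and error checking are omitted for brevity, so this is an illustration rather than the verbatim sample source): ::

    #include <hip/hip_runtime.h>
    #include <hip/hip_runtime_api.h>
    #include <cstring>
    #include <vector>

    #define LEN 64
    #define SIZE (LEN << 2)

    int main() {
        float *A = new float[LEN];
        float *B = new float[LEN];
        for (uint32_t i = 0; i < LEN; i++) {
            A[i] = i * 1.0f;
            B[i] = 0.0f;
        }

        // Allocate device memory and copy the inputs down.
        hipDeviceptr_t Ad, Bd;
        hipMalloc((void**)&Ad, SIZE);
        hipMalloc((void**)&Bd, SIZE);
        hipMemcpyHtoD(Ad, A, SIZE);
        hipMemcpyHtoD(Bd, B, SIZE);

        // Load the pre-built code object and look up the kernel by name.
        hipModule_t module;
        hipFunction_t function;
        hipModuleLoad(&module, "vcpy_isa.co");
        hipModuleGetFunction(&function, module, "hello_world");

        // Pack the kernel arguments into one flat buffer and pass it through
        // the HIP_LAUNCH_PARAM_* "extra" mechanism rather than kernelParams.
        std::vector<void*> argBuffer(2);
        memcpy(&argBuffer[0], &Ad, sizeof(void*));
        memcpy(&argBuffer[1], &Bd, sizeof(void*));
        size_t argSize = argBuffer.size() * sizeof(void*);

        void *config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
                          HIP_LAUNCH_PARAM_BUFFER_SIZE, &argSize,
                          HIP_LAUNCH_PARAM_END};

        // One block of LEN work-items; grid and block sizes are passed explicitly.
        hipModuleLaunchKernel(function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL,
                              (void**)&config);

        // Copy the result back; B should now mirror A.
        hipMemcpyDtoH(B, Bd, SIZE);

        hipFree(Ad);
        hipFree(Bd);
        delete[] A;
        delete[] B;
        return 0;
    }

Packing the arguments into a single buffer keeps the launch call identical on both back ends: on ROCm the buffer is consumed by the HIP runtime directly, while on the NVCC path the ``HIP_LAUNCH_PARAM_*`` markers are forwarded through the CUDA driver API's ``extra`` launch argument.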
diff --git a/README.md b/README.md index 979789b9..e1c99092 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ - ## ROCm Documentation - - Repository for ROCm documentation rendered by ReadtheDocs + ## ROCm Documentation + + Repository for ROCm documentation rendered by ReadtheDocs diff --git a/ROCm.rst b/ROCm.rst index 6e7d23ab..1dbce90e 100644 --- a/ROCm.rst +++ b/ROCm.rst @@ -33,7 +33,7 @@ This guide provides documentation on the ROCm programming model and programming * listings of supported mathematical functions * C++ features supported in host and device code * technical specifications of various devices -* introduction to the low-level driver API +* introduction to the low-level driver API | - `ROCm Languages `_ @@ -69,11 +69,11 @@ Performance and optimization for various device types such as GCN devices `GCN ISA Manuals `_ -* `GCN 1.1 `_ - For information on ISA Manual for Hawaii (Sea Islands Series Instruction Set Architecture) +* `GCN 1.1 `_ - For information on ISA Manual for Hawaii (Sea Islands Series Instruction Set Architecture) * `GCN 2.0 `_ - For information on ISA Manual for Fiji and Polaris (AMD Accelerated Parallel Processing technology) -* `Vega `_ - Provides “Vega” Instruction Set Architecture, Program Organization, Mode register and more details. +* `Vega `_ - Provides "Vega" Instruction Set Architecture, Program Organization, Mode register and more details. * `Inline GCN ISA Assembly Guide `_ - Covers various concepts of AMDGCN Assembly, DS Permute Instructions, Parameters to a Kernel, GPR Counting. @@ -81,7 +81,7 @@ Performance and optimization for various device types such as GCN devices `ROCm API References `_ -* `ROCr System Runtime API `_ +* `ROCr System Runtime API `_ * `HCC Language Runtime API `_ @@ -93,7 +93,7 @@ Performance and optimization for various device types such as GCN devices * `Math Library API `_ - Includes HIP MAth API with hcRNG, clBLAS, clSPARSE APIs -* `Deep Learning API `_ - Includes MIOpen API and MIOpenGEMM APIs +* `Deep Learning API `_ - Includes MIOpen API and MIOpenGEMM APIs @@ -104,7 +104,7 @@ Performance and optimization for various device types such as GCN devices * `GCN Assembler and Disassembler `_ -* `GCN Assembler Tools `_ - AMDGPU ISA Assembler +* `GCN Assembler Tools `_ - AMDGPU ISA Assembler * `ROCm-GDB `_ - ROCm-GDB tool includes installtion, configuration, and working of Debugger and APIs @@ -112,20 +112,20 @@ Performance and optimization for various device types such as GCN devices * `ROCm-Tracer `_ - ROCm Tracer - provides a generic independent from specific runtime profiler to trace API and asynchronous activity. Includes details on library source tree, steps to build and run the test -* `CodeXL `_ +* `CodeXL `_ * `GPUperfAPI `_ - GPU Performance API, cloning, system requiments, and source code directory layout -`AOMP `_ +`AOMP `_ Provides details on AOMP, a scripted build of LLVM and supporting software. Supports OpenMP target offload on AMD GPUs. Since AOMP is a clang/llvm compiler, it also supports GPU offloading with HIP, CUDA, and OpenCL. -`ROCmValidationSuite `_ +`ROCmValidationSuite `_ -Provides details on ROCm Validation Suite (RVS), a system administrator’s and cluster manager’s tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. 
+Provides details on ROCm Validation Suite (RVS), a system administrator's and cluster manager's tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. | @@ -135,7 +135,7 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | This section provides details on rocFFT,it is a AMD's software library compiled with the CUDA compiler using HIP tools for running on Nvidia GPU devices. | `rocBLAS `_ -| This section provides details on rocBLAS, it is a library for BLAS on ROCm.rocBLAS is implemented in the HIP programming language and optimized for AMD’s latest discrete GPUs. +| This section provides details on rocBLAS, it is a library for BLAS on ROCm.rocBLAS is implemented in the HIP programming language and optimized for AMD's latest discrete GPUs. | `hipBLAS `_ | This section provides details on hipBLAS, it is a BLAS marshalling library, with multiple supported backends. hipBLAS exports an interface that does not require the client to change. Currently,it supports :ref:`rocblas` and cuBLAS as backends. @@ -153,7 +153,7 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | This section provides details on clBLAS. It makes easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. | `clSPARSE `_ -| This section provides details on clSPARSE, it is an OpenCL library which implements Sparse linear algebra routines. +| This section provides details on clSPARSE, it is an OpenCL library which implements Sparse linear algebra routines. | `clRNG `_ | This section provides details on clRNG,This is a library for uniform random number generation in OpenCL. @@ -171,10 +171,10 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | This section provides details on rocSPARSE.It is a library that contains basic linear algebra subroutines for sparse matrices and vectors written in HiP for GPU devices. It is designed to be used from C and C++ code. | `rocThrust `_ -| This section provides details on rocThrust. It is a parallel algorithmn library. +| This section provides details on rocThrust. It is a parallel algorithmn library. -| `hipCUB `_ This section provides details on hipCUB. -| It is a thin wrapper library on top of rocPRIM or CUB. It enables developers to port the project using CUB library to the HIP layer and to +| `hipCUB `_ This section provides details on hipCUB. +| It is a thin wrapper library on top of rocPRIM or CUB. It enables developers to port the project using CUB library to the HIP layer and to | run them on AMD hardware. | `ROCm SMI Library `_ This section provides details on ROCm SMI library. The ROCm System Management Interface Library, or ROCm SMI library is part of the Radeon Open Compute ROCm software stack. It is a C library for linux that provides a user space interface for applications to monitor and control GPU aplications. @@ -182,7 +182,7 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | `RCCL `_ This section provides details on ROCm Communications Collectives Library. It is a stand alone library of standard collective communication routines for GPUS, implememting all-reduce, all gather, reduce, broadcast, and reduce scatter. | `AMD MivisionX `_ -This section provides information on AMD’s graph optimization engine. 
+This section provides information on AMD's graph optimization engine. `ROCm Compiler SDK `_ @@ -192,7 +192,7 @@ This section provides information on AMD’s graph optimization engine. | `ROCm Code Object Format `_ | This section describes about application binary interface (ABI) provided by the AMD, implementation of the HSA runtime. It also provides details on Kernel, AMD Queue and Signals. - + | `ROCm Device Library `_ | Documentation on instruction related to ROCm Device Library overview,Building and Testing related information with respect to Device Library is provided. @@ -225,7 +225,7 @@ This section provides information on AMD’s graph optimization engine. | ROCmRDMA is the solution designed to allow third-party kernel drivers to utilize DMA access to the GPU memory. Complete indoemation related to ROCmRDMA is Documented here. | `UCX `_ -| This section gives information related to UCX, How to install, Running UCX and much more +| This section gives information related to UCX, How to install, Running UCX and much more | `MPI `_ | This section gives information related to MPI. diff --git a/ROCm_API_References/BLAS1.rst b/ROCm_API_References/BLAS1.rst index 9e97ec07..e554efa9 100644 --- a/ROCm_API_References/BLAS1.rst +++ b/ROCm_API_References/BLAS1.rst @@ -7,7 +7,7 @@ BLAS1 functions SWAP - Swap elements from 2 vectors ------------------------------------ .. doxygenfunction:: clblasCswap() - + .. doxygenfunction:: clblasDswap() .. doxygenfunction:: clblasSswap() @@ -30,7 +30,7 @@ SSCAL - Scales a complex vector by a real constant .. doxygenfunction:: clblasCsscal() .. doxygenfunction:: clblasZdscal() - + COPY - Copies elements from vector X to vector Y -------------------------------------------------- @@ -51,7 +51,7 @@ AXPY - Scale X and add to Y .. doxygenfunction:: clblasSaxpy() .. doxygenfunction:: clblasZaxpy() - + DOT - Dot product of two vectors @@ -73,8 +73,8 @@ ROTG - Constructs givens plane rotation .. doxygenfunction:: clblasCrotg() .. doxygenfunction:: clblasDrotg() - -.. doxygenfunction:: clblasSrotg() + +.. doxygenfunction:: clblasSrotg() .. doxygenfunction:: clblasZrotg() @@ -105,14 +105,14 @@ ROTM - Apply modified givens rotation for points in the plane NRM2 - Euclidean norm of a vector ------------------------------------- +------------------------------------ .. doxygenfunction:: clblasDnrm2() .. doxygenfunction:: clblasDznrm2() .. doxygenfunction:: clblasScnrm2() -.. doxygenfunction:: clblasSnrm2() +.. doxygenfunction:: clblasSnrm2() iAMAX - Index of max absolute value ------------------------------------ @@ -126,7 +126,7 @@ iAMAX - Index of max absolute value ASUM - Sum of absolute values ------------------------------------- +------------------------------------ .. doxygenfunction:: clblasDasum() .. doxygenfunction:: clblasDzasum() diff --git a/ROCm_API_References/BLAS2.rst b/ROCm_API_References/BLAS2.rst index 41662805..b4c699ac 100644 --- a/ROCm_API_References/BLAS2.rst +++ b/ROCm_API_References/BLAS2.rst @@ -21,14 +21,14 @@ SYMV - Symmetric matrix-Vector multiplication HEMV - Hermitian matrix-vector multiplication ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasChemv() .. doxygenfunction:: clblasZhemv() TRMV - Triangular matrix vector multiply ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtrmv() .. 
doxygenfunction:: clblasDtrmv() @@ -39,7 +39,7 @@ TRMV - Triangular matrix vector multiply TRSV - Triangular matrix vector Solve ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtrsv() .. doxygenfunction:: clblasDtrsv() @@ -49,7 +49,7 @@ TRSV - Triangular matrix vector Solve .. doxygenfunction:: clblasZtrsv() GER - General matrix rank 1 operation ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasDger() .. doxygenfunction:: clblasSger() @@ -110,7 +110,7 @@ TPMV - Triangular packed matrix-vector multiply TPSV - Triangular packed matrix vector solve ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtpsv() .. doxygenfunction:: clblasStpsv() @@ -143,9 +143,9 @@ SPR - Symmetric packed matrix rank 1 update .. doxygenfunction:: clblasSspr() - + HPR - Hermitian packed matrix rank 1 update ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasChpr() .. doxygenfunction:: clblasZhpr() @@ -153,7 +153,7 @@ HPR - Hermitian packed matrix rank 1 update SPR2 - Symmetric packed matrix rank 2 update ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasDspr2() @@ -167,7 +167,7 @@ HPR2 - Hermitian packed matrix rank 2 update .. doxygenfunction:: clblasZhpr2() - + GBMV - General banded matrix-vector multiplication --------------------------------------------------- .. doxygenfunction:: clblasCgbmv() @@ -210,7 +210,7 @@ HBMV - Hermitian banded matrix-vector multiplication TBSV - Solving triangular banded matrix ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtbsv() diff --git a/ROCm_API_References/BLAS3.rst b/ROCm_API_References/BLAS3.rst index a78df278..7f3d7708 100644 --- a/ROCm_API_References/BLAS3.rst +++ b/ROCm_API_References/BLAS3.rst @@ -44,7 +44,7 @@ TRSM - Solving triangular systems of equations SYRK - Symmetric rank-k update of a matrix --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasCsyrk() @@ -58,7 +58,7 @@ SYRK - Symmetric rank-k update of a matrix SYR2K - Symmetric rank-2k update to a matrix --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasSsyr2k() .. doxygenfunction:: clblasZsyr2k() @@ -66,7 +66,7 @@ SYR2K - Symmetric rank-2k update to a matrix SYMM - Symmetric matrix-matrix multiply --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasCsymm() .. doxygenfunction:: clblasDsymm() @@ -80,7 +80,7 @@ SYMM - Symmetric matrix-matrix multiply HEMM - Hermitian matrix-matrix multiplication --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasChemm() @@ -89,7 +89,7 @@ HEMM - Hermitian matrix-matrix multiplication HERK - Hermitian rank-k update to a matrix --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasCherk() @@ -100,7 +100,7 @@ HERK - Hermitian rank-k update to a matrix HER2K - Hermitian rank-2k update to a matrix --------------------------------------------- +-------------------------------------------- .. 
doxygenfunction:: clblasCher2k() diff --git a/ROCm_API_References/HCC-API.rst b/ROCm_API_References/HCC-API.rst index 13abbcab..e9024358 100644 --- a/ROCm_API_References/HCC-API.rst +++ b/ROCm_API_References/HCC-API.rst @@ -46,7 +46,7 @@ For example: :: - `` hcchcc-config –cxxflags –ldflagsfoo.cpp -o foo `` + `` hcchcc-config -cxxflags -ldflagsfoo.cpp -o foo `` HCC built-in macros ******************** @@ -143,4 +143,4 @@ HC supports capturing memory pointer by a GPU kernel. ``` // allocate GPU memory through the HSA API int* gpu_pointer; hsa_memory_allocate(..., &gpu_pointer); ... parallel_for_each(ext, [=](index i) [[hc]] { gpu_pointer[i[0]]++; } -``` For HSA APUs that supports system wide shared virtual memory, a GPU kernel can directly access system memory allocated by the host: ``` int* cpu_memory = (int*) malloc(...); ... parallel_for_each(ext, [=](index i) [[hc]] { cpu_memory[i[0]]++; }); ``` \ No newline at end of file +``` For HSA APUs that supports system wide shared virtual memory, a GPU kernel can directly access system memory allocated by the host: ``` int* cpu_memory = (int*) malloc(...); ... parallel_for_each(ext, [=](index i) [[hc]] { cpu_memory[i[0]]++; }); ``` diff --git a/ROCm_API_References/HIP-MATH.rst b/ROCm_API_References/HIP-MATH.rst index bc20d3c6..1fc84001 100644 --- a/ROCm_API_References/HIP-MATH.rst +++ b/ROCm_API_References/HIP-MATH.rst @@ -1,6 +1,6 @@ .. _HIP-MATH: -HIP MATH APIs Documentation +HIP MATH APIs Documentation ############################ HIP supports most of the device functions supported by CUDA. Way to find the unsupported one is to search for the function and check its description @@ -9,14 +9,14 @@ HIP supports most of the device functions supported by CUDA. Way to find the uns For Developers If you add or fixed a device function, make sure to add a signature of the function and definition later. 
-For example, if you want to add `__device__ float __dotf(float4, float4)`, which does a dot product on 4 float vector components -The way to add to the header is, +For example, if you want to add `__device__ float __dotf(float4, float4)`, which does a dot product on 4 float vector components +The way to add to the header is, -:: +:: -__device__ static float __dotf(float4, float4); +__device__ static float __dotf(float4, float4); /*Way down in the file....*/ -__device__ static inline float __dotf(float4 x, float4 y) { +__device__ static inline float __dotf(float4 x, float4 y) { /*implementation*/ } @@ -70,7 +70,7 @@ atan2f ********* :: - + __device__ float atan2f(float y, float x); **Description:** Supported @@ -90,7 +90,7 @@ __device__ float atanf(float x); atanhf ********* -:: +:: __device__ float atanhf(float x); @@ -110,7 +110,7 @@ __device__ float cbrtf(float x); ceilf ********* -:: +:: __device__ float ceilf(float x); @@ -121,7 +121,7 @@ __device__ float ceilf(float x); copysignf ********* -:: +:: __device__ float copysignf(float x, float y); @@ -132,7 +132,7 @@ copysignf cosf ********* -:: +:: __device__ float cosf(float x); @@ -142,7 +142,7 @@ __device__ float cosf(float x); coshf ********* -:: +:: __device__ float coshf(float x); @@ -152,7 +152,7 @@ __device__ float coshf(float x); cospif ********* -:: +:: __device__ float cospif(float x); @@ -162,7 +162,7 @@ __device__ float cospif(float x); cyl_bessel_i0f ********* -:: +:: //__device__ float cyl_bessel_i0f(float x); @@ -172,7 +172,7 @@ cyl_bessel_i0f cyl_bessel_i1f ********* -:: +:: //__device__ float cyl_bessel_i1f(float x); @@ -181,8 +181,8 @@ cyl_bessel_i1f erfcf ********* - :: - + :: + __device__ float erfcf(float x); @@ -191,7 +191,7 @@ erfcf erfcinvf ********* -:: +:: __device__float erfcinvf(float y); @@ -200,7 +200,7 @@ __device__float erfcinvf(float y); erfcxf ********* -:: +:: __device__ float erfcxf(float x); @@ -209,7 +209,7 @@ erfcxf erff ********* -:: +:: __device__ float erff(float x); @@ -219,7 +219,7 @@ __device__ float erff(float x); erfinvf ********* -:: +:: __device__ float erfinvf(float y); @@ -229,7 +229,7 @@ __device__ float erfinvf(float y); exp10f ********* -:: +:: __device__ float exp10f(float x); @@ -239,7 +239,7 @@ __device__ float exp10f(float x); exp2f ********* -:: +:: _device__ float exp2f(float x); @@ -250,7 +250,7 @@ _device__ float exp2f(float x); expf ********* -:: +:: __device__ float expf(float x); @@ -261,7 +261,7 @@ __device__ float expf(float x); expm1f ********* -:: +:: __device__ float expm1f(float x); @@ -272,7 +272,7 @@ __device__ float expm1f(float x); fabsf ********* :: - + __device__ float fabsf(float x); @@ -281,7 +281,7 @@ fabsf fdimf ********* -:: +:: __device__ float fdimf(float x, float y); @@ -291,7 +291,7 @@ __device__ float fdimf(float x, float y); fdivide ********* -:: +:: __device__ float fdividef(float x, float y); @@ -301,7 +301,7 @@ __device__ float fdividef(float x, float y); floorf ********* -:: +:: __device__ float floorf(float x); @@ -311,7 +311,7 @@ __device__ float floorf(float x); fmaf ********* -:: +:: __device__ float fmaf(float x, float y, float z); @@ -321,7 +321,7 @@ __device__ float fmaf(float x, float y, float z); fmaxf ********* -:: +:: __device__ float fmaxf(float x, float y); @@ -331,7 +331,7 @@ __device__ float fmaxf(float x, float y); fminf ********* -:: +:: __device__ float fminf(float x, float y); @@ -341,7 +341,7 @@ __device__ float fminf(float x, float y); fmodf ********* -:: +:: __device__ float fmodf(float x, float y); @@ -351,8 +351,8 @@ 
__device__ float fmodf(float x, float y); frexpf ********* -:: - +:: + //__device__ float frexpf(float x, int* nptr); @@ -361,7 +361,7 @@ frexpf hypotf ********* -:: +:: __device__ float hypotf(float x, float y); @@ -371,7 +371,7 @@ __device__ float hypotf(float x, float y); ilogbf ********* -:: +:: __device__ float ilogbf(float x); @@ -381,7 +381,7 @@ __device__ float ilogbf(float x); isfinite ********* -:: +:: __device__ int isfinite(float a); @@ -391,7 +391,7 @@ __device__ int isfinite(float a); isinf ********* -:: +:: __device__ unsigned isinf(float a); @@ -401,7 +401,7 @@ isinf isnan ********* -:: +:: __device__ unsigned isnan(float a); @@ -411,7 +411,7 @@ isnan j0f ********* -:: +:: __device__ float j0f(float x); @@ -421,7 +421,7 @@ __device__ float j0f(float x); j1f ********* -:: +:: __device__ float j1f(float x); @@ -431,7 +431,7 @@ j1f jnf ********* -:: +:: __device__ float jnf(int n, float x); @@ -440,7 +440,7 @@ __device__ float jnf(int n, float x); ldexpf ********* -:: +:: __device__ float ldexpf(float x, int exp); @@ -450,7 +450,7 @@ __device__ float ldexpf(float x, int exp); lgammaf ********* -:: +:: //__device__ float lgammaf(float x); @@ -460,7 +460,7 @@ lgammaf llrintf ********* -:: +:: __device__ long long int llrintf(float x); @@ -470,7 +470,7 @@ __device__ long long int llrintf(float x); llroundf ********* -:: +:: __device__ long long int llroundf(float x); @@ -480,7 +480,7 @@ __device__ long long int llroundf(float x); log10f ********* -:: +:: __device__ float log10f(float x); @@ -490,7 +490,7 @@ __device__ float log10f(float x); log1pf ********* -:: +:: __device__ float log1pf(float x); @@ -500,7 +500,7 @@ __device__ float log1pf(float x); logbf ********* -:: +:: __device__ float logbf(float x); @@ -510,7 +510,7 @@ __device__ float logbf(float x); lrintf ********* -:: +:: __device__ long int lrintf(float x); @@ -520,7 +520,7 @@ __device__ long int lrintf(float x); lroundf ********* -:: +:: __device__ long int lroundf(float x); @@ -530,7 +530,7 @@ __device__ long int lroundf(float x); modff ********* -:: +:: //__device__ float modff(float x, float *iptr); @@ -540,7 +540,7 @@ modff nanf ********* -:: +:: __device__ float nanf(const char* tagp); @@ -550,7 +550,7 @@ nanf nearbyintf ********* -:: +:: __device__ float nearbyintf(float x); @@ -560,7 +560,7 @@ __device__ float nearbyintf(float x); nextafterf ********* -:: +:: //__device__ float nextafterf(float x, float y); @@ -570,7 +570,7 @@ nextafterf norm3df ********* -:: +:: __device__ float norm3df(float a, float b, float c); @@ -580,7 +580,7 @@ norm3df norm4df ********* -:: +:: __device__ float norm4df(float a, float b, float c, float d); @@ -590,7 +590,7 @@ __device__ float norm4df(float a, float b, float c, float d); normcdff ********* -:: +:: __device__ float normcdff(float y); @@ -600,7 +600,7 @@ __device__ float normcdff(float y); normcdfinvf ********* -:: +:: __device__ float normcdfinvf(float y); @@ -610,7 +610,7 @@ normcdfinvf normf ********* -:: +:: __device__ float normf(int dim, const float *a); @@ -620,7 +620,7 @@ __device__ float normf(int dim, const float *a); powf ********* -:: +:: __device__ float powf(float x, float y); @@ -630,8 +630,8 @@ powf rcbrtf ********* -:: - +:: + __device__ float rcbrtf(float x); @@ -640,7 +640,7 @@ rcbrtf remainderf ********* -:: +:: __device__ float remainderf(float x, float y); @@ -649,8 +649,8 @@ remainderf remquof ********* -:: - +:: + __device__ float remquof(float x, float y, int *quo); @@ -659,7 +659,7 @@ remquof rhypotf ********* -:: +:: __device__ float 
rhypotf(float x, float y); @@ -669,7 +669,7 @@ __device__ float rhypotf(float x, float y); rintf ********* -:: +:: __device__ float rintf(float x); @@ -678,7 +678,7 @@ rintf rnorm3df ********* -:: +:: __device__ float rnorm3df(float a, float b, float c); @@ -688,7 +688,7 @@ rnorm3df rnorm4df ********* -:: +:: __device__ float rnorm4df(float a, float b, float c, float d); @@ -698,7 +698,7 @@ rnorm4df rnormf ********* -:: +:: __device__ float rnormf(int dim, const float* a); @@ -708,7 +708,7 @@ __device__ float rnormf(int dim, const float* a); roundf ********* -:: +:: __device__ float roundf(float x); @@ -718,7 +718,7 @@ roundf rsqrtf ********* -:: +:: __device__ float rsqrtf(float x); @@ -728,7 +728,7 @@ rsqrtf scalblnf ********* -:: +:: __device__ float scalblnf(float x, long int n); @@ -738,7 +738,7 @@ scalblnf scalbnf ********* -:: +:: __device__ float scalbnf(float x, int n); @@ -748,7 +748,7 @@ scalbnf signbit ********* -:: +:: __device__ int signbit(float a); @@ -757,7 +757,7 @@ signbit sincosf ********* -:: +:: __device__ void sincosf(float x, float *sptr, float *cptr); @@ -767,7 +767,7 @@ sincosf sincospif ********* -:: +:: __device__ void sincospif(float x, float *sptr, float *cptr); @@ -777,7 +777,7 @@ __device__ void sincospif(float x, float *sptr, float *cptr); sinf ********* -:: +:: __device__ float sinf(float x); @@ -787,7 +787,7 @@ __device__ float sinf(float x); sinhf ********* -:: +:: __device__ float sinhf(float x); @@ -797,7 +797,7 @@ __device__ float sinhf(float x); sinpif ********* -:: +:: __device__ float sinpif(float x); @@ -807,8 +807,8 @@ __device__ float sinpif(float x); sqrtf ********* -:: - +:: + __device__ float sqrtf(float x); **Description:** Supported @@ -817,7 +817,7 @@ __device__ float sqrtf(float x); tanf ********* -:: +:: __device__ float tanf(float x); @@ -826,8 +826,8 @@ tanf tanhf -********* - :: +********* + :: __device__ float tanhf(float x); @@ -837,7 +837,7 @@ tanhf tgammaf ********* -:: +:: __device__ float tgammaf(float x); @@ -847,8 +847,8 @@ tgammaf truncf ********* -:: - +:: + __device__ float truncf(float x); @@ -857,7 +857,7 @@ truncf y0f ********* -:: +:: __device__ float y0f(float x); @@ -867,7 +867,7 @@ __device__ float y0f(float x); y1f ********* -:: +:: __device__ float y1f(float x); @@ -876,7 +876,7 @@ __device__ float y1f(float x); ynf ********* -:: +:: __device__ float ynf(int n, float x); @@ -886,7 +886,7 @@ ynf acos ********* -:: +:: __device__ double acos(double x); @@ -896,7 +896,7 @@ acos acosh ********* -:: +:: __device__ double acosh(double x); @@ -906,7 +906,7 @@ __device__ double acosh(double x); asin ********* -:: +:: __device__ double asin(double x); @@ -916,7 +916,7 @@ asin asinh ********* -:: +:: __device__ double asinh(double x); @@ -926,8 +926,8 @@ asinh atan ********* -:: - +:: + __device__ double atan(double x); @@ -936,8 +936,8 @@ atan atan2 ********* -:: - +:: + __device__ double atan2(double y, double x); @@ -946,7 +946,7 @@ atan2 atanh ********* -:: +:: __device__ double atanh(double x); @@ -956,8 +956,8 @@ atanh cbrt ********* -:: - +:: + __device__ double cbrt(double x); @@ -967,7 +967,7 @@ cbrt ceil ********* :: - + __device__ double ceil(double x); @@ -976,7 +976,7 @@ ceil copysign ********* -:: +:: __device__ double copysign(double x, double y); @@ -985,7 +985,7 @@ copysign cos ********* -:: +:: __device__ double cos(double x); @@ -995,7 +995,7 @@ cos cosh ********* -:: +:: __device__ double cosh(double x); @@ -1005,7 +1005,7 @@ cosh cospi ********* -:: +:: __device__ double cospi(double x); @@ -1015,7 
+1015,7 @@ cospi cyl_bessel_i0 ****************** -:: +:: //__device__ double cyl_bessel_i0(double x); @@ -1025,7 +1025,7 @@ cyl_bessel_i0 cyl_bessel_i1 ****************** -:: +:: //__device__ double cyl_bessel_i1(double x); @@ -1035,8 +1035,8 @@ cyl_bessel_i1 erf ********* -:: - +:: + __device__ double erf(double x); @@ -1046,7 +1046,7 @@ erf erfc ********* :: - + __device__ double erfc(double x); @@ -1055,7 +1055,7 @@ erfc erfcinv ********* -:: +:: __device__ double erfcinv(double y); @@ -1065,7 +1065,7 @@ erfcinv erfcx ********* -:: +:: __device__ double erfcx(double x); @@ -1075,7 +1075,7 @@ erfcx erfinv ********* -:: +:: __device__ double erfinv(double x); @@ -1085,7 +1085,7 @@ erfinv exp ********* -:: +:: __device__ double exp(double x); @@ -1095,7 +1095,7 @@ exp exp10 ********* -:: +:: __device__ double exp10(double x); @@ -1105,7 +1105,7 @@ exp10 exp2 ********* -:: +:: __device__ double exp2(double x); @@ -1115,7 +1115,7 @@ exp2 expm1 ********* -:: +:: __device__ double expm1(double x); @@ -1125,7 +1125,7 @@ expm1 fabs ********* -:: +:: __device__ double fabs(double x); @@ -1135,7 +1135,7 @@ fabs fdim ********* -:: +:: __device__ double fdim(double x, double y); @@ -1145,7 +1145,7 @@ fdim floor ********* -:: +:: __device__ double floor(double x); @@ -1155,7 +1155,7 @@ floor fma ********* -:: +:: __device__ double fma(double x, double y, double z); @@ -1165,7 +1165,7 @@ fma fmax ********* -:: +:: __device__ double fmax(double x, double y); @@ -1175,7 +1175,7 @@ fmax fmin ********* -:: +:: __device__ double fmin(double x, double y); @@ -1186,16 +1186,16 @@ fmin fmod ********* :: - + __device__ double fmod(double x, double y); - + **Description:** Supported frexp ********* -:: +:: //__device__ double frexp(double x, int *nptr); @@ -1205,7 +1205,7 @@ frexp hypot ********* -:: +:: __device__ double hypot(double x, double y); @@ -1215,7 +1215,7 @@ hypot ilogb ********* -:: +:: __device__ double ilogb(double x); @@ -1226,7 +1226,7 @@ ilogb isfinite ********* :: - + __device__ int isfinite(double x); @@ -1235,7 +1235,7 @@ isfinite isinf ********* -:: +:: __device__ unsigned isinf(double x); @@ -1245,7 +1245,7 @@ isinf isnan ********* -:: +:: __device__ unsigned isnan(double x); @@ -1256,7 +1256,7 @@ isnan j0 ********* :: - + __device__ double j0(double x); @@ -1265,7 +1265,7 @@ j0 j1 ********* -:: +:: __device__ double j1(double x); @@ -1275,7 +1275,7 @@ j1 jn ********* -:: +:: __device__ double jn(int n, double x); @@ -1285,7 +1285,7 @@ jn ldexp ********* -:: +:: __device__ double ldexp(double x, int exp); @@ -1295,7 +1295,7 @@ ldexp lgamma ********* -:: +:: __device__ double lgamma(double x); @@ -1305,7 +1305,7 @@ lgamma llrint ********* -:: +:: __device__ long long llrint(double x); @@ -1315,7 +1315,7 @@ llrint llround ********* -:: +:: __device__ long long llround(double x); @@ -1325,7 +1325,7 @@ llround log ********* -:: +:: __device__ double log(double x); @@ -1336,16 +1336,16 @@ log log10 ********* :: - + __device__ double log10(double x); - + **Description:** Supported log1p ********* -:: +:: __device__ double log1p(double x); @@ -1355,7 +1355,7 @@ log1p log2 ********* -:: +:: __device__ double log2(double x); @@ -1365,7 +1365,7 @@ log2 logb ********* -:: +:: __device__ double logb(double x); @@ -1375,7 +1375,7 @@ logb lrint ********* -:: +:: __device__ long int lrint(double x); @@ -1385,7 +1385,7 @@ lrint lround ********* -:: +:: __device__ long int lround(double x); @@ -1395,7 +1395,7 @@ lround modf ********* -:: +:: //__device__ double modf(double x, double *iptr); @@ -1405,7 
+1405,7 @@ modf nan ********* -:: +:: __device__ double nan(const char* tagp); @@ -1415,7 +1415,7 @@ nan nearbyint ********* -:: +:: __device__ double nearbyint(double x); @@ -1425,7 +1425,7 @@ nearbyint nextafter ********* -:: +:: __device__ double nextafter(double x, double y); @@ -1435,7 +1435,7 @@ nextafter norm ********* -:: +:: __device__ double norm(int dim, const double* t); @@ -1445,7 +1445,7 @@ norm norm3d ********* -:: +:: __device__ double norm3d(double a, double b, double c); @@ -1455,7 +1455,7 @@ norm3d norm4d ********* -:: +:: __device__ double norm4d(double a, double b, double c, double d); @@ -1465,7 +1465,7 @@ norm4d normcdf ********* -:: +:: __device__ double normcdf(double y); @@ -1475,7 +1475,7 @@ normcdf normcdfinv ********* -:: +:: __device__ double normcdfinv(double y); @@ -1485,7 +1485,7 @@ normcdfinv pow ********* -:: +:: __device__ double pow(double x, double y); @@ -1495,7 +1495,7 @@ pow rcbrt ********* -:: +:: __device__ double rcbrt(double x); @@ -1505,7 +1505,7 @@ rcbrt remainder ********* -:: +:: __device__ double remainder(double x, double y); @@ -1515,7 +1515,7 @@ remainder remquo ********* -:: +:: //__device__ double remquo(double x, double y, int *quo); @@ -1525,7 +1525,7 @@ remquo rhypot ********* -:: +:: __device__ double rhypot(double x, double y); @@ -1535,7 +1535,7 @@ rhypot rint ********* -:: +:: __device__ double rint(double x); @@ -1545,7 +1545,7 @@ rint rnorm ********* -:: +:: __device__ double rnorm(int dim, const double* t); @@ -1555,7 +1555,7 @@ rnorm rnorm3d ********* -:: +:: __device__ double rnorm3d(double a, double b, double c); @@ -1566,7 +1566,7 @@ rnorm3d rnorm4d ********* :: - + __device__ double rnorm4d(double a, double b, double c, double d); @@ -1575,7 +1575,7 @@ rnorm4d round ********* -:: +:: __device__ double round(double x); @@ -1585,7 +1585,7 @@ round rsqrt ********* -:: +:: __device__ double rsqrt(double x); @@ -1595,7 +1595,7 @@ rsqrt scalbln ********* -:: +:: __device__ double scalbln(double x, long int n); @@ -1605,7 +1605,7 @@ scalbln scalbn ********* -:: +:: __device__ double scalbn(double x, int n); @@ -1615,7 +1615,7 @@ scalbn signbit ********* -:: +:: __device__ int signbit(double a); @@ -1625,7 +1625,7 @@ signbit sin ********* -:: +:: __device__ double sin(double a); @@ -1635,7 +1635,7 @@ sin sincos ********* -:: +:: __device__ void sincos(double x, double *sptr, double *cptr); @@ -1645,7 +1645,7 @@ sincos sincospi ********* -:: +:: __device__ void sincospi(double x, double *sptr, double *cptr); @@ -1655,7 +1655,7 @@ sincospi sinh ********* -:: +:: __device__ double sinh(double x); @@ -1665,7 +1665,7 @@ sinh sinpi ********* -:: +:: __device__ double sinpi(double x); @@ -1675,7 +1675,7 @@ sinpi sqrt ********* -:: +:: __device__ double sqrt(double x); @@ -1685,7 +1685,7 @@ sqrt tan ********* -:: +:: __device__ double tan(double x); @@ -1695,7 +1695,7 @@ tan tanh ********* -:: +:: __device__ double tanh(double x); @@ -1705,7 +1705,7 @@ tanh tgamma ********* -:: +:: __device__ double tgamma(double x); @@ -1715,7 +1715,7 @@ tgamma trunc ********* -:: +:: __device__ double trunc(double x); @@ -1725,7 +1725,7 @@ trunc y0 ********* -:: +:: __device__ double y0(double x); @@ -1735,7 +1735,7 @@ y0 y1 ********* -:: +:: __device__ double y1(double y); @@ -1745,7 +1745,7 @@ y1 yn ********* -:: +:: __device__ double yn(int n, double x); @@ -1755,7 +1755,7 @@ yn __cosf ********* -:: +:: __device__float __cosf(float x); @@ -1765,7 +1765,7 @@ __cosf __exp10f ********* -:: +:: __device__float __exp10f(float x); @@ -1775,7 +1775,7 @@ 
__exp10f __expf ********* -:: +:: __device__float __expf(float x); @@ -1785,7 +1785,7 @@ __expf __fadd_rd ********* -:: +:: __device__ staticfloat __fadd_rd(float x, float y); @@ -1795,7 +1795,7 @@ __fadd_rd __fadd_rn ********* -:: +:: __device__ staticfloat __fadd_rn(float x, float y); @@ -1805,7 +1805,7 @@ __fadd_rn __fadd_ru ********* -:: +:: __device__ staticfloat __fadd_ru(float x, float y); @@ -1815,7 +1815,7 @@ __fadd_ru __fadd_rz ********* -:: +:: __device__ staticfloat __fadd_rz(float x, float y); @@ -1825,7 +1825,7 @@ __fadd_rz __fdiv_rd ********* -:: +:: __device__ staticfloat __fdiv_rd(float x, float y); @@ -1835,7 +1835,7 @@ __fdiv_rd __fdiv_rn ********* -:: +:: __device__ staticfloat __fdiv_rn(float x, float y); @@ -1845,7 +1845,7 @@ __fdiv_rn __fdiv_ru ********* -:: +:: __device__ staticfloat __fdiv_ru(float x, float y); @@ -1855,7 +1855,7 @@ __fdiv_ru __fdiv_rz ********* -:: +:: __device__ staticfloat __fdiv_rz(float x, float y); @@ -1865,7 +1865,7 @@ __fdiv_rz __fdividef ********* -:: +:: __device__ staticfloat __fdividef(float x, float y); @@ -1875,7 +1875,7 @@ __fdividef __fmaf_rd ********* -:: +:: __device__float __fmaf_rd(float x, float y, float z); @@ -1885,7 +1885,7 @@ __fmaf_rd __fmaf_rn ********* -:: +:: __device__float __fmaf_rn(float x, float y, float z); @@ -1895,7 +1895,7 @@ __fmaf_rn __fmaf_ru ********* -:: +:: __device__float __fmaf_ru(float x, float y, float z); @@ -1905,7 +1905,7 @@ __fmaf_ru __fmaf_rz ********* -:: +:: __device__float __fmaf_rz(float x, float y, float z); @@ -1915,7 +1915,7 @@ __fmaf_rz __fmul_rd ********* -:: +:: __device__ staticfloat __fmul_rd(float x, float y); @@ -1925,7 +1925,7 @@ __fmul_rd __fmul_rn ********* -:: +:: __device__ staticfloat __fmul_rn(float x, float y); @@ -1935,7 +1935,7 @@ __fmul_rn __fmul_ru ********* -:: +:: __device__ staticfloat __fmul_ru(float x, float y); @@ -1945,7 +1945,7 @@ __fmul_ru __fmul_rz ********* -:: +:: __device__ staticfloat __fmul_rz(float x, float y); @@ -1955,7 +1955,7 @@ __fmul_rz __frcp_rd ********* -:: +:: __device__float __frcp_rd(float x); @@ -1965,7 +1965,7 @@ __frcp_rd __frcp_rn ********* -:: +:: __device__float __frcp_rn(float x); @@ -1975,7 +1975,7 @@ __frcp_rn __frcp_ru ********* -:: +:: __device__float __frcp_ru(float x); @@ -1985,7 +1985,7 @@ __frcp_ru __frcp_rz ********* -:: +:: __device__float __frcp_rz(float x); @@ -1995,7 +1995,7 @@ __frcp_rz __frsqrt_rn ****************** -:: +:: __device__float __frsqrt_rn(float x); @@ -2005,7 +2005,7 @@ __frsqrt_rn __fsqrt_rd ****************** -:: +:: __device__float __fsqrt_rd(float x); @@ -2014,7 +2014,7 @@ __fsqrt_rd __fsqrt_rn -:: +:: __device__float __fsqrt_rn(float x); @@ -2023,7 +2023,7 @@ __device__float __fsqrt_rn(float x); __fsqrt_ru ********* -:: +:: __device__float __fsqrt_ru(float x); @@ -2033,7 +2033,7 @@ __fsqrt_ru __fsqrt_rz ********* -:: +:: __device__float __fsqrt_rz(float x); @@ -2043,7 +2043,7 @@ __fsqrt_rz __fsub_rd ********* -:: +:: __device__ staticfloat __fsub_rd(float x, float y); @@ -2053,7 +2053,7 @@ __fsub_rd __fsub_rn ********* -:: +:: __device__ staticfloat __fsub_rn(float x, float y); @@ -2063,7 +2063,7 @@ __fsub_rn __fsub_ru ********* -:: +:: __device__ staticfloat __fsub_ru(float x, float y); @@ -2073,7 +2073,7 @@ __fsub_ru __log10f ********* -:: +:: __device__float __log10f(float x); @@ -2083,7 +2083,7 @@ __log10f __log2f ********* -:: +:: __device__float __log2f(float x); @@ -2093,7 +2093,7 @@ __log2f __logf ********* -:: +:: __device__float __logf(float x); @@ -2103,7 +2103,7 @@ __logf __powf ********* -:: 
+:: __device__float __powf(float base, float exponent); @@ -2113,8 +2113,8 @@ __powf __saturatef ********* -:: - +:: + __device__ staticfloat __saturatef(float x); @@ -2123,7 +2123,7 @@ __saturatef __sincosf ********* -:: +:: __device__void __sincosf(float x, float *s, float *c); @@ -2133,7 +2133,7 @@ __sincosf __sinf ********* -:: +:: __device__float __sinf(float x); @@ -2143,7 +2143,7 @@ __sinf __tanf ********* -:: +:: __device__float __tanf(float x); @@ -2153,7 +2153,7 @@ __tanf __dadd_rd ********* -:: +:: __device__ staticdouble __dadd_rd(double x, double y); @@ -2163,7 +2163,7 @@ __dadd_rd __dadd_rn ********* -:: +:: __device__ staticdouble __dadd_rn(double x, double y); @@ -2173,8 +2173,8 @@ __dadd_rn __dadd_ru ********* -:: - +:: + __device__ staticdouble __dadd_ru(double x, double y); @@ -2183,7 +2183,7 @@ __dadd_ru __dadd_rz ********* -:: +:: __device__ staticdouble __dadd_rz(double x, double y); @@ -2193,7 +2193,7 @@ __dadd_rz __ddiv_rd ********* -:: +:: __device__ staticdouble __ddiv_rd(double x, double y); @@ -2203,7 +2203,7 @@ __ddiv_rd __ddiv_rn ********* -:: +:: __device__ staticdouble __ddiv_rn(double x, double y); @@ -2213,7 +2213,7 @@ __ddiv_rn __ddiv_ru ********* -:: +:: __device__ staticdouble __ddiv_ru(double x, double y); @@ -2223,7 +2223,7 @@ __ddiv_ru __ddiv_rz ********* -:: +:: __device__ staticdouble __ddiv_rz(double x, double y); @@ -2233,7 +2233,7 @@ __ddiv_rz __dmul_rd ********* -:: +:: __device__ staticdouble __dmul_rd(double x, double y); @@ -2244,7 +2244,7 @@ __dmul_rd __dmul_rn ********* :: - + __device__ staticdouble __dmul_rn(double x, double y); @@ -2254,7 +2254,7 @@ __dmul_rn __dmul_ru ********* :: - + __device__ staticdouble __dmul_ru(double x, double y); @@ -2264,7 +2264,7 @@ __dmul_ru __dmul_rz ********* :: - + __device__ staticdouble __dmul_rz(double x, double y); @@ -2273,7 +2273,7 @@ __dmul_rz __drcp_rd ********* -:: +:: __device__double __drcp_rd(double x); @@ -2283,7 +2283,7 @@ __drcp_rd __drcp_rn ********* -:: +:: __device__double __drcp_rn(double x); @@ -2293,8 +2293,8 @@ __drcp_rn __drcp_ru ********* -:: - +:: + __device__double __drcp_ru(double x); @@ -2303,7 +2303,7 @@ __drcp_ru __drcp_rz ********* -:: +:: __device__double __drcp_rz(double x); @@ -2313,7 +2313,7 @@ __drcp_rz __dsqrt_rd ********* -:: +:: __device__double __dsqrt_rd(double x); @@ -2323,7 +2323,7 @@ __dsqrt_rd __dsqrt_rn ********* -:: +:: __device__double __dsqrt_rn(double x); @@ -2333,7 +2333,7 @@ __dsqrt_rn __dsqrt_ru ********* -:: +:: __device__double __dsqrt_ru(double x); @@ -2343,7 +2343,7 @@ __dsqrt_ru __dsqrt_rz ********* -:: +:: __device__double __dsqrt_rz(double x); @@ -2353,7 +2353,7 @@ __dsqrt_rz __dsub_rd ********* -:: +:: __device__ staticdouble __dsub_rd(double x, double y); @@ -2364,7 +2364,7 @@ __dsub_rd __dsub_rn ********* -:: +:: __device__ staticdouble __dsub_rn(double x, double y); @@ -2374,7 +2374,7 @@ __dsub_rn __dsub_ru ********* -:: +:: __device__ staticdouble __dsub_ru(double x, double y); @@ -2384,7 +2384,7 @@ __dsub_ru __dsub_rz ********* -:: +:: __device__ staticdouble __dsub_rz(double x, double y); @@ -2394,7 +2394,7 @@ __dsub_rz __fma_rd ********* -:: +:: __device__double __fma_rd(double x, double y, double z); @@ -2404,7 +2404,7 @@ __fma_rd __fma_rn ********* -:: +:: __device__double __fma_rn(double x, double y, double z); @@ -2414,7 +2414,7 @@ __fma_rn __fma_ru ********* -:: +:: __device__double __fma_ru(double x, double y, double z); @@ -2424,7 +2424,7 @@ __fma_ru __fma_rz ********* -:: +:: __device__double __fma_rz(double x, double y, double 
z); @@ -2434,7 +2434,7 @@ __fma_rz __brev ********* -:: +:: __device__ unsigned int __brev( unsigned int x); @@ -2444,7 +2444,7 @@ __brev __brevll ********* -:: +:: __device__ unsigned long long int __brevll( unsigned long long int x); @@ -2454,7 +2454,7 @@ __brevll __byte_perm ********* -:: +:: __device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s); @@ -2464,7 +2464,7 @@ __byte_perm __clz ********* -:: +:: __device__ unsigned int __clz(int x); @@ -2474,8 +2474,8 @@ __clz __clzll ********* -:: - +:: + __device__ unsigned int __clzll(long long int x); @@ -2484,7 +2484,7 @@ __clzll __ffs ********* -:: +:: __device__ unsigned int __ffs(int x); @@ -2494,7 +2494,7 @@ __ffs __ffsll ********* -:: +:: __device__ unsigned int __ffsll(long long int x); @@ -2504,7 +2504,7 @@ __ffsll __hadd ********* -:: +:: __device__ static unsigned int __hadd(int x, int y); @@ -2514,7 +2514,7 @@ __hadd __mul24 ********* -:: +:: __device__ static int __mul24(int x, int y); @@ -2524,7 +2524,7 @@ __mul24 __mul64hi ********* -:: +:: __device__ long long int __mul64hi(long long int x, long long int y); @@ -2534,7 +2534,7 @@ __mul64hi __mulhi ********* -:: +:: __device__ static int __mulhi(int x, int y); @@ -2544,7 +2544,7 @@ __mulhi __popc ********* -:: +:: __device__ unsigned int __popc(unsigned int x); @@ -2554,7 +2554,7 @@ __popc __popcll ********* -:: +:: __device__ unsigned int __popcll(unsigned long long int x); @@ -2564,7 +2564,7 @@ __popcll __rhadd ********* -:: +:: __device__ static int __rhadd(int x, int y); @@ -2574,7 +2574,7 @@ __rhadd __sad ********* -:: +:: __device__ static unsigned int __sad(int x, int y, int z); @@ -2584,7 +2584,7 @@ __sad __uhadd ********* -:: +:: __device__ static unsigned int __uhadd(unsigned int x, unsigned int y); @@ -2594,7 +2594,7 @@ __uhadd __umul24 ********* -:: +:: __device__ static int __umul24(unsigned int x, unsigned int y); @@ -2605,7 +2605,7 @@ __umul24 __umul64hi ********* -:: +:: __device__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y); @@ -2615,7 +2615,7 @@ __umul64hi __umulhi ********* -:: +:: __device__ static unsigned int __umulhi(unsigned int x, unsigned int y); @@ -2625,7 +2625,7 @@ __umulhi __urhadd ********* -:: +:: __device__ static unsigned int __urhadd(unsigned int x, unsigned int y); @@ -2635,7 +2635,7 @@ __urhadd __usad ********* -:: +:: __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z); @@ -2645,7 +2645,7 @@ __usad __double2float_rd ****************** -:: +:: __device__ float __double2float_rd(double x); @@ -2655,7 +2655,7 @@ __double2float_rd __double2float_rn ****************** -:: +:: __device__ float __double2float_rn(double x); @@ -2665,7 +2665,7 @@ __double2float_rn __double2float_ru ****************** -:: +:: __device__ float __double2float_ru(double x); @@ -2675,7 +2675,7 @@ __double2float_ru __double2float_rz ****************** -:: +:: __device__ float __double2float_rz(double x); @@ -2685,7 +2685,7 @@ __double2float_rz __double2hiint ****************** -:: +:: __device__ int __double2hiint(double x); @@ -2695,7 +2695,7 @@ __double2hiint __double2int_rd ****************** -:: +:: __device__ int __double2int_rd(double x); @@ -2705,7 +2705,7 @@ __double2int_rd __double2int_rn ****************** -:: +:: __device__ int __double2int_rn(double x); @@ -2715,7 +2715,7 @@ __double2int_rn __double2int_ru ****************** -:: +:: __device__ int __double2int_ru(double x); @@ -2725,7 +2725,7 @@ __double2int_ru __double2int_rz ****************** -:: +:: 
__device__ int __double2int_rz(double x); @@ -2735,7 +2735,7 @@ __double2int_rz __double2ll_rd ****************** -:: +:: __device__ long long int __double2ll_rd(double x); @@ -2745,7 +2745,7 @@ __double2ll_rd __double2ll_rn ****************** -:: +:: __device__ long long int __double2ll_rn(double x); @@ -2756,7 +2756,7 @@ __double2ll_rn __double2ll_ru ****************** -:: +:: __device__ long long int __double2ll_ru(double x); @@ -2766,8 +2766,8 @@ __double2ll_ru __double2ll_rz ****************** -:: - +:: + __device__ long long int __double2ll_rz(double x); @@ -2776,7 +2776,7 @@ __double2ll_rz __double2loint ****************** -:: +:: __device__ int __double2loint(double x); @@ -2786,8 +2786,8 @@ __double2loint __double2uint_rd ****************** -:: - +:: + __device__ unsigned int __double2uint_rd(double x); @@ -2796,7 +2796,7 @@ __double2uint_rd __double2uint_rn ****************** -:: +:: __device__ unsigned int __double2uint_rn(double x); @@ -2806,8 +2806,8 @@ __double2uint_rn __double2uint_ru ****************** -:: - +:: + __device__ unsigned int __double2uint_ru(double x); @@ -2816,7 +2816,7 @@ __double2uint_ru __double2uint_rz ****************** -:: +:: __device__ unsigned int __double2uint_rz(double x); @@ -2826,7 +2826,7 @@ __double2uint_rz __double2ull_rd ****************** -:: +:: __device__ unsigned long long int __double2ull_rd(double x); @@ -2836,7 +2836,7 @@ __double2ull_rd __double2ull_rn ****************** -:: +:: __device__ unsigned long long int __double2ull_rn(double x); @@ -2846,7 +2846,7 @@ __double2ull_rn __double2ull_ru ****************** -:: +:: __device__ unsigned long long int __double2ull_ru(double x); @@ -2856,7 +2856,7 @@ __double2ull_ru __double2ull_rz ****************** -:: +:: __device__ unsigned long long int __double2ull_rz(double x); @@ -2866,7 +2866,7 @@ __double2ull_rz __double_as_longlong *************************** -:: +:: __device__ long long int __double_as_longlong(double x); @@ -2876,7 +2876,7 @@ __double_as_longlong __float2half_rn ****************** -:: +:: __device__ unsigned short __float2half_rn(float x); @@ -2886,7 +2886,7 @@ __float2half_rn __half2float ****************** -:: +:: __device__ float __half2float(unsigned short); @@ -2896,7 +2896,7 @@ __half2float __float2half_rn ****************** -:: +:: __device__ __half __float2half_rn(float x); @@ -2906,7 +2906,7 @@ __float2half_rn __half2float ****************** -:: +:: __device__ float __half2float(__half); @@ -2916,7 +2916,7 @@ __half2float __float2int_rd ****************** -:: +:: __device__ int __float2int_rd(float x); @@ -2926,7 +2926,7 @@ __float2int_rd __float2int_rn ****************** -:: +:: __device__ int __float2int_rn(float x); @@ -2936,7 +2936,7 @@ __float2int_rn __float2int_ru ****************** -:: +:: __device__ int __float2int_ru(float x); @@ -2946,7 +2946,7 @@ __float2int_ru __float2int_rz ****************** -:: +:: __device__ int __float2int_rz(float x); @@ -2956,7 +2956,7 @@ __float2int_rz __float2ll_rd ****************** -:: +:: __device__ long long int __float2ll_rd(float x); @@ -2966,7 +2966,7 @@ __float2ll_rd __float2ll_rn ****************** -:: +:: __device__ long long int __float2ll_rn(float x); @@ -2976,7 +2976,7 @@ __float2ll_rn __float2ll_ru ****************** -:: +:: __device__ long long int __float2ll_ru(float x); @@ -2986,8 +2986,8 @@ __float2ll_ru __float2ll_rz ****************** -:: - +:: + __device__ long long int __float2ll_rz(float x); @@ -2996,7 +2996,7 @@ __float2ll_rz __float2uint_rd ****************** -:: +:: __device__ unsigned int 
__float2uint_rd(float x); @@ -3006,7 +3006,7 @@ __float2uint_rd __float2uint_rn ****************** -:: +:: __device__ unsigned int __float2uint_rn(float x); @@ -3016,7 +3016,7 @@ __float2uint_rn __float2uint_ru ****************** -:: +:: __device__ unsigned int __float2uint_ru(float x); @@ -3026,7 +3026,7 @@ __float2uint_ru __float2uint_rz ****************** -:: +:: __device__ unsigned int __float2uint_rz(float x); @@ -3036,7 +3036,7 @@ __float2uint_rz __float2ull_rd ****************** -:: +:: __device__ unsigned long long int __float2ull_rd(float x); @@ -3046,7 +3046,7 @@ __float2ull_rd __float2ull_rn ****************** -:: +:: __device__ unsigned long long int __float2ull_rn(float x); @@ -3056,7 +3056,7 @@ __float2ull_rn __float2ull_ru ****************** -:: +:: __device__ unsigned long long int __float2ull_ru(float x); @@ -3066,7 +3066,7 @@ __float2ull_ru __float2ull_rz ****************** -:: +:: __device__ unsigned long long int __float2ull_rz(float x); @@ -3076,7 +3076,7 @@ __float2ull_rz __float_as_int ****************** -:: +:: __device__ int __float_as_int(float x); @@ -3086,7 +3086,7 @@ __float_as_int __float_as_uint ****************** -:: +:: __device__ unsigned int __float_as_uint(float x); @@ -3096,7 +3096,7 @@ __float_as_uint __hiloint2double ****************** -:: +:: __device__ double __hiloint2double(int hi, int lo); @@ -3106,7 +3106,7 @@ __hiloint2double __int2double_rn ****************** -:: +:: __device__ double __int2double_rn(int x); @@ -3116,7 +3116,7 @@ __int2double_rn __int2float_rd ****************** -:: +:: __device__ float __int2float_rd(int x); @@ -3126,7 +3126,7 @@ __int2float_rd __int2float_rn ****************** -:: +:: __device__ float __int2float_rn(int x); @@ -3136,7 +3136,7 @@ __int2float_rn __int2float_ru ****************** -:: +:: __device__ float __int2float_ru(int x); @@ -3146,7 +3146,7 @@ __int2float_ru __int2float_rz ****************** -:: +:: __device__ float __int2float_rz(int x); @@ -3157,7 +3157,7 @@ __int2float_rz __int_as_float ****************** -:: +:: __device__ float __int_as_float(int x); @@ -3168,7 +3168,7 @@ __int_as_float __ll2double_rd ****************** -:: +:: __device__ double __ll2double_rd(long long int x); @@ -3178,7 +3178,7 @@ __ll2double_rd __ll2double_rn ****************** -:: +:: __device__ double __ll2double_rn(long long int x); @@ -3189,7 +3189,7 @@ __ll2double_rn __ll2double_ru ****************** -:: +:: __device__ double __ll2double_ru(long long int x); @@ -3200,7 +3200,7 @@ __ll2double_ru __ll2double_rz ****************** -:: +:: __device__ double __ll2double_rz(long long int x); @@ -3210,7 +3210,7 @@ __ll2double_rz __ll2float_rd ****************** -:: +:: __device__ float __ll2float_rd(long long int x); @@ -3220,7 +3220,7 @@ __ll2float_rd __ll2float_rn ****************** -:: +:: __device__ float __ll2float_rn(long long int x); @@ -3230,7 +3230,7 @@ __ll2float_rn __ll2float_ru ****************** -:: +:: __device__ float __ll2float_ru(long long int x); @@ -3240,7 +3240,7 @@ __ll2float_ru __ll2float_rz ****************** -:: +:: __device__ float __ll2float_rz(long long int x); @@ -3250,7 +3250,7 @@ __ll2float_rz __longlong_as_double *************************** -:: +:: __device__ double __longlong_as_double(long long int x); @@ -3260,7 +3260,7 @@ __longlong_as_double __uint2double_rn ****************** -:: +:: __device__ double __uint2double_rn(int x); @@ -3270,7 +3270,7 @@ __uint2double_rn __uint2float_rd ****************** -:: +:: __device__ float __uint2float_rd(unsigned int x); @@ -3280,7 +3280,7 @@ __uint2float_rd 
__uint2float_rn ****************** -:: +:: __device__ float __uint2float_rn(unsigned int x); @@ -3290,7 +3290,7 @@ __uint2float_rn __uint2float_ru ****************** -:: +:: __device__ float __uint2float_ru(unsigned int x); @@ -3300,7 +3300,7 @@ __uint2float_ru __uint2float_rz ****************** -:: +:: __device__ float __uint2float_rz(unsigned int x); @@ -3310,7 +3310,7 @@ __uint2float_rz __uint_as_float ****************** -:: +:: __device__ float __uint_as_float(unsigned int x); @@ -3320,7 +3320,7 @@ __uint_as_float __ull2double_rd ****************** -:: +:: __device__ double __ull2double_rd(unsigned long long int x); @@ -3330,7 +3330,7 @@ __ull2double_rd __ull2double_rn ****************** -:: +:: __device__ double __ull2double_rn(unsigned long long int x); @@ -3340,7 +3340,7 @@ __ull2double_rn __ull2double_ru ****************** -:: +:: __device__ double __ull2double_ru(unsigned long long int x); @@ -3350,7 +3350,7 @@ __ull2double_ru __ull2double_rz ****************** -:: +:: __device__ double __ull2double_rz(unsigned long long int x); @@ -3360,7 +3360,7 @@ __ull2double_rz __ull2float_rd ****************** -:: +:: __device__ float __ull2float_rd(unsigned long long int x); @@ -3370,7 +3370,7 @@ __ull2float_rd __ull2float_rn ****************** -:: +:: __device__ float __ull2float_rn(unsigned long long int x); @@ -3381,7 +3381,7 @@ __ull2float_rn __ull2float_ru ****************** -:: +:: __device__ float __ull2float_ru(unsigned long long int x); @@ -3391,7 +3391,7 @@ __ull2float_ru __ull2float_rz ****************** -:: +:: __device__ float __ull2float_rz(unsigned long long int x); @@ -3401,7 +3401,7 @@ __ull2float_rz __hadd ********* -:: +:: __device__ static __half __hadd(const __half a, const __half b); @@ -3411,7 +3411,7 @@ __hadd __hadd_sat ****************** -:: +:: __device__ static __half __hadd_sat(__half a, __half b); @@ -3421,7 +3421,7 @@ __hadd_sat __hfma ********* -:: +:: __device__ static __half __hfma(__half a, __half b, __half c); @@ -3431,7 +3431,7 @@ __hfma __hfma_sat ********* -:: +:: __device__ static __half __hfma_sat(__half a, __half b, __half c); @@ -3441,7 +3441,7 @@ __hfma_sat __hmul ********* -:: +:: __device__ static __half __hmul(__half a, __half b); @@ -3451,7 +3451,7 @@ __hmul __hmul_sat ********* -:: +:: __device__ static __half __hmul_sat(__half a, __half b); @@ -3461,7 +3461,7 @@ __hmul_sat __hneg ********* -:: +:: __device__ static __half __hneg(__half a); @@ -3471,7 +3471,7 @@ __hneg __hsub ********* -:: +:: __device__ static __half __hsub(__half a, __half b); @@ -3481,7 +3481,7 @@ __hsub __hsub_sat ********* -:: +:: __device__ static __half __hsub_sat(__half a, __half b); @@ -3491,7 +3491,7 @@ __hsub_sat hdiv ********* -:: +:: __device__ static __half hdiv(__half a, __half b); @@ -3501,7 +3501,7 @@ hdiv __hadd2 ********* -:: +:: __device__ static __half2 __hadd2(__half2 a, __half2 b); @@ -3511,7 +3511,7 @@ __hadd2 __hadd2_sat ****************** -:: +:: __device__ static __half2 __hadd2_sat(__half2 a, __half2 b); @@ -3521,7 +3521,7 @@ __hadd2_sat __hfma2 ********* -:: +:: __device__ static __half2 __hfma2(__half2 a, __half2 b, __half2 c); @@ -3531,7 +3531,7 @@ __hfma2 __hfma2_sat ****************** -:: +:: __device__ static __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c); @@ -3541,7 +3541,7 @@ __hfma2_sat __hmul2 ********* -:: +:: __device__ static __half2 __hmul2(__half2 a, __half2 b); @@ -3551,7 +3551,7 @@ __hmul2 __hmul2_sat ****************** -:: +:: __device__ static __half2 __hmul2_sat(__half2 a, __half2 b); @@ -3561,7 +3561,7 @@ __hmul2_sat 
__hsub2 ********* -:: +:: __device__ static __half2 __hsub2(__half2 a, __half2 b); @@ -3571,7 +3571,7 @@ __hsub2 __hneg2 ********* -:: +:: __device__ static __half2 __hneg2(__half2 a); @@ -3581,7 +3581,7 @@ __hneg2 __hsub2_sat ****************** -:: +:: __device__ static __half2 __hsub2_sat(__half2 a, __half2 b); @@ -3591,7 +3591,7 @@ __hsub2_sat h2div ********* -:: +:: __device__ static __half2 h2div(__half2 a, __half2 b); @@ -3601,7 +3601,7 @@ h2div __heq ********* -:: +:: __device__bool __heq(__half a, __half b); @@ -3611,7 +3611,7 @@ __heq __hge ********* -:: +:: __device__bool __hge(__half a, __half b); @@ -3621,7 +3621,7 @@ __hge __hgt ********* -:: +:: __device__bool __hgt(__half a, __half b); @@ -3631,7 +3631,7 @@ __hgt __hisinf ********* -:: +:: __device__bool __hisinf(__half a); @@ -3641,7 +3641,7 @@ __hisinf __hisnan ********* -:: +:: __device__bool __hisnan(__half a); @@ -3651,7 +3651,7 @@ __hisnan __hle ********* -:: +:: __device__bool __hle(__half a, __half b); @@ -3661,7 +3661,7 @@ __hle __hlt ********* -:: +:: __device__bool __hlt(__half a, __half b); @@ -3671,7 +3671,7 @@ __hlt __hne ********* -:: +:: __device__bool __hne(__half a, __half b); @@ -3681,7 +3681,7 @@ __hne __hbeq2 ********* -:: +:: __device__bool __hbeq2(__half2 a, __half2 b); @@ -3691,7 +3691,7 @@ __hbeq2 __hbge2 ********* -:: +:: __device__bool __hbge2(__half2 a, __half2 b); @@ -3701,7 +3701,7 @@ __hbge2 __hbgt2 ********* -:: +:: __device__bool __hbgt2(__half2 a, __half2 b); @@ -3711,7 +3711,7 @@ __hbgt2 __hble2 ********* -:: +:: __device__bool __hble2(__half2 a, __half2 b); @@ -3721,7 +3721,7 @@ __hble2 __hblt2 ********* -:: +:: __device__bool __hblt2(__half2 a, __half2 b); @@ -3731,7 +3731,7 @@ __hblt2 __hbne2 ********* -:: +:: __device__bool __hbne2(__half2 a, __half2 b); @@ -3741,7 +3741,7 @@ __hbne2 __heq2 ********* -:: +:: __device____half2 __heq2(__half2 a, __half2 b); @@ -3751,7 +3751,7 @@ __heq2 __hge2 ********* -:: +:: __device____half2 __hge2(__half2 a, __half2 b); @@ -3761,7 +3761,7 @@ __hge2 __hgt2 ********* -:: +:: __device____half2 __hgt2(__half2 a, __half2 b); @@ -3771,7 +3771,7 @@ __hgt2 __hisnan2 ********* -:: +:: __device____half2 __hisnan2(__half2 a); @@ -3781,7 +3781,7 @@ __hisnan2 __hle2 ********* -:: +:: __device____half2 __hle2(__half2 a, __half2 b); @@ -3791,7 +3791,7 @@ __hle2 __hlt2 ********* -:: +:: __device____half2 __hlt2(__half2 a, __half2 b); @@ -3801,7 +3801,7 @@ __hlt2 __hne2 ********* -:: +:: __device____half2 __hne2(__half2 a, __half2 b); @@ -3811,7 +3811,7 @@ __hne2 hceil ********* -:: +:: __device__ static __half hceil(const __half h); @@ -3821,7 +3821,7 @@ hceil hcos ********* -:: +:: __device__ static __half hcos(const __half h); @@ -3831,8 +3831,8 @@ hcos hexp ********* -:: - +:: + __device__ static __half hexp(const __half h); @@ -3841,7 +3841,7 @@ hexp hexp10 ********* -:: +:: __device__ static __half hexp10(const __half h); @@ -3851,7 +3851,7 @@ hexp10 hexp2 ********* -:: +:: __device__ static __half hexp2(const __half h); @@ -3861,7 +3861,7 @@ hexp2 hfloor ********* -:: +:: __device__ static __half hfloor(const __half h); @@ -3871,7 +3871,7 @@ hfloor hlog ********* -:: +:: __device__ static __half hlog(const __half h); @@ -3881,7 +3881,7 @@ hlog hlog10 ********* -:: +:: __device__ static __half hlog10(const __half h); @@ -3891,7 +3891,7 @@ hlog10 hlog2 ********* -:: +:: __device__ static __half hlog2(const __half h); @@ -3901,8 +3901,8 @@ hlog2 hrcp ********* -:: - +:: + //__device__ static __half hrcp(const __half h); @@ -3911,7 +3911,7 @@ hrcp hrint ********* 
-:: +:: __device__ static __half hrint(const __half h); @@ -3921,7 +3921,7 @@ hrint hsin ********* -:: +:: __device__ static __half hsin(const __half h); @@ -3931,7 +3931,7 @@ hsin hsqrt ********* -:: +:: __device__ static __half hsqrt(const __half a); @@ -3941,7 +3941,7 @@ hsqrt htrunc ********* -:: +:: __device__ static __half htrunc(const __half a); @@ -3951,7 +3951,7 @@ htrunc h2ceil ********* -:: +:: __device__ static __half2 h2ceil(const __half2 h); @@ -3961,7 +3961,7 @@ h2ceil h2exp ********* -:: +:: __device__ static __half2 h2exp(const __half2 h); @@ -3971,7 +3971,7 @@ h2exp h2exp10 ********* -:: +:: __device__ static __half2 h2exp10(const __half2 h); @@ -3981,7 +3981,7 @@ h2exp10 h2exp2 ********* -:: +:: __device__ static __half2 h2exp2(const __half2 h); @@ -3991,7 +3991,7 @@ h2exp2 h2floor ********* -:: +:: __device__ static __half2 h2floor(const __half2 h); @@ -4001,7 +4001,7 @@ h2floor h2log ********* -:: +:: __device__ static __half2 h2log(const __half2 h); @@ -4011,7 +4011,7 @@ h2log h2log10 ********* -:: +:: __device__ static __half2 h2log10(const __half2 h); @@ -4021,7 +4021,7 @@ h2log10 h2log2 ********* -:: +:: __device__ static __half2 h2log2(const __half2 h); @@ -4031,7 +4031,7 @@ h2log2 h2rcp ********* -:: +:: __device__ static __half2 h2rcp(const __half2 h); @@ -4041,8 +4041,8 @@ h2rcp h2rsqrt ********* -:: - +:: + __device__ static __half2 h2rsqrt(const __half2 h); @@ -4050,8 +4050,8 @@ h2rsqrt h2sin -********* -:: +********* +:: __device__ static __half2 h2sin(const __half2 h); @@ -4061,8 +4061,8 @@ h2sin h2sqrt ********* -:: - +:: + __device__ static __half2 h2sqrt(const __half2 h); @@ -4071,7 +4071,7 @@ h2sqrt __float22half2_rn ****************** -:: +:: __device____half2 __float22half2_rn(const float2 a); @@ -4081,7 +4081,7 @@ __float22half2_rn __float2half ****************** -:: +:: __device____half __float2half(const float a); @@ -4091,8 +4091,8 @@ __float2half __float2half2_rn ****************** -:: - +:: + __device____half2 __float2half2_rn(const float a); @@ -4101,7 +4101,7 @@ __float2half2_rn __float2half_rd ****************** -:: +:: __device____half __float2half_rd(const float a); @@ -4111,7 +4111,7 @@ __float2half_rd __float2half_rn ****************** -:: +:: __device____half __float2half_rn(const float a); @@ -4121,7 +4121,7 @@ __float2half_rn __float2half_ru ****************** -:: +:: __device____half __float2half_ru(const float a); @@ -4131,7 +4131,7 @@ __float2half_ru __float2half_rz ****************** -:: +:: __device____half __float2half_rz(const float a); @@ -4141,7 +4141,7 @@ __float2half_rz __floats2half2_rn ****************** -:: +:: __device____half2 __floats2half2_rn(const float a, const float b); @@ -4151,7 +4151,7 @@ __floats2half2_rn __half22float2 ****************** -:: +:: __device__float2 __half22float2(const __half2 a); @@ -4161,7 +4161,7 @@ __half22float2 __half2float ****************** -:: +:: __device__float __half2float(const __half a); @@ -4171,7 +4171,7 @@ __half2float half2half2 ****************** -:: +:: __device____half2 half2half2(const __half a); @@ -4181,7 +4181,7 @@ half2half2 __half2int_rd ****************** -:: +:: __device__int __half2int_rd(__half h); @@ -4191,7 +4191,7 @@ __half2int_rd __half2int_rn ****************** -:: +:: __device__int __half2int_rn(__half h); @@ -4201,7 +4201,7 @@ __half2int_rn __half2int_ru ****************** -:: +:: __device__int __half2int_ru(__half h); @@ -4211,7 +4211,7 @@ __half2int_ru __half2int_rz ****************** -:: +:: __device__int __half2int_rz(__half h); @@ -4221,7 +4221,7 @@ 
__half2int_rz __half2ll_rd ****************** -:: +:: __device__long long int __half2ll_rd(__half h); @@ -4231,7 +4231,7 @@ __half2ll_rd __half2ll_rn ****************** -:: +:: __device__long long int __half2ll_rn(__half h); @@ -4241,7 +4241,7 @@ __half2ll_rn __half2ll_ru ****************** -:: +:: __device__long long int __half2ll_ru(__half h); @@ -4251,7 +4251,7 @@ __half2ll_ru __half2ll_rz ****************** -:: +:: __device__long long int __half2ll_rz(__half h); @@ -4261,7 +4261,7 @@ __half2ll_rz __half2short_rd ****************** -:: +:: __device__short __half2short_rd(__half h); @@ -4271,7 +4271,7 @@ __half2short_rd __half2short_rn ****************** -:: +:: __device__short __half2short_rn(__half h); @@ -4281,7 +4281,7 @@ __half2short_rn __half2short_ru ****************** -:: +:: __device__short __half2short_ru(__half h); @@ -4292,7 +4292,7 @@ __half2short_ru __half2short_rz ****************** -:: +:: __device__short __half2short_rz(__half h); @@ -4302,7 +4302,7 @@ __half2short_rz __half2uint_rd ****************** -:: +:: __device__unsigned int __half2uint_rd(__half h); @@ -4312,7 +4312,7 @@ __half2uint_rd __half2uint_rn ****************** -:: +:: __device__unsigned int __half2uint_rn(__half h); @@ -4322,7 +4322,7 @@ __half2uint_rn __half2uint_ru ****************** -:: +:: __device__unsigned int __half2uint_ru(__half h); @@ -4332,7 +4332,7 @@ __half2uint_ru __half2uint_rz ****************** -:: +:: __device__unsigned int __half2uint_rz(__half h); @@ -4342,7 +4342,7 @@ __half2uint_rz __half2ull_rd ****************** -:: +:: __device__unsigned long long int __half2ull_rd(__half h); @@ -4352,7 +4352,7 @@ __half2ull_rd __half2ull_rn ****************** -:: +:: __device__unsigned long long int __half2ull_rn(__half h); @@ -4362,7 +4362,7 @@ __half2ull_rn __half2ull_ru ****************** -:: +:: __device__unsigned long long int __half2ull_ru(__half h); @@ -4372,7 +4372,7 @@ __half2ull_ru __half2ull_rz ****************** -:: +:: __device__unsigned long long int __half2ull_rz(__half h); @@ -4382,7 +4382,7 @@ __half2ull_rz __half2ushort_rd ****************** -:: +:: __device__unsigned short int __half2ushort_rd(__half h); @@ -4392,7 +4392,7 @@ __half2ushort_rd __half2ushort_rn ****************** -:: +:: __device__unsigned short int __half2ushort_rn(__half h); @@ -4402,7 +4402,7 @@ __half2ushort_rn __half2ushort_ru ****************** -:: +:: __device__unsigned short int __half2ushort_ru(__half h); @@ -4412,7 +4412,7 @@ __half2ushort_ru __half2ushort_rz ****************** -:: +:: __device__unsigned short int __half2ushort_rz(__half h); @@ -4422,7 +4422,7 @@ __half2ushort_rz __half_as_short ****************** -:: +:: __device__short int __half_as_short(const __half h); @@ -4432,7 +4432,7 @@ __half_as_short __half_as_ushort ****************** -:: +:: __device__unsigned short int __half_as_ushort(const __half h); @@ -4442,7 +4442,7 @@ __half_as_ushort __halves2half2 ****************** -:: +:: __device____half2 __halves2half2(const __half a, const __half b); @@ -4452,8 +4452,8 @@ __halves2half2 __high2float ****************** -:: - +:: + __device__float __high2float(const __half2 a); @@ -4462,7 +4462,7 @@ __high2float __high2half ****************** -:: +:: __device____half __high2half(const __half2 a); @@ -4472,7 +4472,7 @@ __high2half __high2half2 ****************** -:: +:: __device____half2 __high2half2(const __half2 a); @@ -4482,7 +4482,7 @@ __high2half2 __highs2half2 ****************** -:: +:: __device____half2 __highs2half2(const __half2 a, const __half2 b); @@ -4492,7 +4492,7 @@ __highs2half2 
__int2half_rd ****************** -:: +:: __device____half __int2half_rd(int i); @@ -4502,7 +4502,7 @@ __int2half_rd __int2half_rn ****************** -:: +:: __device____half __int2half_rn(int i); @@ -4512,7 +4512,7 @@ __int2half_rn __int2half_ru ****************** -:: +:: __device____half __int2half_ru(int i); @@ -4522,7 +4522,7 @@ __int2half_ru __int2half_rz ****************** -:: +:: __device____half __int2half_rz(int i); @@ -4532,7 +4532,7 @@ __int2half_rz __ll2half_rd ****************** -:: +:: __device____half __ll2half_rd(long long int i); @@ -4542,7 +4542,7 @@ __ll2half_rd __ll2half_rn ****************** -:: +:: __device____half __ll2half_rn(long long int i); @@ -4552,7 +4552,7 @@ __ll2half_rn __ll2half_ru ****************** -:: +:: __device____half __ll2half_ru(long long int i); @@ -4562,7 +4562,7 @@ __ll2half_ru __ll2half_rz ****************** -:: +:: __device____half __ll2half_rz(long long int i); @@ -4572,7 +4572,7 @@ __ll2half_rz __low2float ****************** -:: +:: __device__float __low2float(const __half2 a); @@ -4582,7 +4582,7 @@ __low2float __low2half ****************** -:: +:: __device__ __half __low2half(const __half2 a); @@ -4592,7 +4592,7 @@ __low2half __low2half2 ****************** -:: +:: __device__ __half2 __low2half2(const __half2 a, const __half2 b); @@ -4602,7 +4602,7 @@ __low2half2 __low2half2 ****************** -:: +:: __device__ __half2 __low2half2(const __half2 a); @@ -4612,7 +4612,7 @@ __low2half2 __lowhigh2highlow ****************** -:: +:: __device__ __half2 __lowhigh2highlow(const __half2 a); @@ -4622,7 +4622,7 @@ __lowhigh2highlow __lows2half2 ****************** -:: +:: __device__ __half2 __lows2half2(const __half2 a, const __half2 b); @@ -4632,7 +4632,7 @@ __lows2half2 __short2half_rd ****************** -:: +:: __device____half __short2half_rd(short int i); @@ -4642,7 +4642,7 @@ __short2half_rd __short2half_rn ****************** -:: +:: __device____half __short2half_rn(short int i); @@ -4652,7 +4652,7 @@ __short2half_rn __short2half_ru ****************** -:: +:: __device____half __short2half_ru(short int i); @@ -4662,7 +4662,7 @@ __short2half_ru __short2half_rz ****************** -:: +:: __device____half __short2half_rz(short int i); @@ -4672,7 +4672,7 @@ __short2half_rz __uint2half_rd ****************** -:: +:: __device____half __uint2half_rd(unsigned int i); @@ -4682,7 +4682,7 @@ __uint2half_rd __uint2half_rn ****************** -:: +:: __device____half __uint2half_rn(unsigned int i); @@ -4692,7 +4692,7 @@ __uint2half_rn __uint2half_ru ****************** -:: +:: __device____half __uint2half_ru(unsigned int i); @@ -4702,7 +4702,7 @@ __uint2half_ru __uint2half_rz ****************** -:: +:: __device____half __uint2half_rz(unsigned int i); @@ -4712,7 +4712,7 @@ __uint2half_rz __ull2half_rd ****************** -:: +:: __device____half __ull2half_rd(unsigned long long int i); @@ -4722,7 +4722,7 @@ __ull2half_rd __ull2half_rn ****************** -:: +:: __device____half __ull2half_rn(unsigned long long int i); @@ -4732,7 +4732,7 @@ __ull2half_rn __ull2half_ru ****************** -:: +:: __device____half __ull2half_ru(unsigned long long int i); @@ -4742,8 +4742,8 @@ __ull2half_ru __ull2half_rz ****************** -:: - +:: + __device____half __ull2half_rz(unsigned long long int i); @@ -4752,7 +4752,7 @@ __ull2half_rz __ushort2half_rd ********* -:: +:: __device____half __ushort2half_rd(unsigned short int i); @@ -4762,7 +4762,7 @@ __ushort2half_rd __ushort2half_rn ****************** -:: +:: __device____half __ushort2half_rn(unsigned short int i); @@ -4772,7 +4772,7 
@@ __ushort2half_rn __ushort2half_ru ****************** -:: +:: __device____half __ushort2half_ru(unsigned short int i); @@ -4782,7 +4782,7 @@ __ushort2half_ru __ushort2half_rz ****************** -:: +:: __device____half __ushort2half_rz(unsigned short int i); @@ -4792,7 +4792,7 @@ __ushort2half_rz __ushort_as_half ****************** -:: +:: __device____half __ushort_as_half(const unsigned short int i); diff --git a/ROCm_API_References/HIP_API/Context-Management.rst b/ROCm_API_References/HIP_API/Context-Management.rst index c5e08c69..ae7a6d34 100644 --- a/ROCm_API_References/HIP_API/Context-Management.rst +++ b/ROCm_API_References/HIP_API/Context-Management.rst @@ -15,29 +15,29 @@ hipCtxPopCurrent ---------------- .. doxygenfunction:: hipCtxPopCurrent -hipCtxPushCurrent +hipCtxPushCurrent ------------------ -.. doxygenfunction:: hipCtxPushCurrent +.. doxygenfunction:: hipCtxPushCurrent -hipCtxSetCurrent +hipCtxSetCurrent ---------------- -.. doxygenfunction:: hipCtxSetCurrent +.. doxygenfunction:: hipCtxSetCurrent -hipCtxGetCurrent +hipCtxGetCurrent ---------------- -.. doxygenfunction:: hipCtxGetCurrent +.. doxygenfunction:: hipCtxGetCurrent -hipCtxGetDevice +hipCtxGetDevice ---------------- -.. doxygenfunction:: hipCtxGetDevice +.. doxygenfunction:: hipCtxGetDevice -hipCtxGetApiVersion +hipCtxGetApiVersion -------------------- -.. doxygenfunction:: hipCtxGetApiVersion +.. doxygenfunction:: hipCtxGetApiVersion -hipCtxGetCacheConfig +hipCtxGetCacheConfig ---------------------- -.. doxygenfunction:: hipCtxGetCacheConfig +.. doxygenfunction:: hipCtxGetCacheConfig hipCtxSetSharedMemConfig -------------------------- @@ -47,25 +47,25 @@ hipCtxGetSharedMemConfig -------------------------- .. doxygenfunction:: hipCtxGetSharedMemConfig -hipCtxSynchronize +hipCtxSynchronize ------------------ -.. doxygenfunction:: hipCtxSynchronize +.. doxygenfunction:: hipCtxSynchronize -hipCtxGetFlags +hipCtxGetFlags ---------------- -.. doxygenfunction:: hipCtxGetFlags +.. doxygenfunction:: hipCtxGetFlags -hipCtxEnablePeerAccess +hipCtxEnablePeerAccess ------------------------ -.. doxygenfunction:: hipCtxEnablePeerAccess +.. doxygenfunction:: hipCtxEnablePeerAccess -hipCtxDisablePeerAccess +hipCtxDisablePeerAccess ------------------------ -.. doxygenfunction:: hipCtxDisablePeerAccess +.. doxygenfunction:: hipCtxDisablePeerAccess -hipDevicePrimaryCtxGetState +hipDevicePrimaryCtxGetState ----------------------------- -.. doxygenfunction:: hipDevicePrimaryCtxGetState +.. doxygenfunction:: hipDevicePrimaryCtxGetState hipDevicePrimaryCtxRelease ---------------------------- @@ -77,11 +77,11 @@ hipDevicePrimaryCtxRetain hipDevicePrimaryCtxReset --------------------------- -.. doxygenfunction:: hipDevicePrimaryCtxReset +.. doxygenfunction:: hipDevicePrimaryCtxReset -hipDevicePrimaryCtxSetFlags +hipDevicePrimaryCtxSetFlags ---------------------------- -.. doxygenfunction:: hipDevicePrimaryCtxSetFlags +.. doxygenfunction:: hipDevicePrimaryCtxSetFlags diff --git a/ROCm_API_References/HIP_API/Control.rst b/ROCm_API_References/HIP_API/Control.rst index f85012b5..239d4fe6 100644 --- a/ROCm_API_References/HIP_API/Control.rst +++ b/ROCm_API_References/HIP_API/Control.rst @@ -7,9 +7,9 @@ hipProfilerStart ---------------- .. doxygenfunction:: hipProfilerStart -hipProfilerStop +hipProfilerStop ---------------- -.. doxygenfunction::hipProfilerStop +.. 
doxygenfunction::hipProfilerStop diff --git a/ROCm_API_References/HIP_API/Device-Memory-Access.rst b/ROCm_API_References/HIP_API/Device-Memory-Access.rst index fc35e2a8..42d98d50 100644 --- a/ROCm_API_References/HIP_API/Device-Memory-Access.rst +++ b/ROCm_API_References/HIP_API/Device-Memory-Access.rst @@ -7,9 +7,9 @@ hipDeviceCanAccessPeer ------------------------ .. doxygenfunction:: hipDeviceCanAccessPeer -hipDeviceEnablePeerAccess +hipDeviceEnablePeerAccess --------------------------- -.. doxygenfunction:: hipDeviceEnablePeerAccess +.. doxygenfunction:: hipDeviceEnablePeerAccess hipDeviceDisablePeerAccess ---------------------------- @@ -23,8 +23,8 @@ hipMemcpyPeer ------------------------ .. doxygenfunction:: hipMemcpyPeer -hipMemcpyPeerAsync +hipMemcpyPeerAsync ------------------------ -.. doxygenfunction:: hipMemcpyPeerAsync +.. doxygenfunction:: hipMemcpyPeerAsync diff --git a/ROCm_API_References/HIP_API/Device-management.rst b/ROCm_API_References/HIP_API/Device-management.rst index 070e429b..81a8ef22 100644 --- a/ROCm_API_References/HIP_API/Device-management.rst +++ b/ROCm_API_References/HIP_API/Device-management.rst @@ -1,20 +1,20 @@ .. _Device-management: - + Device management ================== Device management types and functions. -hipDeviceSynchronize +hipDeviceSynchronize ----------------------- -.. doxygenfunction:: hipDeviceSynchronize +.. doxygenfunction:: hipDeviceSynchronize -hipDeviceReset +hipDeviceReset --------------- -.. doxygenfunction:: hipDeviceReset +.. doxygenfunction:: hipDeviceReset hipSetDevice ------------- @@ -24,7 +24,7 @@ hipSetDevice hipGetDevice ---------------- -.. doxygenfunction:: hipGetDevice +.. doxygenfunction:: hipGetDevice hipGetDeviceCount ----------------- @@ -53,14 +53,14 @@ hipDeviceGetLimit ------------------ .. doxygenfunction:: hipDeviceGetLimit -hipFuncSetCacheConfig +hipFuncSetCacheConfig ---------------------- -.. doxygenfunction:: hipFuncSetCacheConfig +.. doxygenfunction:: hipFuncSetCacheConfig -hipDeviceGetSharedMemConfig +hipDeviceGetSharedMemConfig --------------------------- -.. doxygenfunction:: hipDeviceGetSharedMemConfig +.. doxygenfunction:: hipDeviceGetSharedMemConfig hipDeviceSetSharedMemConfig ---------------------------- @@ -72,9 +72,9 @@ hipSetDeviceFlags .. doxygenfunction:: hipSetDeviceFlags -hipChooseDevice +hipChooseDevice ---------------- -.. doxygenfunction:: hipChooseDevice +.. doxygenfunction:: hipChooseDevice diff --git a/ROCm_API_References/HIP_API/Error.rst b/ROCm_API_References/HIP_API/Error.rst index 36f256f0..292ea368 100644 --- a/ROCm_API_References/HIP_API/Error.rst +++ b/ROCm_API_References/HIP_API/Error.rst @@ -6,17 +6,17 @@ Error Handling Error Handling types and functions. -hipGetLastError +hipGetLastError ---------------- -.. doxygenfunction:: hipGetLastError +.. doxygenfunction:: hipGetLastError -hipPeekAtLastError +hipPeekAtLastError ------------------- -.. doxygenfunction:: hipPeekAtLastError +.. doxygenfunction:: hipPeekAtLastError -hipGetErrorName +hipGetErrorName ---------------- -.. doxygenfunction:: hipGetErrorName +.. doxygenfunction:: hipGetErrorName hipGetErrorString ------------------- diff --git a/ROCm_API_References/HIP_API/Event-Management.rst b/ROCm_API_References/HIP_API/Event-Management.rst index 19d19993..f65d4d8b 100644 --- a/ROCm_API_References/HIP_API/Event-Management.rst +++ b/ROCm_API_References/HIP_API/Event-Management.rst @@ -3,13 +3,13 @@ Event Management ================= -hipEventCreateWithFlags +hipEventCreateWithFlags ------------------------ -.. 
doxygenfunction:: hipEventCreateWithFlags +.. doxygenfunction:: hipEventCreateWithFlags -hipEventCreate +hipEventCreate ---------------- -.. doxygenfunction:: hipEventCreate +.. doxygenfunction:: hipEventCreate hipEventRecord ---------------- @@ -29,7 +29,7 @@ hipEventElapsedTime hipEventQuery S ---------------- -.. doxygenfunction:: hipEventQuery +.. doxygenfunction:: hipEventQuery diff --git a/ROCm_API_References/HIP_API/Initialization-and-Version.rst b/ROCm_API_References/HIP_API/Initialization-and-Version.rst index b2b45e94..5b1da2db 100644 --- a/ROCm_API_References/HIP_API/Initialization-and-Version.rst +++ b/ROCm_API_References/HIP_API/Initialization-and-Version.rst @@ -12,21 +12,21 @@ hipDeviceGet ---------------- .. doxygenfunction:: hipDeviceGet -hipDeviceComputeCapability +hipDeviceComputeCapability ----------------------------- -.. doxygenfunction:: hipDeviceComputeCapability +.. doxygenfunction:: hipDeviceComputeCapability -hipDeviceGetName +hipDeviceGetName ---------------- -.. doxygenfunction:: hipDeviceGetName +.. doxygenfunction:: hipDeviceGetName -hipDeviceGetPCIBusId +hipDeviceGetPCIBusId --------------------- -.. doxygenfunction:: hipDeviceGetPCIBusId +.. doxygenfunction:: hipDeviceGetPCIBusId -hipDeviceGetByPCIBusId +hipDeviceGetByPCIBusId ----------------------- -.. doxygenfunction:: hipDeviceGetByPCIBusId +.. doxygenfunction:: hipDeviceGetByPCIBusId hipDeviceTotalMem --------------------- @@ -44,9 +44,9 @@ hipModuleLoad ---------------- .. doxygenfunction:: hipModuleLoad -hipModuleUnload +hipModuleUnload ---------------- -.. doxygenfunction:: hipModuleUnload +.. doxygenfunction:: hipModuleUnload hipModuleGetFunction --------------------- @@ -64,8 +64,8 @@ hipModuleLoadDataEx -------------------- .. doxygenfunction:: hipModuleLoadDataEx -hipModuleLaunchKernel +hipModuleLaunchKernel ---------------------- -.. doxygenfunction:: hipModuleLaunchKernel +.. doxygenfunction:: hipModuleLaunchKernel diff --git a/ROCm_API_References/HIP_API/Memory-Management.rst b/ROCm_API_References/HIP_API/Memory-Management.rst index 07da5255..fd53354c 100644 --- a/ROCm_API_References/HIP_API/Memory-Management.rst +++ b/ROCm_API_References/HIP_API/Memory-Management.rst @@ -7,9 +7,9 @@ hipPointerGetAttributes ------------------------ .. doxygenfunction:: hipPointerGetAttributes -hipMalloc +hipMalloc ------------------------ -.. doxygenfunction:: hipMalloc +.. doxygenfunction:: hipMalloc hipMallocHost ------------------------ @@ -19,9 +19,9 @@ hipHostMalloc ------------------------ .. doxygenfunction:: hipHostMalloc -hipHostAlloc +hipHostAlloc ------------------------ -.. doxygenfunction:: hipHostAlloc +.. doxygenfunction:: hipHostAlloc hipHostGetDevicePointer ------------------------ @@ -31,9 +31,9 @@ hipHostGetFlags ------------------------ .. doxygenfunction:: hipHostGetFlags -hipHostRegister +hipHostRegister ------------------------ -.. doxygenfunction:: hipHostRegister +.. doxygenfunction:: hipHostRegister hipHostUnregister ------------------------ @@ -51,9 +51,9 @@ hipFreeHost ------------------------ .. doxygenfunction:: hipFreeHost -hipMemcpy +hipMemcpy ------------------------ -.. doxygenfunction:: hipMemcpy +.. doxygenfunction:: hipMemcpy hipMemcpyHtoD ------------------------ @@ -87,13 +87,13 @@ hipMemcpyToSymbolAsync ------------------------ .. doxygenfunction:: hipMemcpyToSymbolAsync -hipMemcpyFromSymbol +hipMemcpyFromSymbol ------------------------ -.. doxygenfunction:: hipMemcpyFromSymbol +.. 
doxygenfunction:: hipMemcpyFromSymbol -hipMemcpyFromSymbolAsync +hipMemcpyFromSymbolAsync ------------------------ -.. doxygenfunction:: hipMemcpyFromSymbolAsync +.. doxygenfunction:: hipMemcpyFromSymbolAsync hipMemcpyAsync ------------------------ @@ -103,21 +103,21 @@ hipMemset ------------------------ .. doxygenfunction:: hipMemset -hipMemsetD8 +hipMemsetD8 ------------------------ -.. doxygenfunction:: hipMemsetD8 +.. doxygenfunction:: hipMemsetD8 -hipMemsetAsync +hipMemsetAsync ------------------------ -.. doxygenfunction:: hipMemsetAsync +.. doxygenfunction:: hipMemsetAsync -hipMemset2D +hipMemset2D ------------------------ -.. doxygenfunction:: hipMemset2D +.. doxygenfunction:: hipMemset2D -hipMemGetInfo +hipMemGetInfo ------------------------ -.. doxygenfunction:: hipMemGetInfo +.. doxygenfunction:: hipMemGetInfo hipMemPtrGetInfo ------------------------ diff --git a/ROCm_API_References/HIP_API/Stream-Management.rst b/ROCm_API_References/HIP_API/Stream-Management.rst index 946ef584..3011e056 100644 --- a/ROCm_API_References/HIP_API/Stream-Management.rst +++ b/ROCm_API_References/HIP_API/Stream-Management.rst @@ -19,21 +19,21 @@ hipDeviceGetStreamPriorityRange -------------------------------- .. doxygenfunction:: hipDeviceGetStreamPriorityRange -hipStreamDestroy +hipStreamDestroy ---------------- -.. doxygenfunction:: hipStreamDestroy +.. doxygenfunction:: hipStreamDestroy -hipStreamQuery +hipStreamQuery ---------------- -.. doxygenfunction:: hipStreamQuery +.. doxygenfunction:: hipStreamQuery hipStreamSynchronize --------------------- .. doxygenfunction:: hipStreamSynchronize -hipStreamWaitEvent +hipStreamWaitEvent ------------------- -.. doxygenfunction:: hipStreamWaitEvent +.. doxygenfunction:: hipStreamWaitEvent hipStreamGetFlags ---------------- @@ -43,6 +43,6 @@ hipStreamGetPriority --------------------- .. doxygenfunction:: hipStreamGetPriority -hipStreamAddCallback +hipStreamAddCallback --------------------- -.. doxygenfunction:: hipStreamAddCallback +.. doxygenfunction:: hipStreamAddCallback diff --git a/ROCm_API_References/ROCr-API.rst b/ROCm_API_References/ROCr-API.rst index b0595b7e..8ad3d346 100644 --- a/ROCm_API_References/ROCr-API.rst +++ b/ROCm_API_References/ROCr-API.rst @@ -24,10 +24,10 @@ common definition Initialization and Shut Down ----------------------------- -.. doxygenfunction:: hsa_init() +.. doxygenfunction:: hsa_init() :project: rocr -.. doxygenfunction:: hsa_shut_down() +.. doxygenfunction:: hsa_shut_down() :project: rocr System and Agent Information @@ -70,12 +70,12 @@ System and Agent Information .. doxygenfunction:: hsa_agent_get_info() :project: rocr -.. doxygenfunction:: hsa_agent_iterate_caches() +.. doxygenfunction:: hsa_agent_iterate_caches() :project: rocr - + .. doxygenfunction:: hsa_agent_major_extension_supported() :project: rocr - + .. doxygenfunction:: hsa_cache_get_info() :project: rocr diff --git a/ROCm_API_References/Thrust.rst b/ROCm_API_References/Thrust.rst index de16e55b..0c868090 100644 --- a/ROCm_API_References/Thrust.rst +++ b/ROCm_API_References/Thrust.rst @@ -1,7 +1,7 @@ .. 
_HIP-thrust: -hipThrust +hipThrust ########## HIP back-end for Thrust @@ -27,7 +27,7 @@ AMD ROCm Installation $ sudo sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list' $ sudo apt-get update $ sudo apt install rocm-dkms - + Thrust Build Steps: :: $ git clone https://github.com/ROCmSoftwarePlatform/Thrust.git @@ -46,13 +46,13 @@ Steps to follow: $ cd examples $ ./cu_to_cpp.sh $ ./script_compile_testing_hcc.sh - + To execute applications: :: $ cd Thrust/ $ ./script_run_hcc.sh foldername (eg:examples/testing/performance) - + Sample applications @@ -69,7 +69,7 @@ transform_iterator: sequence : 0 1 2 3 4 5 6 7 8 9 clamped sequence : 1 1 2 3 4 5 5 5 5 5 negated sequence : -1 -1 -2 -3 -4 -5 -5 -5 -5 -5 - negated values : -2 -5 -7 -1 -6 0 -3 -8 + negated values : -2 -5 -7 -1 -6 0 -3 -8 sort: :: @@ -106,38 +106,38 @@ expand: :: $ ./expand.out Expanding values according to counts - counts 3 5 2 0 1 3 4 2 4 - values 1 2 3 4 5 6 7 8 9 - output 1 1 1 2 2 2 2 2 3 3 5 6 6 6 7 7 7 7 8 8 9 9 9 9 - + counts 3 5 2 0 1 3 4 2 4 + values 1 2 3 4 5 6 7 8 9 + output 1 1 1 2 2 2 2 2 3 3 5 6 6 6 7 7 7 7 8 8 9 9 9 9 + Unit Test ************ -| The test suite consists of unit tests. +| The test suite consists of unit tests. | Run the following commands to perform unit testing of different components of Thrust. .. note:: Set HIP_PLATFORM to either NVCC or HCC depending on the platform being used :: - + $ cd Thrust/testing $ ./cu_to_cpp.sh $ ./script_compile_testing_hcc.sh -To execute unit tests: +To execute unit tests: :: $ cd Thrust/ $ ./script_run_hcc.sh testing/ Sample output of transform and Max element test cases :: - - ./transform.out + + ./transform.out Running 34 unit tests. .................................. Totals: 0 failures, 0 known failures, 0 errors, and 34 passes. Time: 0.366667 minutes - + ./max_element.out Running 7 unit tests. .................................. @@ -152,20 +152,20 @@ Run the following commands to exercise Performance tests in Thrust .. note:: Set HIP_PLATFORM to either NVCC or HCC depending on the platform being used :: - + $ cd Thrust/performance $ ./script_compile_performance.sh -To execute performance tests: -:: +To execute performance tests: +:: $ cd Thrust/ $ ./script_run_hcc.sh performance/ - + :: - + ./adjacent_difference.cpp.out - + @@ -195,7 +195,7 @@ To execute performance tests: - + diff --git a/ROCm_API_References/api.rst b/ROCm_API_References/api.rst index bdfb6ff3..bf80aac6 100644 --- a/ROCm_API_References/api.rst +++ b/ROCm_API_References/api.rst @@ -1,12 +1,12 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* rocSOLVER API ************* -This section provides details of the rocSOLVER library API as in release +This section provides details of the rocSOLVER library API as in release `ROCm 2.10 `_. @@ -14,7 +14,7 @@ This section provides details of the rocSOLVER library API as in release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -312,7 +312,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. 
rocSOLVER handle auxiliaries diff --git a/ROCm_API_References/clBLAS.rst b/ROCm_API_References/clBLAS.rst index 4ee29ec2..dacab15c 100644 --- a/ROCm_API_References/clBLAS.rst +++ b/ROCm_API_References/clBLAS.rst @@ -3,13 +3,13 @@ clBLAS API Documentation ========================= -This is an implementation of Basic Linear Algebra Subprograms, levels 1, 2 and 3 using OpenCL and optimized for the AMD GPU hardware. +This is an implementation of Basic Linear Algebra Subprograms, levels 1, 2 and 3 using OpenCL and optimized for the AMD GPU hardware. * `BLAS1 `_ - The Level 1 Basic Linear Algebra Subprograms are functions that perform vector-vector operations. + The Level 1 Basic Linear Algebra Subprograms are functions that perform vector-vector operations. * `BLAS2 `_ - The Level 2 Basic Linear Algebra Subprograms are functions that perform matrix-vector operations. + The Level 2 Basic Linear Algebra Subprograms are functions that perform matrix-vector operations. * `BLAS3 `_ - The Level 3 Basic Linear Algebra Subprograms are funcions that perform matrix-matrix operations. + The Level 3 Basic Linear Algebra Subprograms are funcions that perform matrix-matrix operations. diff --git a/ROCm_API_References/clSPARSE_API.rst b/ROCm_API_References/clSPARSE_API.rst index ae5de082..7ae923ed 100644 --- a/ROCm_API_References/clSPARSE_API.rst +++ b/ROCm_API_References/clSPARSE_API.rst @@ -17,11 +17,11 @@ Routines to initialize a clsparse object .. doxygenfunction:: cldenseInitMatrix() -.. doxygenfunction:: clsparseInitCooMatrix() +.. doxygenfunction:: clsparseInitCooMatrix() -.. doxygenfunction:: clsparseInitCsrMatrix() +.. doxygenfunction:: clsparseInitCsrMatrix() -.. doxygenfunction:: clsparseInitScalar() +.. doxygenfunction:: clsparseInitScalar() .. doxygenfunction:: clsparseInitScalar() diff --git a/ROCm_API_References/clSPARSE_api.rst b/ROCm_API_References/clSPARSE_api.rst index 7d65c9b2..f1f950c1 100644 --- a/ROCm_API_References/clSPARSE_api.rst +++ b/ROCm_API_References/clSPARSE_api.rst @@ -3,13 +3,13 @@ clSPARSE API Documentation ========================== -It is an OpenCL library implementing Sparse linear algebra routines. +It is an OpenCL library implementing Sparse linear algebra routines. - * `Dense L1 BLAS operations `_ + * `Dense L1 BLAS operations `_ Dense BLAS level 1 routines for dense vectors - + * `Sparse L2 BLAS operations `_ Sparse BLAS level 2 routines for sparse matrix dense vector - + * `Sparse L3 BLAS operations `_ - Sparse BLAS level 3 routines for sparse matrix dense matrix + Sparse BLAS level 3 routines for sparse matrix dense matrix diff --git a/ROCm_API_References/rocBLAS.rst b/ROCm_API_References/rocBLAS.rst index 642579ca..daeea376 100644 --- a/ROCm_API_References/rocBLAS.rst +++ b/ROCm_API_References/rocBLAS.rst @@ -1,7 +1,7 @@ .. _rocBLAS: ============ -rocBLAS +rocBLAS ============ .. doxygenclass:: rocblas_handle @@ -13,6 +13,6 @@ rocBLAS :members: -.. doxygenfunction:: +.. doxygenfunction:: :project: rocBLAS :members: diff --git a/ROCm_Audio_Video_Tutorials/ROCm_videos.rst b/ROCm_Audio_Video_Tutorials/ROCm_videos.rst index c50099c0..8e41706f 100644 --- a/ROCm_Audio_Video_Tutorials/ROCm_videos.rst +++ b/ROCm_Audio_Video_Tutorials/ROCm_videos.rst @@ -1,7 +1,7 @@ -Slidecast: For AMD, It’s Time to ROCm! +Slidecast: For AMD, It's Time to ROCm! 
https://youtu.be/LUAu4eywK5g -Video: AMD ROC – Radeon Open Compute Platform +Video: AMD ROC - Radeon Open Compute Platform https://youtu.be/dnKDFci2x2Q diff --git a/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst b/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst index 000890c7..6c23240c 100644 --- a/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst +++ b/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst @@ -113,27 +113,27 @@ The number of enabled registers must match value in compute_pgm_rsrc2.user_sgpr The following table defines SGPR registers that can be enabled and their order. ============ ============== ======================================= ================================================================== -SGPR Order Number +SGPR Order Number of Registers Name Description ============ ============== ======================================= ================================================================== First 4 Private Segment Buffer V# that can be used, together with Scratch Wave Offset as an - (enable_sgpr_private_segment_buffer) offset, to access the Private/Spill/Arg segments using a segment address. CP uses the value from - amd_queue_t.scratch_resource_descriptor. + (enable_sgpr_private_segment_buffer) offset, to access the Private/Spill/Arg segments using a segment address. CP uses the value from + amd_queue_t.scratch_resource_descriptor. then 2 Dispatch Ptr 64 bit address of AQL dispatch packet for kernel actually - (enable_sgpr_dispatch_ptr) executing. + (enable_sgpr_dispatch_ptr) executing. then 2 Queue Ptr 64 bit address of amd_queue_t object for AQL queue on which the (enable_sgpr_queue_ptr) dispatch packet was queued. - + then 2 Kernarg Segment Ptr 64 bit address of Kernarg segment. This is directly copied (enable_sgpr_kernarg_segment_ptr) from the kernarg_address in the kernel dispatch packet. Having CP load it once avoids loading it at the beginning of every wavefront. then 2 Dispatch Id 64 bit Dispatch ID of the dispatch packet being executed. - (enable_sgpr_dispatch_id) + (enable_sgpr_dispatch_id) then 2 Flat Scratch Init Value used for FLAT_SCRATCH register initialization. Refer to (enable_sgpr_flat_scratch_init) Flat scratch for more information. - + then 1 Private Segment Size The 32 bit byte size of a single work-items scratch memory (enable_sgpr_private_segment_size) allocation. This is the value from the kernel dispatch packet Private Segment Byte Size rounded up by CP to a multiple of WORD. Having CP load it once avoids loading it at the beginning of every wavefront. Not used for GFX7/GFX8 since it is the same value as the second SGPR of Flat Scratch Init. @@ -144,15 +144,15 @@ then 1 Grid Work-Group Count Y 32 bit count of the number of work-groups then 1 Grid Work-Group Count Z 32 bit count of the number of work-groups in the Z dimension (enable_sgpr_grid_workgroup_count_Z for the grid being executed. Computed from the fields in the && less than 16 previous SGPRs) kernel dispatch packet as ((grid_size.z + workgroup_size.z - 1) / workgroupSize.z). Only initialized if <16 previous SGPRs initialized. - + then 1 Work-Group Id X 32 bit work group id in X dimension of grid for wavefront. (enable_sgpr_workgroup_id_X) Always present. - + then 1 Work-Group Id Y 32 bit work group id in Y dimension of grid for wavefront. - (enable_sgpr_workgroup_id_Y) + (enable_sgpr_workgroup_id_Y) -then 1 Work-Group Id Z +then 1 Work-Group Id Z (enable_sgpr_workgroup_id_Z) 32 bit work group id in Z dimension of grid for wavefront. If present then Work-group Id Y will also be present. 
then 1 Work-Group Info {first_wave, 14b0000, ordered_append_term[10:0], @@ -160,7 +160,7 @@ then 1 Work-Group Info {first_wave, 14b0000, ordered_append_term[10:0], then 1 | Private Segment Wave Byte Offset 32 bit byte offset from base of scratch base of queue the | (enable_sgpr_private_segment_wave executing kernel dispatch. Must be used as an offset with | _byte_offset) Private/Spill/Arg segment address when using Scratch Segment Buffer. It must be added to Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. - + ============ ============== ======================================= ================================================================== VGPR register numbers used for enabled registers are dense starting at VGPR0: the first enabled register is VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have a VGPR number. @@ -423,7 +423,7 @@ Memory Fence scacq agent+ memfence; s_waitcnt 0; buffer_wbinvl1_vol Memory Fence screl agent+ s_waitcnt 0; memfence Memory Fence scar agent + memfence; s_waitcnt 0; buffer_wbinvl1_vol ============== ==================== ================= ======================================================== - + .. _Instruction-set-architecture: Instruction set architecture @@ -450,7 +450,7 @@ AMD AMDGPU 8 0 1 GFX8, XNACK enabled A10-8700 serie AMD AMDGPU 8 0 2 GFX8, SPI register limitation FirePro S7150, S7100, W7100; Radeon R285, R9 380, XNACK disabled, R9 385; Mobile FirePro M7170 - PCIe Gen3 atomics + PCIe Gen3 atomics AMD AMDGPU 8 0 3 GFX8, XNACK disabled, Radeon R9 Nano, R9 Fury, R9 FuryX, Pro Duo, RX 460, PCIe Gen3 atomics RX 470, RX 480; FirePro S9300x2 @@ -458,7 +458,7 @@ AMD AMDGPU 8 0 3 GFX8, XNACK disabled, Radeon R9 Nano, R9 Fury AMD AMDGPU 8 0 4 GFX8, -XNACK Legacy, Radeon R9 Nano, R9 Fury, R9 FuryX, Pro Duo, RX 460, RX 470, RX 480; FirePro S9300x2 -AMD AMDGPU 9 0 0 GFX9, -XNACK +AMD AMDGPU 9 0 0 GFX9, -XNACK AMD AMDGPU 9 0 1 GFX9, +XNACK ======= ============== ======= ======= ========== ============================== ===================================================== @@ -467,7 +467,7 @@ AMD AMDGPU 9 0 1 GFX9, +XNACK AMD Kernel Code ################### -AMD Kernel Code object is used by AMD GPU CP to set up the hardware to execute a kernel dispatch and consists of the meta data needed to initiate the execution of a kernel, including the entry point address of the machine code that implements +AMD Kernel Code object is used by AMD GPU CP to set up the hardware to execute a kernel dispatch and consists of the meta data needed to initiate the execution of a kernel, including the entry point address of the machine code that implements @@ -693,7 +693,7 @@ AMD_FLOAT_ROUND_MODE_ZERO 3 Round Toward 0 ====================================== ========= ===================================================================== .. 
_Denorm-Mode: - + Denorm Mode amd_float_denorm_mode_t ###################################### @@ -973,7 +973,7 @@ References * `AMD_Southern_Islands_Instruction_Set_Architecture `_ * `ROCR Runtime sources `_ * `amd_hsa_kernel_code.h `_ - * `amd_hsa_queue.h `_ + * `amd_hsa_queue.h `_ * `amd_hsa_signal.h `_ * `amd_hsa_common.h `_ * `PCI Express Atomic Operations `_ diff --git a/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst b/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst index 0ee50d42..52ac1e00 100644 --- a/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst +++ b/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst @@ -61,7 +61,7 @@ Use the following commands: -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" \ ../llvm make - + To build the library bitcodes, clone the amd_stg_open branch of this repository. Run the following commands: @@ -189,7 +189,7 @@ Programmers should consult the HSA Runtime Programmer's Reference Manual for a f Known issues ************** - + * Each HSA process creates an internal DMA queue, but there is a system-wide limit of four DMA queues. When the limit is reached HSA processes will use internal kernels for copies. **Disclaimer** diff --git a/ROCm_Compiler_SDK/ROCm-Native-ISA.rst b/ROCm_Compiler_SDK/ROCm-Native-ISA.rst index e224d4d3..523bd611 100644 --- a/ROCm_Compiler_SDK/ROCm-Native-ISA.rst +++ b/ROCm_Compiler_SDK/ROCm-Native-ISA.rst @@ -90,7 +90,7 @@ GCN Native ISA LLVM Code Generator * :ref:`.amdgpu_metadata` * :ref:`Code Object V3 Example Source Code (-mattr=+code-object-v3)` * :ref:`Additional Documentation` - + .. _Introductio: @@ -130,7 +130,7 @@ Use the clang -target --- option to speci OS Description ============== ========================================================================================== Defaults to the unknown OS. -amdhsa Compute kernels executed on HSA [HSA] compatible runtimes such as AMD’s ROCm [AMD-ROCm]. +amdhsa Compute kernels executed on HSA [HSA] compatible runtimes such as AMD's ROCm [AMD-ROCm]. amdpal Graphic shaders and compute kernels executed on AMD PAL runtime. mesa3d Graphic shaders and compute kernels executed on Mesa 3D runtime. ============== ========================================================================================== @@ -343,10 +343,10 @@ Use the clang -mcpu option to specify the AMD GPU processor. The nam | | | | | cumode | | | | | | | | [off] | | | +-----------+-------------+--------------+-------+-----------------+---------+----------------------+ - -.. _Target Features: - + +.. _Target Features: + Target Features ----------------- @@ -362,32 +362,32 @@ For example: Enable the xnack feature. -mno-xnack Disable the xnack feature. - - **AMDGPU Target Features** + + **AMDGPU Target Features** ================= ============================================================================ Target Feature Description ================= ============================================================================ - -m[no-]xnack Enable/disable generating code that has memory clauses that are compatible + -m[no-]xnack Enable/disable generating code that has memory clauses that are compatible with having XNACK replay enabled. This is used for demand paging and page migration. If XNACK replay is - enabled in the device, then if a page fault occurs the code may execute + enabled in the device, then if a page fault occurs the code may execute incorrectly if the xnack feature is not enabled. 
Executing code that has the feature enabled on a device that does not have XNACK replay enabled will - execute correctly, but may be less performant than code with the feature + execute correctly, but may be less performant than code with the feature disabled. -m[no-]sram-ecc Enable/disable generating code that assumes SRAM ECC is enabled/disabled. -m[no-]wavefront size64 Control the default wavefront size used when generating code for kernels. - When disabled native wavefront size 32 is used, when enabled wavefront + When disabled native wavefront size 32 is used, when enabled wavefront size 64 is used. - -m[no-]cumode Control the default wavefront execution mode used when generating code + -m[no-]cumode Control the default wavefront execution mode used when generating code for kernels. When disabled native WGP wavefront execution mode is used, when enabled CU wavefront execution mode is used (see Memory Model). -================= ============================================================================ - - +================= ============================================================================ + + .. _Address-Spaces: @@ -402,9 +402,9 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR). **Address Space Mapping** -====================== =================== +====================== =================== LLVM Address Space Memory Space -====================== =================== +====================== =================== 0 Generic (Flat) 1 Global 2 Region (GDS) @@ -414,9 +414,9 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR). 6 Constant 32-bit 7 Buffer Fat Pointer (experimental) -====================== =================== +====================== =================== -The buffer fat pointer is an experimental address space that is currently unsupported in the backend. It exposes a non-integral pointer that is in future intended to support the modelling of 128-bit buffer descriptors + a 32-bit offset into the buffer descriptor (in total encapsulating a 160-bit ‘pointer’), allowing us to use normal LLVM load/store/atomic operations to model the buffer descriptors used heavily in graphics workloads targeting the backend. +The buffer fat pointer is an experimental address space that is currently unsupported in the backend. It exposes a non-integral pointer that is in future intended to support the modelling of 128-bit buffer descriptors + a 32-bit offset into the buffer descriptor (in total encapsulating a 160-bit 'pointer'), allowing us to use normal LLVM load/store/atomic operations to model the buffer descriptors used heavily in graphics workloads targeting the backend. .. _Memory-Scopes: @@ -429,33 +429,33 @@ The memory model supported is based on the HSA memory model which is based in t This is different to the OpenCL memory model which does not have scope inclusion and requires the memory scopes to exactly match. However, this is conservatively correct for OpenCL. 
- **AMDHSA LLVM Sync Scopes** -================ ================================================================================================================= + **AMDHSA LLVM Sync Scopes** +================ ================================================================================================================= LLVM Sync Scope Description -================ ================================================================================================================= +================ ================================================================================================================= none The default: system. - Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: + Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system. * agent and executed by a thread on the same agent. * workgroup and executed by a thread in the same workgroup. * wavefront and executed by a thread in the same wavefront. -agent Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: +agent Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system or agent and executed by a thread on the same agent. * workgroup and executed by a thread in the same workgroup. * wavefront and executed by a thread in the same wavefront. -workgroup Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: +workgroup Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system, agent or workgroup and executed by a thread in the same workgroup. * wavefront and executed by a thread in the same wavefront. -wavefront Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: +wavefront Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system, agent, workgroup or wavefront and executed by a thread in the same wavefront. singlethread Only synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) running in the same thread for all address spaces (for example, in signal handlers). 
one-as Same as system but only synchronizes with other operations within the same address space -================ ================================================================================================================= +================ ================================================================================================================= .. _AMDGPU-Intrinsics: @@ -474,22 +474,22 @@ AMDGPU Attributes The AMDGPU backend supports the following LLVM IR attributes. - **AMDGPU LLVM IR Attributes** + **AMDGPU LLVM IR Attributes** ============================================ ============================================================================================= LLVM Attribute Description ============================================ ============================================================================================= -“amdgpu-flat-work-group-size”=”min,max” Specify the minimum and maximum flat work group sizes that will be specified - when the kernel is dispatched. Generated by the amdgpu_flat_work_group_size +"amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that will be specified + when the kernel is dispatched. Generated by the amdgpu_flat_work_group_size CLANG attribute. -“amdgpu-implicitarg-num-bytes”=”n” Number of kernel argument bytes to add to the kernel argument block size +"amdgpu-implicitarg-num-bytes"="n" Number of kernel argument bytes to add to the kernel argument block size for the implicit arguments. This varies by OS and language -“amdgpu-num-sgpr”=”n” Specifies the number of SGPRs to use. Generated by the amdgpu_num_sgpr CLANG attribute -“amdgpu-num-vgpr”=”n” Specifies the number of VGPRs to use. Generated by the amdgpu_num_vgpr CLANG attribute -“amdgpu-waves-per-eu”=”m,n” Specify the minimum and maximum number of waves per execution unit. +"amdgpu-num-sgpr"="n" Specifies the number of SGPRs to use. Generated by the amdgpu_num_sgpr CLANG attribute +"amdgpu-num-vgpr"="n" Specifies the number of VGPRs to use. Generated by the amdgpu_num_vgpr CLANG attribute +"amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per execution unit. Generated by the amdgpu_waves_per_eu CLANG attribute -“amdgpu-ieee” true/false. Specify whether the function expects the IEEE field of the mode register to +"amdgpu-ieee" true/false. Specify whether the function expects the IEEE field of the mode register to be set on entry. Overrides the default for the calling convention. -“amdgpu-dx10-clamp” true/false. Specify whether the function expects the DX10_CLAMP field of the mode +"amdgpu-dx10-clamp" true/false. Specify whether the function expects the DX10_CLAMP field of the mode register to be set on entry. Overrides the default for the calling convention. 
============================================ ============================================================================================= @@ -522,7 +522,7 @@ The AMDGPU backend uses the following ELF header: =========================== =================================== **AMDGPU ELF Header Enumeration Values** - + ========================== =============== Name Value ========================== =============== @@ -569,8 +569,8 @@ Sections An AMDGPU target ELF code object has the standard ELF sections which include: - **AMDGPU ELF Sections** - + **AMDGPU ELF Sections** + =============== ================ ==================================== Name Type Attributes =============== ================ ==================================== @@ -605,7 +605,7 @@ These sections have their standard meanings and are only generated if needed. .relaname, .rela.dyn For relocatable code objects, name is the name of the section that the relocation records apply. For example, .rela.text is the section name for relocation records associated with the .text section. - For linked shared code objects, .rela.dyn contains all the relocation records from each of the relocatable code object’s .relaname sections. + For linked shared code objects, .rela.dyn contains all the relocation records from each of the relocatable code object's .relaname sections. See Relocation Records for the relocation records supported by the AMDGPU backend. .text @@ -618,18 +618,18 @@ Note Records As required by ELFCLASS64, minimal zero byte padding must be generated after the name field to ensure the desc field is 4 byte aligned. In addition, minimal zero byte padding must be generated to ensure the desc field size is a multiple of 4 bytes. The sh_addralign field of the .note section must be at least 4 to indicate at least 8 byte alignment. -The AMDGPU backend code object uses the following ELF note records in the .note section. The Description column specifies the layout of the note record’s desc field. All fields are consecutive bytes. Note records with variable size strings have a corresponding *_size field that specifies the number of bytes, including the terminating null character, in the string. The string(s) come immediately after the preceding fields. +The AMDGPU backend code object uses the following ELF note records in the .note section. The Description column specifies the layout of the note record's desc field. All fields are consecutive bytes. Note records with variable size strings have a corresponding *_size field that specifies the number of bytes, including the terminating null character, in the string. The string(s) come immediately after the preceding fields. Additional note records can be present. **AMDGPU ELF Note Records** - -================ ============================== ========================================== + +================ ============================== ========================================== Name Type Description -================ ============================== ========================================== - “AMD” NT_AMD_AMDGPU_HSA_METADATA - “AMD” NT_AMD_AMDGPU_ISA -================ ============================== ========================================== +================ ============================== ========================================== + "AMD" NT_AMD_AMDGPU_HSA_METADATA + "AMD" NT_AMD_AMDGPU_ISA +================ ============================== ========================================== @@ -637,7 +637,7 @@ Additional note records can be present. 
**AMDGPU ELF Note Record Enumeration Values** ============================= ================== Name Value -============================= ================== +============================= ================== reserved 0-9 NT_AMD_AMDGPU_HSA_METADATA 10 NT_AMD_AMDGPU_ISA 11 @@ -658,7 +658,7 @@ NT_AMD_AMDGPU_ISA where: ``architecture`` - The architecture from table AMDGPU Target Triples. + The architecture from table AMDGPU Target Triples. This is always amdgcn when the target triple OS is amdhsa (see Target Triples). ``vendor`` @@ -667,22 +667,22 @@ NT_AMD_AMDGPU_ISA ``OS`` The OS from table AMDGPU Target Triples. - + ``environment`` An environment from table AMDGPU Target Triples, or blank if the environment has no affect on the execution of the code object. For the AMDGPU backend this is currently always blank. - + ``processor`` The processor from table AMDGPU Processors. For example:: - + amdgcn-amd-amdhsa--gfx901 ``NT_AMD_AMDGPU_HSA_METADATA`` - Specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD’s ROCm [AMD-ROCm]. It is required when the target triple OS is amdhsa (see Target Triples). See Code Object Metadata for the syntax of the code object metadata string. + Specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD's ROCm [AMD-ROCm]. It is required when the target triple OS is amdhsa (see Target Triples). See Code Object Metadata for the syntax of the code object metadata string. .. _Symbols: @@ -692,7 +692,7 @@ Symbols Symbols include the following: **AMDGPU ELF Symbols** - + +----------------+------------+-----------+--------------------+ | Name | Type | Section | Description | +================+============+===========+====================+ @@ -744,7 +744,7 @@ Following notations are used for specifying relocation calculations: **A** Represents the addend used to compute the value of the relocatable field. **G** - Represents the offset into the global offset table at which the relocation entry’s symbol will reside during execution. + Represents the offset into the global offset table at which the relocation entry's symbol will reside during execution. **GOT** Represents the address of the global offset table. **P** @@ -784,7 +784,7 @@ The following relocation types are supported: | R_AMDGPU_REL32_HI | 11 | word32 | (S + A - P) >> 32 | +------------------------+-------+--------+--------------------------------+ - + .. _DWARF: DWARF @@ -798,7 +798,7 @@ Address Space Mapping The following address space mapping is used: AMDGPU DWARF Address Space Mapping -======================== ======================== +======================== ======================== DWARF Address Space Memory Space ======================== ======================== 1 Private (Scratch) @@ -847,7 +847,7 @@ This section provides code conventions used when the target triple OS is amdhsa Code Object Metadata +++++++++++++++++++++ -The code object metadata specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD’s ROCm [AMD-ROCm]. It is specified by the NT_AMD_AMDGPU_HSA_METADATA note record (see Note Records) and is required when the target triple OS is amdhsa (see Target Triples). It must contain the minimum information necessary to support the ROCM kernel queries. For example, the segment sizes needed in a dispatch packet. In addition, a high level language runtime may require other information to be included. 
For example, the AMD OpenCL runtime records kernel argument information. +The code object metadata specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD's ROCm [AMD-ROCm]. It is specified by the NT_AMD_AMDGPU_HSA_METADATA note record (see Note Records) and is required when the target triple OS is amdhsa (see Target Triples). It must contain the minimum information necessary to support the ROCM kernel queries. For example, the segment sizes needed in a dispatch packet. In addition, a high level language runtime may require other information to be included. For example, the AMD OpenCL runtime records kernel argument information. The metadata is specified as a YAML formatted string (see [YAML] and YAML I/O). @@ -855,19 +855,19 @@ The metadata is represented as a single YAML document comprised of the mapping d For boolean values, the string values of false and true are used for false and true respectively. -Additional information can be added to the mappings. To avoid conflicts, any non-AMD key names should be prefixed by “vendor-name.”. +Additional information can be added to the mappings. To avoid conflicts, any non-AMD key names should be prefixed by "vendor-name.". AMDHSA Code Object Metadata Mapping +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ | String Key | Value Type | Required? | Description | +============+========================+===========+================================================================================================================================================+ -| “Version” | sequence of 2 integers | Required | * The first integer is the major version. Currently 1. | +| "Version" | sequence of 2 integers | Required | * The first integer is the major version. Currently 1. | | | | | * The second integer is the minor version. Currently 0. | +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Printf” | sequence of strings | | Each string is encoded information about a printf function call. | +| "Printf" | sequence of strings | | Each string is encoded information about a printf function call. | | | | | The encoded information is organized as fields separated by colon | | | | | | -| | | | (‘:’):ID:N:S[0]:S[1]:...:S[N-1]:FormatString | +| | | | (':'):ID:N:S[0]:S[1]:...:S[N-1]:FormatString | | | | | | | | | | where: | | | | | ID | @@ -880,7 +880,7 @@ Additional information can be added to the mappings. To avoid conflicts, any non | | | | FormatString | | | | | The format string passed to the printf function call. | +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Kernels” | sequence of mapping | Required | Sequence of the mappings for each kernel in the code object. See AMDHSA Code Object Kernel Metadata Mapping for the definition of the mapping. | +| "Kernels" | sequence of mapping | Required | Sequence of the mappings for each kernel in the code object. See AMDHSA Code Object Kernel Metadata Mapping for the definition of the mapping. 
| +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -891,26 +891,26 @@ Additional information can be added to the mappings. To avoid conflicts, any non +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ | String Key | value Type | Required? | Description | +===================+========================+===========+====================================================================================================================================================+ -| “Name” | string | Required | Source name of the kernel. | +| "Name" | string | Required | Source name of the kernel. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “SymbolName” | string | Required | Name of the kernel descriptor ELF symbol. | +| "SymbolName" | string | Required | Name of the kernel descriptor ELF symbol. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Language” | string | | Source language of the kernel. Values include: | -| | | | * “OpenCL C” | -| | | | * “OpenCL C++” | -| | | | * “HCC” | -| | | | * “OpenMP” | +| "Language" | string | | Source language of the kernel. Values include: | +| | | | * "OpenCL C" | +| | | | * "OpenCL C++" | +| | | | * "HCC" | +| | | | * "OpenMP" | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “LanguageVersion” | sequence of 2 integers | | * The first integer is the major version. | +| "LanguageVersion" | sequence of 2 integers | | * The first integer is the major version. | | | | | * The second integer is the minor version. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Attrs” | mapping | | Mapping of kernel attributes. See AMDHSA Code Object Kernel Attribute Metadata Mapping for the mapping definition. | +| "Attrs" | mapping | | Mapping of kernel attributes. See AMDHSA Code Object Kernel Attribute Metadata Mapping for the mapping definition. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Arguments” | sequence of mapping | | Sequence of mappings of the kernel arguments. See AMDHSA Code Object Kernel Argument Metadata Mapping for the definition of the mapping. | +| "Arguments" | sequence of mapping | | Sequence of mappings of the kernel arguments. See AMDHSA Code Object Kernel Argument Metadata Mapping for the definition of the mapping. 
| +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “CodeProps” | mapping | | Mapping of properties related to the kernel code. See AMDHSA Code Object Kernel Code Properties Metadata Mapping for the mapping definition. | +| "CodeProps" | mapping | | Mapping of properties related to the kernel code. See AMDHSA Code Object Kernel Code Properties Metadata Mapping for the mapping definition. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “DebugProps” | mapping | | Mapping of properties related to the kernel debugging. See AMDHSA Code Object Kernel Debug Properties Metadata Mapping for the mapping definition. | +| "DebugProps" | mapping | | Mapping of properties related to the kernel debugging. See AMDHSA Code Object Kernel Debug Properties Metadata Mapping for the mapping definition. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -922,152 +922,152 @@ Additional information can be added to the mappings. To avoid conflicts, any non +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ | String Key | Value Type | Required? | Description | +=====================+========================+===========+=============================================================================+ -| “ReqdWorkGroupSize” | sequence of 3 integers | | The dispatch work-group size X,Y,Z must correspond to the specified values. | +| "ReqdWorkGroupSize" | sequence of 3 integers | | The dispatch work-group size X,Y,Z must correspond to the specified values. | | | | | Corresponds to the OpenCL reqd_work_group_size attribute. | +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ -| “WorkGroupSizeHint” | sequence of 3 integers | | The dispatch work-group size X,Y,Z is likely to be the specified values. | +| "WorkGroupSizeHint" | sequence of 3 integers | | The dispatch work-group size X,Y,Z is likely to be the specified values. | | | | | Corresponds to the OpenCL work_group_size_hint attribute. | +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ -| “VecTypeHint” | string | | The name of a scalar or vector type. | +| "VecTypeHint" | string | | The name of a scalar or vector type. | | | | | Corresponds to the OpenCL vec_type_hint attribute. | +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ - - + + **AMDHSA Code Object Kernel Argument Metadata Mapping** - - + + +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | String Key | Value Type | Required? 
| Description | +=================+============+===========+===================================================================================================================================================================================================================================================================================================================================================+ -| “Name” | string | | Kernel argument name. | +| "Name" | string | | Kernel argument name. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “TypeName” | string | | Kernel argument type name. | +| "TypeName" | string | | Kernel argument type name. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Size” | integer | Required | Kernel argument size in bytes. | +| "Size" | integer | Required | Kernel argument size in bytes. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Align” | integer | Required | Kernel argument alignment in bytes. Must be a power of two. | +| "Align" | integer | Required | Kernel argument alignment in bytes. Must be a power of two. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “ValueKind” | string | Required | Kernel argument kind that specifies how to set up the corresponding argument. Values include : | -| | | | “ByValue” | +| "ValueKind" | string | Required | Kernel argument kind that specifies how to set up the corresponding argument. Values include : | +| | | | "ByValue" | | | | | The argument is copied directly into the kernarg. | -| | | | “GlobalBuffer” | +| | | | "GlobalBuffer" | | | | | A global address space pointer to the buffer data is passed in the kernarg. | -| | | | “DynamicSharedPointer” | +| | | | "DynamicSharedPointer" | | | | | A group address space pointer to dynamically allocated LDS is passed in the kernarg. | -| | | | “Sampler” | +| | | | "Sampler" | | | | | A global address space pointer to a S# is passed in the kernarg. | -| | | | “Image” | +| | | | "Image" | | | | | A global address space pointer to a T# is passed in the kernarg. | -| | | | “Pipe” | +| | | | "Pipe" | | | | | A global address space pointer to an OpenCL pipe is passed in the kernarg. 
| -| | | | “Queue” | +| | | | "Queue" | | | | | A global address space pointer to an OpenCL device enqueue queue is passed in the kernarg. | -| | | | “HiddenGlobalOffsetX” | +| | | | "HiddenGlobalOffsetX" | | | | | The OpenCL grid dispatch global offset for the X dimension is passed in the kernarg. | -| | | | “HiddenGlobalOffsetY” | +| | | | "HiddenGlobalOffsetY" | | | | | The OpenCL grid dispatch global offset for the Y dimension is passed in the kernarg. | -| | | | “HiddenGlobalOffsetZ” | +| | | | "HiddenGlobalOffsetZ" | | | | | The OpenCL grid dispatch global offset for the Z dimension is passed in the kernarg. | -| | | | “HiddenNone” | +| | | | "HiddenNone" | | | | | An argument that is not used by the kernel. Space needs to be left for it, but it does not need to be set up. | -| | | | “HiddenPrintfBuffer” | +| | | | "HiddenPrintfBuffer" | | | | | A global address space pointer to the runtime printf buffer is passed in kernarg. | -| | | | “HiddenDefaultQueue” | +| | | | "HiddenDefaultQueue" | | | | | A global address space pointer to the OpenCL device enqueue queue that should be used by the kernel by default is passed in the kernarg. | -| | | | “HiddenCompletionAction” | +| | | | "HiddenCompletionAction" | | | | | TBD | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “ValueType” | Value Type | Required | Kernel argument value type. Only present if “ValueKind” is “ByValue”. For vector data types, the value is for the element type.Values include: | -| | | | * “Struct” | -| | | | * “I8” | -| | | | * “U8” | -| | | | * “I16” | -| | | | * “U16” | -| | | | * “F16” | -| | | | * “I32” | -| | | | * “U32” | -| | | | * “F32” | -| | | | * “I64” | -| | | | * “U64” | -| | | | * “F64” | +| "ValueType" | Value Type | Required | Kernel argument value type. Only present if "ValueKind" is "ByValue". For vector data types, the value is for the element type.Values include: | +| | | | * "Struct" | +| | | | * "I8" | +| | | | * "U8" | +| | | | * "I16" | +| | | | * "U16" | +| | | | * "F16" | +| | | | * "I32" | +| | | | * "U32" | +| | | | * "F32" | +| | | | * "I64" | +| | | | * "U64" | +| | | | * "F64" | ++-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| "PointeeAlign" | integer | | Alignment in bytes of pointee type for pointer type kernel argument. Must be a power of 2. Only present if "ValueKind" is "DynamicSharedPointer". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “PointeeAlign” | integer | | Alignment in bytes of pointee type for pointer type kernel argument. Must be a power of 2. 
Only present if “ValueKind” is “DynamicSharedPointer”. | +| "AddrSpaceQual" | string | | Kernel argument address space qualifier. Only present if "ValueKind" is "GlobalBuffer" or "DynamicSharedPointer".Values are : | +| | | | * "Private" | +| | | | * "Global" | +| | | | * "Constant" | +| | | | * "Local" | +| | | | * "Generic" | +| | | | * "Region" | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “AddrSpaceQual” | string | | Kernel argument address space qualifier. Only present if “ValueKind” is “GlobalBuffer” or “DynamicSharedPointer”.Values are : | -| | | | * “Private” | -| | | | * “Global” | -| | | | * “Constant” | -| | | | * “Local” | -| | | | * “Generic” | -| | | | * “Region” | +| "AccQual" | string | | Kernel argument access qualifier. Only present if "ValueKind" is "Image" or "Pipe". Values are : | +| | | | * "ReadOnly" | +| | | | * "WriteOnly" | +| | | | * "ReadWrite" | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “AccQual” | string | | Kernel argument access qualifier. Only present if “ValueKind” is “Image” or “Pipe”. Values are : | -| | | | * “ReadOnly” | -| | | | * “WriteOnly” | -| | | | * “ReadWrite” | +| "ActualAcc" | string | | The actual memory accesses performed by the kernel on the kernel argument.Only present if "ValueKind" is "GlobalBuffer", "Image", or "Pipe". This may be more restrictive than indicated by "AccQual" to reflect what the kernel actual does.If not present then the runtime must assume what is implied by "AccQual" and "IsConst". Values are : | +| | | | * "ReadOnly" | +| | | | * "WriteOnly" | +| | | | * "ReadWrite" | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “ActualAcc” | string | | The actual memory accesses performed by the kernel on the kernel argument.Only present if “ValueKind” is “GlobalBuffer”, “Image”, or “Pipe”. This may be more restrictive than indicated by “AccQual” to reflect what the kernel actual does.If not present then the runtime must assume what is implied by “AccQual” and “IsConst”. Values are : | -| | | | * “ReadOnly” | -| | | | * “WriteOnly” | -| | | | * “ReadWrite” | +| "IsConst" | boolean | | Indicates if the kernel argument is const qualified. Only present if "ValueKind" is "GlobalBuffer". 
| +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsConst” | boolean | | Indicates if the kernel argument is const qualified. Only present if “ValueKind” is “GlobalBuffer”. | +| "IsRestrict" | boolean | | Indicates if the kernel argument is restrict qualified. Only present if "ValueKind" is "GlobalBuffer". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsRestrict” | boolean | | Indicates if the kernel argument is restrict qualified. Only present if “ValueKind” is “GlobalBuffer”. | +| "IsVolatile" | boolean | | Indicates if the kernel argument is volatile qualified. Only present if "ValueKind" is "GlobalBuffer". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsVolatile” | boolean | | Indicates if the kernel argument is volatile qualified. Only present if “ValueKind” is “GlobalBuffer”. | +| "IsPipe" | boolean | | Indicates if the kernel argument is pipe qualified. Only present if "ValueKind" is "Pipe". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsPipe” | boolean | | Indicates if the kernel argument is pipe qualified. Only present if “ValueKind” is “Pipe”. | -+-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - + **AMDHSA Code Object Kernel Code Properties Metadata Mapping** - - + + +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ | String Key | Value Type | Required? | Description | +===========================+============+===========+==========================================================================================================================+ -| “KernargSegmentSize” | integer | Required | The size in bytes of the kernarg segment that holds the values of the arguments to the kernel. 
| +| "KernargSegmentSize" | integer | Required | The size in bytes of the kernarg segment that holds the values of the arguments to the kernel. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “GroupSegmentFixedSize” | integer | Required | The amount of group segment memory required by a work-group in bytes. | +| "GroupSegmentFixedSize" | integer | Required | The amount of group segment memory required by a work-group in bytes. | | | | | This does not include any dynamically allocated group segment memory that may be added when the kernel is dispatched. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “PrivateSegmentFixedSize” | integer | Required | The amount of fixed private address space memory required for a work-item in bytes. | +| "PrivateSegmentFixedSize" | integer | Required | The amount of fixed private address space memory required for a work-item in bytes. | | | | | If IsDynamicCallstack is 1 then additional space must be added to this value for the call stack. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “KernargSegmentAlign” | integer | Required | The maximum byte alignment of arguments in the kernarg segment. Must be a power of 2. | +| "KernargSegmentAlign" | integer | Required | The maximum byte alignment of arguments in the kernarg segment. Must be a power of 2. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “WavefrontSize” | integer | Required | Wavefront size. Must be a power of 2. | +| "WavefrontSize" | integer | Required | Wavefront size. Must be a power of 2. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “NumSGPRs” | integer | | Number of scalar registers used by a wavefront for GFX6-GFX9. | +| "NumSGPRs" | integer | | Number of scalar registers used by a wavefront for GFX6-GFX9. | | | | | This includes the special SGPRs for VCC, Flat Scratch (GFX7-GFX9) and XNACK (for GFX8-GFX9). | | | | | It does not include the 16 SGPR added if a trap handler is enabled. It is not rounded up to the allocation granularity. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “NumVGPRs” | integer | | Number of vector registers used by each work-item for GFX6-GFX9 | +| "NumVGPRs" | integer | | Number of vector registers used by each work-item for GFX6-GFX9 | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “MaxFlatWorkgroupSize” | integer | | Maximum flat work-group size supported by the kernel in work-items. | +| "MaxFlatWorkgroupSize" | integer | | Maximum flat work-group size supported by the kernel in work-items. 
| +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “IsDynamicCallStack” | boolean | | Indicates if the generated machine code is using a dynamically sized call stack. | +| "IsDynamicCallStack" | boolean | | Indicates if the generated machine code is using a dynamically sized call stack. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “IsXNACKEnabled” | boolean | | Indicates if the generated machine code is capable of supporting XNACK. | +| "IsXNACKEnabled" | boolean | | Indicates if the generated machine code is capable of supporting XNACK. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ - + **AMDHSA Code Object Kernel Debug Properties Metadata Mapping** - + +-------------------------------------+------------+-----------+-------------+ | String Key | Value Type | Required? | Description | +=====================================+============+===========+=============+ -| “DebuggerABIVersion” | string | | | +| "DebuggerABIVersion" | string | | | ++-------------------------------------+------------+-----------+-------------+ +| "ReservedNumVGPRs" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “ReservedNumVGPRs” | integer | | | +| "ReservedFirstVGPR" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “ReservedFirstVGPR” | integer | | | +| "PrivateSegmentBufferSGPR" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “PrivateSegmentBufferSGPR” | integer | | | +| "WavefrontPrivateSegmentOffsetSGPR" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “WavefrontPrivateSegmentOffsetSGPR” | integer | | | -+-------------------------------------+------------+-----------+-------------+ .. _Kernel Dispatch: @@ -1085,7 +1085,7 @@ To dispatch a kernel the following actions are performed. This can occur in the 1. A pointer to an AQL queue for the kernel agent on which the kernel is to be executed is obtained. 2. A pointer to the kernel descriptor (see Kernel Descriptor) of the kernel to execute is obtained. It must be for a kernel that is contained in a code object that that was loaded by the ROCm runtime on the kernel agent with which the AQL queue is associated. 3. Space is allocated for the kernel arguments using the ROCm runtime allocator for a memory region with the kernarg property for the kernel agent that will execute the kernel. It must be at least 16 byte aligned. - 4. Kernel argument values are assigned to the kernel argument memory allocation. The layout is defined in the HSA Programmer’s Language Reference [HSA]. For AMDGPU the kernel execution directly accesses the kernel argument memory in the same way constant memory is accessed. (Note that the HSA specification allows an implementation to copy the kernel argument contents to another location that is accessed by the kernel.) + 4. Kernel argument values are assigned to the kernel argument memory allocation. The layout is defined in the HSA Programmer's Language Reference [HSA]. 
For AMDGPU the kernel execution directly accesses the kernel argument memory in the same way constant memory is accessed. (Note that the HSA specification allows an implementation to copy the kernel argument contents to another location that is accessed by the kernel.) 5. An AQL kernel dispatch packet is created on the AQL queue. The ROCm runtime api uses 64 bit atomic operations to reserve space in the AQL queue for the packet. The packet must be set up, and the final write must use an atomic store release to set the packet kind to ensure the packet contents are visible to the kernel agent. AQL defines a doorbell signal mechanism to notify the kernel agent that the AQL queue has been updated. These rules, and the layout of the AQL queue and kernel dispatch packet is defined in the HSA System Architecture Specification [HSA]. 6. A kernel dispatch packet includes information about the actual dispatch, such as grid and work-group size, together with information from the code object about the kernel, such as segment sizes. The ROCm runtime queries on the kernel symbol can be used to obtain the code object values which are recorded in the Code Object Metadata. 7. CP executes micro-code and is responsible for detecting and setting up the GPU to execute the wavefronts of a kernel dispatch. @@ -1170,7 +1170,7 @@ Kernel Descriptor for GFX6-GFX9 CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. Kernel Descriptor for GFX6-GFX9 - + +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Bits | Size | Field Name | Description | +=========+==========================+=====================================+================================================================================================================================================================================================================+ @@ -1186,7 +1186,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 127:98 | 30 bits | | Reserved. Must be 0. | +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 191:128 | 8 bytes | kernel_code_entry_byte_offset | Byte offset (possibly negative) from base address of kernel descriptor to kernel’s entry point instruction which must be 256 byte aligned. | +| 191:128 | 8 bytes | kernel_code_entry_byte_offset | Byte offset (possibly negative) from base address of kernel descriptor to kernel's entry point instruction which must be 256 byte aligned. | +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 383:192 | 24 bytes | | Reserved. Must be 0. 
| +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -1225,7 +1225,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. **compute_pgm_rsrc1 for GFX6-GFX9** - + +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Bits | Size | Field Name | Description | +=======+========================+=================================+======================================================================================================================================================================================================================================================================================+ @@ -1267,7 +1267,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. | | | | CP is responsible for filling in ``COMPUTE_PGM_RSRC1.PRIV.`` | +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 21 | 1 bit | enable_dx10_clamp | Wavefront starts execution with DX10 clamp mode enabled. | -| | | | Used by the vector ALU to force DX-10 style treatment of NaN’s (when set, clamp NaN to zero, otherwise pass NaN through). | +| | | | Used by the vector ALU to force DX-10 style treatment of NaN's (when set, clamp NaN to zero, otherwise pass NaN through). | | | | | Used by CP to set up`` COMPUTE_PGM_RSRC1.DX10_CLAMP.`` | +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 22 | 1 bit | debug_mode | Must be 0. | @@ -1289,10 +1289,10 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. 
+-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 32 | **Total size 4 bytes** | | | +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - + **compute_pgm_rsrc2 for GFX6-GFX9** - + +-------+---------------------+-------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Bits | Size | Field Name | Description | +=======+=====================+=================================================+===============================================================================================================================================================================================+ @@ -1362,10 +1362,10 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. +-------+---------------------+-------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 32 | Total size 4 bytes. | | | +-------+---------------------+-------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - - - Floating Point Rounding Mode Enumeration Values - + + + Floating Point Rounding Mode Enumeration Values + +-------------------------------------+-------+------------------------+ | Enumeration Name | Value | Description | +=====================================+=======+========================+ @@ -1378,7 +1378,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. | AMD_FLOAT_ROUND_MODE_ZERO | 3 | Round Toward 0 | +-------------------------------------+-------+------------------------+ - Floating Point Denorm Mode + Floating Point Denorm Mode +-------------------------------------+-------+--------------------------------------+ | Enumeration Values Enumeration Name | Value | Description | +=====================================+=======+======================================+ @@ -1392,8 +1392,8 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. +-------------------------------------+-------+--------------------------------------+ - System VGPR Work-Item ID - + System VGPR Work-Item ID + +---------------------------------------+-------+-----------------------------------------+ | Enumeration Values Enumeration Name | Value | Description | +=======================================+=======+=========================================+ @@ -1447,13 +1447,13 @@ SGPR register initial state is defined in SGPR Register Set Up Order. 
| | | | FLAT_SCRATCH_HI corresponds to SGPRn-4 on GFX7, and SGPRn-6 on GFX8 (where SGPRn is the highest numbered SGPR allocated to the wave). | | | | | FLAT_SCRATCH_HI is multiplied by 256 (as it is in units of 256 bytes) and added to SH_HIDDEN_PRIVATE_BASE_VIMID to calculate the per wave FLAT SCRATCH BASE in flat memory instructions that access the scratch apperture. | | | | | | -| | | | The second SGPR is 32 bit byte size of a single work-item’s scratch memory usage. | -| | | | CP obtains this from the runtime, and it is always a multiple of DWORD. CP checks that the value in the kernel dispatch packet Private Segment Byte Size is not larger, and requests the runtime to increase the queue’s scratch size if necessary. | +| | | | The second SGPR is 32 bit byte size of a single work-item's scratch memory usage. | +| | | | CP obtains this from the runtime, and it is always a multiple of DWORD. CP checks that the value in the kernel dispatch packet Private Segment Byte Size is not larger, and requests the runtime to increase the queue's scratch size if necessary. | | | | | The kernel code must move it to FLAT_SCRATCH_LO which is SGPRn-3 on GFX7 and SGPRn-5 on GFX8. FLAT_SCRATCH_LO is used as the FLAT SCRATCH SIZE in flat memory instructions. | | | | | Having CP load it once avoids loading it at the beginning of every wavefront. GFX9 This is the 64 bit base address of the per SPI scratch backing memory managed by SPI for the queue executing the kernel dispatch. CP obtains this from the runtime | | | | | (and divides it if there are multiple Shader Arrays each with its own SPI). | | | | | The value of Scratch Wave Offset must be added by the kernel machine code and the result moved to the FLAT_SCRATCH SGPR which is SGPRn-6 and SGPRn-5. | -| | | | It is used as the FLAT SCRATCH BASE in flat memory instructions. then Private Segment Size 1 The 32 bit byte size of a (enable_sgpr_private single work-item’s scratch_segment_size) memory allocation. | +| | | | It is used as the FLAT SCRATCH BASE in flat memory instructions. then Private Segment Size 1 The 32 bit byte size of a (enable_sgpr_private single work-item's scratch_segment_size) memory allocation. | | | | | This is the value from the kernel dispatch packet Private Segment Byte Size rounded up by CP to a multiple of DWORD. | | | | | Having CP load it once avoids loading it at the beginning of every wavefront. | | | | | | @@ -1477,7 +1477,7 @@ SGPR register initial state is defined in SGPR Register Set Up Order. +------------+----------------------------------------------------------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | then | Work-Group Id Z (enable_sgpr_workgroup_id _Z) | 1 | 32 bit work-group id in Z dimension of grid for wavefront. 
| +------------+----------------------------------------------------------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| then | Work-Group Info (enable_sgpr_workgroup _info) | 1 | {first_wave, 14’b0000, ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} | +| then | Work-Group Info (enable_sgpr_workgroup _info) | 1 | {first_wave, 14'b0000, ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} | +------------+----------------------------------------------------------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | then | Scratch Wave Offset (enable_sgpr_private _segment_wave_offset) | 1 | 32 bit byte offset from base of scratch base of queue executing the kernel dispatch. | | | | | Must be used as an offset with Private segment address when using Scratch Segment Buffer. | @@ -1489,7 +1489,7 @@ The order of the VGPR registers is defined, but the compiler can specify which o VGPR register initial state is defined in VGPR Register Set Up Order. VGPR Register Set Up Order - + +------------+----------------------------------------------+-----------------+----------------------------------------------------------------------+ | VGPR Order | Name (kernel descriptor enable field) | Number of VGPRs | Description | +============+==============================================+=================+======================================================================+ @@ -1542,9 +1542,9 @@ If the kernel may use flat operations to access scratch memory, the prolog code GFX6 Flat scratch is not supported. - + GFX7-8 - 1. The low word of Flat Scratch Init is 32 bit byte offset from SH_HIDDEN_PRIVATE_BASE_VIMID to the base of scratch backing memory being managed by SPI for the queue executing the kernel dispatch. This is the same value used in the Scratch Segment Buffer V# base address. The prolog must add the value of Scratch Wave Offset to get the wave’s byte scratch backing memory offset from SH_HIDDEN_PRIVATE_BASE_VIMID. Since FLAT_SCRATCH_LO is in units of 256 bytes, the offset must be right shifted by 8 before moving into FLAT_SCRATCH_LO. + 1. The low word of Flat Scratch Init is 32 bit byte offset from SH_HIDDEN_PRIVATE_BASE_VIMID to the base of scratch backing memory being managed by SPI for the queue executing the kernel dispatch. This is the same value used in the Scratch Segment Buffer V# base address. The prolog must add the value of Scratch Wave Offset to get the wave's byte scratch backing memory offset from SH_HIDDEN_PRIVATE_BASE_VIMID. Since FLAT_SCRATCH_LO is in units of 256 bytes, the offset must be right shifted by 8 before moving into FLAT_SCRATCH_LO. 2. The second word of Flat Scratch Init is 32 bit byte size of a single work-items scratch memory usage. This is directly loaded from the kernel dispatch packet Private Segment Byte Size and rounded up to a multiple of DWORD. Having CP load it once avoids loading it at the beginning of every wavefront. The prolog must move it to FLAT_SCRATCH_LO for use as FLAT SCRATCH SIZE. 
GFX9 @@ -1953,7 +1953,7 @@ On dGPU the kernarg backing memory is accessed as UC (uncached) to avoid needing The memory order also adds the single thread optimization constrains defined in table AMDHSA Memory Model Single Thread Optimization Constraints GFX6-GFX9. AMDHSA Memory Model Single Thread Optimization Constraints GFX6-GFX9 - + +-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | LLVM Memory | Optimization Constraints | +=============+============================================================================================================================================================================================+ @@ -2000,8 +2000,8 @@ For code objects generated by AMDGPU backend for HSA [HSA] compatible runtimes ( | | | queue_ptr | | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | llvm.debugtrap | s_trap 0x03 | | If debugger not installed then behaves as a no-operation. The trap handler is entered and immediately returns to continue execution of the wavefront. | -| | | | If the debugger is installed, causes the debug trap to be reported by the debugger and the wavefront is put in the halt state until resumed by debugger. | -+---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ +| | | | If the debugger is installed, causes the debug trap to be reported by the debugger and the wavefront is put in the halt state until resumed by debugger. 
| ++---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0x04 | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0x05 | | Reserved | @@ -2012,7 +2012,7 @@ For code objects generated by AMDGPU backend for HSA [HSA] compatible runtimes ( +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0x08 | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ -| reserved | s_trap 0xfe | | Reserved | +| reserved | s_trap 0xfe | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0xff | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -2041,17 +2041,17 @@ Note that there are always 10 available user data entries in registers - entries **PAL Compute Shader User Data Registers** - ================ ===================================================== - User Register Description - ================ ===================================================== + ================ ===================================================== + User Register Description + ================ ===================================================== 0 Global Internal Table (32-bit pointer) 1 Per-Shader Internal Table (32-bit pointer) 2 - 11 Application-Controlled User Data (10 32-bit values) 12 Spill Table (32-bit pointer) 13 - 14 Thread Group Count (64-bit pointer) - 15 GDS Range - ================ ===================================================== - + 15 GDS Range + ================ ===================================================== + .. _Graphics-User-Data: Graphics User Data @@ -2108,7 +2108,7 @@ The following table illustrates the required format: 52 vaRange::ShadowDescriptorTable High Bits ========= ============================================== -The pointer to the global internal table passed to the shader as user data is a 32-bit pointer. The top 32 bits should be assumed to be the same as the top 32 bits of the pipeline, so the shader may use the program counter’s top 32 bits. +The pointer to the global internal table passed to the shader as user data is a 32-bit pointer. The top 32 bits should be assumed to be the same as the top 32 bits of the pipeline, so the shader may use the program counter's top 32 bits. .. _Unspecified OS: @@ -2218,7 +2218,7 @@ The following syntax for register operands is supported: * Register pairs, quads, etc: s[2:3], v[10:11], ttmp[5:6], s[4:7], v[12:15], ttmp[4:7], s[8:15], ... 
* Register lists: [s0, s1], [ttmp0, ttmp1, ttmp2, ttmp3] * Register index expressions: v[2*2], s[1-1:2-1] - * ‘off’ indicates that an operand is not enabled + * 'off' indicates that an operand is not enabled The following extra operands are supported: @@ -2258,29 +2258,29 @@ DS *** :: - + ds_add_u32 v2, v4 offset:16 ds_write_src2_b64 v2 offset0:4 offset1:8 - ds_cmpst_f32 v2, v4, v6 + ds_cmpst_f32 v2, v4, v6 ds_min_rtn_f64 v[8:9], v2, v[4:5] - -For full list of supported instructions, refer to “LDS/GDS instructions” in ISA Manual. + +For full list of supported instructions, refer to "LDS/GDS instructions" in ISA Manual. .. _FLAT: FLAT ***** :: - + flat_load_dword v1, v[3:4] flat_store_dwordx3 v[3:4], v[5:7] flat_atomic_swap v1, v[3:4], v5 glc flat_atomic_cmpswap v1, v[3:4], v[5:6] glc slc flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc - -For full list of supported instructions, refer to “FLAT instructions” in ISA Manual. + +For full list of supported instructions, refer to "FLAT instructions" in ISA Manual. .. _MUBUF: @@ -2288,35 +2288,35 @@ For full list of supported instructions, refer to “FLAT instructions” in ISA MUBUF ****** :: - + buffer_load_dword v1, off, s[4:7], s1 buffer_store_dwordx4 v[1:4], v2, ttmp[4:7], s1 offen offset:4 glc tfe buffer_store_format_xy v[1:2], off, s[4:7], s1 buffer_wbinvl1 buffer_atomic_inc v1, v2, s[8:11], s4 idxen offset:4 slc -For full list of supported instructions, refer to “MUBUF Instructions” in ISA Manual. +For full list of supported instructions, refer to "MUBUF Instructions" in ISA Manual. .. _SMRD/SMEM: SMRD/SMEM ********** :: - + s_load_dword s1, s[2:3], 0xfc s_load_dwordx8 s[8:15], s[2:3], s4 s_load_dwordx16 s[88:103], s[2:3], s4 s_dcache_inv_vol s_memtime s[4:5] -For full list of supported instructions, refer to “Scalar Memory Operations” in ISA Manual. +For full list of supported instructions, refer to "Scalar Memory Operations" in ISA Manual. .. _SOP1: SOP1 ***** :: - + s_mov_b32 s1, s2 s_mov_b64 s[0:1], 0x80000000 s_cmov_b32 s1, 200 @@ -2325,14 +2325,14 @@ SOP1 s_swappc_b64 s[2:3], s[4:5] s_cbranch_join s[4:5] -For full list of supported instructions, refer to “SOP1 Instructions” in ISA Manual. +For full list of supported instructions, refer to "SOP1 Instructions" in ISA Manual. .. _SOP2: SOP2 ***** :: - + s_add_u32 s1, s2, s3 s_and_b64 s[2:3], s[4:5], s[6:7] s_cselect_b32 s1, s2, s3 @@ -2342,28 +2342,28 @@ SOP2 s_bfm_b64 s[2:3], s4, s6 s_bfe_i64 s[2:3], s[4:5], s6 s_cbranch_g_fork s[4:5], s[6:7] - -For full list of supported instructions, refer to “SOP2 Instructions” in ISA Manual. + +For full list of supported instructions, refer to "SOP2 Instructions" in ISA Manual. .. _SOPC: SOPC ***** :: - + s_cmp_eq_i32 s1, s2 s_bitcmp1_b32 s1, s2 s_bitcmp0_b64 s[2:3], s4 s_setvskip s3, s5 - -For full list of supported instructions, refer to “SOPC Instructions” in ISA Manual. + +For full list of supported instructions, refer to "SOPC Instructions" in ISA Manual. .. _SOPP: SOPP ***** :: - + s_barrier s_nop 2 s_endpgm @@ -2375,8 +2375,8 @@ SOPP s_sendmsg 0x1 s_sendmsg sendmsg(MSG_INTERRUPT) s_trap 1 - -For full list of supported instructions, refer to “SOPP Instructions” in ISA Manual. + +For full list of supported instructions, refer to "SOPP Instructions" in ISA Manual. Unless otherwise mentioned, little verification is performed on the operands of SOPP Instructions, so it is up to the programmer to be familiar with the range or acceptable values. 
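Because SOPP operands receive so little checking, it can help to see the raw-immediate and symbolic-helper operand forms side by side. The lines below are an illustrative sketch only, not part of the patched file; they assume the ``vmcnt``/``lgkmcnt`` helpers and ``sendmsg(...)`` helper accepted by the assembler, and use ``;`` as the comment character as in compiler-emitted GCN assembly::

   s_waitcnt 0                          ; raw immediate: wait for every counter to drain
   s_waitcnt vmcnt(0)                   ; symbolic helper for the vector memory counter
   s_waitcnt vmcnt(0) lgkmcnt(0)        ; helpers may be combined in a single operand
   s_sendmsg sendmsg(MSG_INTERRUPT)     ; message helper instead of the raw encoding 0x1

In either form the assembler does not verify that the requested values are meaningful for the selected target, so they remain the programmer's responsibility.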
@@ -2393,7 +2393,7 @@ For vector ALU instruction opcodes (VOP1, VOP2, VOP3, VOPC, VOP_DPP, VOP_SDWA), VOP1/VOP2/VOP3/VOPC examples ***************************** - + :: v_mov_b32 v1, v2 @@ -2411,9 +2411,9 @@ VOP1/VOP2/VOP3/VOPC examples VOP_DPP examples ****************** - + :: - + v_mov_b32 v0, v0 quad_perm:[0,2,1,1] v_sin_f32 v0, v0 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 v_mov_b32 v0, v0 wave_shl:1 @@ -2427,14 +2427,14 @@ VOP_SDWA examples ****************** :: - + v_mov_b32 v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD v_min_u32 v200, v200, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD v_sin_f32 v0, v0 dst_unused:UNUSED_PAD src0_sel:WORD_1 v_fract_f32 v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 v_cmpx_le_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 - -For full list of supported instructions, refer to “Vector ALU instructions”. + +For full list of supported instructions, refer to "Vector ALU instructions". .. _Code Object V2 Predefined Symbols (-mattr=-code-object-v3): @@ -2457,7 +2457,7 @@ The AMDGPU assembler defines and updates some symbols automatically. These symbo .option.machine_version_major ++++++++++++++++++++++++++++++ -Set to the GFX major generation number of the target being assembled for. For example, when assembling for a “GFX9” target this will be set to the integer value “9”. The possible GFX major generation numbers are presented in :ref:`Processors`. +Set to the GFX major generation number of the target being assembled for. For example, when assembling for a "GFX9" target this will be set to the integer value "9". The possible GFX major generation numbers are presented in :ref:`Processors`. .. _.option.machine_version_minor: @@ -2465,10 +2465,10 @@ Set to the GFX major generation number of the target being assembled for. For ex .option.machine_version_minor ++++++++++++++++++++++++++++++ -Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a “GFX810” target this will be set to the integer value “1”. The possible GFX minor generation numbers are presented in :ref:`Processors`. +Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a "GFX810" target this will be set to the integer value "1". The possible GFX minor generation numbers are presented in :ref:`Processors`. .option.machine_version_stepping -Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a “GFX704” target this will be set to the integer value “4”. The possible GFX stepping generation numbers are presented in :ref:`Processors`. +Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a "GFX704" target this will be set to the integer value "4". The possible GFX stepping generation numbers are presented in :ref:`Processors`. .. _.option.machine_version_stepping: @@ -2476,7 +2476,7 @@ Set to the GFX stepping generation number of the target being assembled for. For .option.machine_version_stepping +++++++++++++++++++++++++++++++++ -Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a “GFX704” target this will be set to the integer value “4”. The possible GFX stepping generation numbers are presented in :ref:`Processors`. +Set to the GFX stepping generation number of the target being assembled for. 
For example, when assembling for a "GFX704" target this will be set to the integer value "4". The possible GFX stepping generation numbers are presented in :ref:`Processors`. .. _.kernel.vgpr_count: @@ -2501,7 +2501,7 @@ Code Object V2 Directives (-mattr=-code-object-v3) :: - Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the directives supported + Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the directives supported with the default configuration (Code Object V3) see :ref:`Code Object V3 Directives (-mattr=+code-object-v3)`. AMDGPU ABI defines auxiliary data in output code object. In assembly source, one can specify them with assembler directives. @@ -2520,7 +2520,7 @@ major and minor are integers that specify the version of the HSA code object tha major, minor, and stepping are all integers that describe the instruction set architecture (ISA) version of the assembly program. -vendor and arch are quoted strings. vendor should always be equal to “AMD” and arch should always be equal to “AMDGPU”. +vendor and arch are quoted strings. vendor should always be equal to "AMD" and arch should always be equal to "AMDGPU". By default, the assembler will derive the ISA version, vendor, and arch from the value of the -mcpu option that is passed to the assembler. @@ -2561,7 +2561,7 @@ Code Object V2 Example Source Code (-mattr=-code-object-v3) :: - Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the predefined symbols + Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the predefined symbols available with the default configuration (Code Object V3). Here is an example of a minimal assembly source file, defining one HSA kernel: @@ -2611,21 +2611,21 @@ The AMDGPU assembler defines and updates some symbols automatically. These symbo .amdgcn.gfx_generation_number ++++++++++++++++++++++++++++++ -Set to the GFX major generation number of the target being assembled for. For example, when assembling for a “GFX9” target this will be set to the integer value “9”. The possible GFX major generation numbers are presented in :ref:`Processors`. +Set to the GFX major generation number of the target being assembled for. For example, when assembling for a "GFX9" target this will be set to the integer value "9". The possible GFX major generation numbers are presented in :ref:`Processors`. .. _.amdgcn.gfx_generation_minor: .amdgcn.gfx_generation_minor ++++++++++++++++++++++++++++++ -Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a “GFX810” target this will be set to the integer value “1”. The possible GFX minor generation numbers are presented in :ref:`Processors`. +Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a "GFX810" target this will be set to the integer value "1". The possible GFX minor generation numbers are presented in :ref:`Processors`. .. _.amdgcn.gfx_generation_stepping: .amdgcn.gfx_generation_stepping +++++++++++++++++++++++++++++++++ -Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a “GFX704” target this will be set to the integer value “4”. The possible GFX stepping generation numbers are presented in :ref:`Processors`. 
+Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a "GFX704" target this will be set to the integer value "4". The possible GFX stepping generation numbers are presented in :ref:`Processors`. .. _.amdgcn.next_free_vgpr: @@ -2670,7 +2670,7 @@ Optional directive which declares the target supported by the containing assembl Creates a correctly aligned AMDHSA kernel descriptor and a symbol, .kd, in the current location of the current section. Only valid when the OS is amdhsa. must be a symbol that labels the first instruction to execute, and does not need to be previously defined. -Marks the beginning of a list of directives used to generate the bytes of a kernel descriptor, as described in Kernel Descriptor. Directives which may appear in this list are described in AMDHSA Kernel Assembler Directives. Directives may appear in any order, must be valid for the target being assembled for, and cannot be repeated. Directives support the range of values specified by the field they reference in Kernel Descriptor. If a directive is not specified, it is assumed to have its default value, unless it is marked as “Required”, in which case it is an error to omit the directive. This list of directives is terminated by an .end_amdhsa_kernel directive. +Marks the beginning of a list of directives used to generate the bytes of a kernel descriptor, as described in Kernel Descriptor. Directives which may appear in this list are described in AMDHSA Kernel Assembler Directives. Directives may appear in any order, must be valid for the target being assembled for, and cannot be repeated. Directives support the range of values specified by the field they reference in Kernel Descriptor. If a directive is not specified, it is assumed to have its default value, unless it is marked as "Required", in which case it is an error to omit the directive. This list of directives is terminated by an .end_amdhsa_kernel directive. **AMDHSA Kernel Assembler Directives** @@ -2785,7 +2785,7 @@ If an assembly source file contains multiple kernels and/or functions, the .amdg .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" // optional // gpr tracking symbols are implicitly set to zero - .text + .text .globl kern0 .p2align 8 .type kern0,@function @@ -2860,7 +2860,7 @@ Additional Documentation [AMD-GCN-GFX8] (`1 `_, `2 `_) `AMD GCN3 Instruction Set Architecture `_ -[AMD-GCN-GFX9] (`1 `_, `2 `_) `AMD “Vega” Instruction Set Architecture `_ +[AMD-GCN-GFX9] (`1 `_, `2 `_) `AMD "Vega" Instruction Set Architecture `_ [AMD-ROCm] (`1 `_, `2 `_, `3 `_, `4 `_) `ROCm: Open Platform for Development, Discovery and Education Around GPU Computing `_ @@ -2870,7 +2870,7 @@ Additional Documentation [`DWARF `_] `DWARF Debugging Information Format `_ -[YAML] (`1 `_, `2 `_) `YAML Ain’t Markup Language (YAML™) Version 1.2 `_ +[YAML] (`1 `_, `2 `_) `YAML Ain't Markup Language (YAML(TM)) Version 1.2 `_ [MsgPack] (`1 `_, `2 `_, `3 `_) `Message Pack `_ diff --git a/ROCm_Compiler_SDK/ocml.rst b/ROCm_Compiler_SDK/ocml.rst index 07da9ac9..3deeafd3 100644 --- a/ROCm_Compiler_SDK/ocml.rst +++ b/ROCm_Compiler_SDK/ocml.rst @@ -7,7 +7,7 @@ OCML User Guide ################ What Is OCML ************** -OCML is an LLVM-IR bitcode library designed to relieve language compiler and runtime implementers of the burden of implementing efficient and accurate mathematical functions. 
It is essentially a “libm” in intermediate representation with a fixed, simple API that can be linked in to supply the implementations of most standard low-level mathematical functions provided by the language. +OCML is an LLVM-IR bitcode library designed to relieve language compiler and runtime implementers of the burden of implementing efficient and accurate mathematical functions. It is essentially a "libm" in intermediate representation with a fixed, simple API that can be linked in to supply the implementations of most standard low-level mathematical functions provided by the language. Using OCML *********** @@ -16,11 +16,11 @@ Standard Usage OCML is expected to be used in a standard LLVM compilation flow as follows: * Compile source modules to LLVM-IR bitcode (clang) - * Link program bitcode, “wrapper” bitcode, OCML bitcode, and OCML control functions (llvm-link) + * Link program bitcode, "wrapper" bitcode, OCML bitcode, and OCML control functions (llvm-link) * Generic optimizations (opt) * Code generation (llc) -Here, “wrapper” bitcode denotes a thin library responsible for mapping mangled built-in function calls as produced by clang to the OCML API. An example in C might look like +Here, "wrapper" bitcode denotes a thin library responsible for mapping mangled built-in function calls as produced by clang to the OCML API. An example in C might look like :: @@ -71,9 +71,9 @@ OCML functions follow a simple naming convention: where {function} is generally the familiar libm name of the function, and {type suffix} indicates the type of the floating point arguments or results, and is one of - f16 – 16 bit floating point (half precision) - f32 – 32 bit floating point (single precision) - f64 – 64 bit floating point (double precision) + f16 - 16 bit floating point (half precision) + f32 - 32 bit floating point (single precision) + f64 - 64 bit floating point (double precision) For example, __ocml_sqrt_f32 is the name of the OCML single precision square root function. @@ -82,7 +82,7 @@ OCML does not currently support higher than double precision due to the lack of Supported functions ******************** -The following table contains a list of {function} currently supported by OCML, a brief description of each, and the maximum relative error in ULPs for each floating point type. A “c” in the last 3 columns indicates that the function is required to be correctly rounded. +The following table contains a list of {function} currently supported by OCML, a brief description of each, and the maximum relative error in ULPs for each floating point type. A "c" in the last 3 columns indicates that the function is required to be correctly rounded. +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | {function} | Description | f32 max err | f64 max err | f16 max err | @@ -91,7 +91,7 @@ The following table contains a list of {function} currently supported by OCML, a +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | acosh | arc hyperbolic cosine | 4 | 4 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| acospi | arc cosine / π | 5 | 5 | 2 | +| acospi | arc cosine / ? 
| 5 | 5 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | add_{rm} | add with specific rounding mode | c | c | c | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ @@ -189,9 +189,9 @@ The following table contains a list of {function} currently supported by OCML, a +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | len4 | four argument hypot | 2 | 2 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| lgamma | log Γ function | 6(>0) | 4(>0) | 3(>0) | +| lgamma | log ? function | 6(>0) | 4(>0) | 3(>0) | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| lgamma_r | log Γ function with sign | 6(>0) | 4(>0) | 3(>0) | +| lgamma_r | log ? function with sign | 6(>0) | 4(>0) | 3(>0) | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | log10 | log base 10 | 3 | 3 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ @@ -279,7 +279,7 @@ The following table contains a list of {function} currently supported by OCML, a +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | tanpi | tangent of argument times pi | 6 | 6 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| tgamma | true Γ function | 16 | 16 | 4 | +| tgamma | true ? function | 16 | 16 | 4 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | trunc | round to integer, towards zero | c | c | c | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ @@ -290,8 +290,8 @@ The following table contains a list of {function} currently supported by OCML, a For the functions supporting specific roundings, the rounding mode {rm} can be one of - * rte – round towards nearest even - * rtp – round towards positive infinity - * rtn – round towards negative infinity - * rtz – round towards zero + * rte - round towards nearest even + * rtp - round towards positive infinity + * rtn - round towards negative infinity + * rtz - round towards zero diff --git a/ROCm_Glossary/ROCm-Glossary.rst b/ROCm_Glossary/ROCm-Glossary.rst index 649dc3cf..eb259cbd 100644 --- a/ROCm_Glossary/ROCm-Glossary.rst +++ b/ROCm_Glossary/ROCm-Glossary.rst @@ -30,10 +30,10 @@ PCI Express (PCIe) was developed as the next generation I/O system interconnect A Queue is a runtime-allocated resource that contains a packet buffer and is associated with a packet processor. The packet processor tracks which packets in the buffer have already been processed. When it has been informed by the application that a new packet has been enqueued, the packet processor is able to process it because the packet format is standard and the packet contents are self-contained -- they include all the necessary information to run a command. 
A queue has an associated set of high-level operations defined in "HSA Runtime Specification" (API functions in host code) and "HSA Programmer Reference Manual Specification" (kernel code). **HSA (Heterogeneous System Architecture) :** -HSA provides a unified view of fundamental computing elements. HSA allows a programmer to write applications that seamlessly integrate CPUs (called latency compute units) with GPUs (called throughput compute units), while benefiting from the best attributes ofeach. HSA creates an improved processor design that exposes the benefits and capabilities of mainstream programmable compute elements, working together seamlessly.HSA is all about delivering new, improved user experiences through advances in computing architectures that deliver improvements across all four key vectors: improved power efficiency; improved performance; improved programmability; and broad portability across computing devices.For more on `HSA `_. +HSA provides a unified view of fundamental computing elements. HSA allows a programmer to write applications that seamlessly integrate CPUs (called latency compute units) with GPUs (called throughput compute units), while benefiting from the best attributes of each. HSA creates an improved processor design that exposes the benefits and capabilities of mainstream programmable compute elements, working together seamlessly. HSA is all about delivering new, improved user experiences through advances in computing architectures that deliver improvements across all four key vectors: improved power efficiency; improved performance; improved programmability; and broad portability across computing devices. For more on `HSA `_. **AQL Architectured Queueing Language :** -The Architected Queuing Language (AQL) is a standard binary interface used to describe commands such as a kernel dispatch. An AQL packet is a user-mode buffer with a specific format that encodes one command. AQL allows agents to build and enqueue their own command packets, enabling fast, low-power dispatch. AQL also provides support for kernel agent queue submissions: the kernel agent kernel can write commands in AQL format. +The Architected Queuing Language (AQL) is a standard binary interface used to describe commands such as a kernel dispatch. An AQL packet is a user-mode buffer with a specific format that encodes one command. AQL allows agents to build and enqueue their own command packets, enabling fast, low-power dispatch. AQL also provides support for kernel agent queue submissions: the kernel agent kernel can write commands in AQL format. diff --git a/ROCm_Libraries/ROCm_Libraries.rst b/ROCm_Libraries/ROCm_Libraries.rst index 8f4626da..c11c255c 100644 --- a/ROCm_Libraries/ROCm_Libraries.rst +++ b/ROCm_Libraries/ROCm_Libraries.rst @@ -1269,7 +1269,7 @@ The root of this repository has a helper bash script install.sh to build and ins **Manual build (all supported platforms)** If you use a distro other than Ubuntu, or would like more control over the build process, the hipblas build has helpful information on how to configure cmake and manually build. - + Build ######## @@ -1437,7 +1437,7 @@ Batched and strided GEMM API ------------------------------- hipBLAS GEMM can process matrices in batches with regular strides.
There are several permutations of these API's, the following is an example that takes everything -:: +:: hipblasStatus_t hipblasSgemmStridedBatched( hipblasHandle_t handle, @@ -1570,7 +1570,7 @@ Running Statistical Tests :: # Go to rocRAND build directory - cd rocRAND; cd build + cd rocRAND; cd build # To run "crush" test, which verifies that generated pseudorandom # numbers are of high quality: # engine -> all, xorwow, mrg32k3a, mtgp32, philox @@ -1704,7 +1704,7 @@ The following is a simple example code that shows how to use rocFFT to compute a // Copy result back to host std::vector y(N); hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost); - + // Print results for (size_t i = 0; i < N; i++) { @@ -1810,7 +1810,7 @@ Execution info The execution api :cpp:func:`rocfft_execute` takes a rocfft_execution_info parameter. This parameter needs to be created and setup by the user and passed to the execution api. The execution info handle encapsulates -information such as execution mode, pointer to any work buffer etc. It can also hold information that are +information such as execution mode, pointer to any work buffer etc. It can also hold information that is a side effect of execution such as event objects. The following functions deal with managing execution info object. Note that the *set* functions below need to be called before execution and *get* functions after execution. @@ -2012,7 +2012,7 @@ rocSPARSE with dependencies and client can be built using the following commands -DBUILD_CLIENTS_BENCHMARKS=ON \ -DBUILD_CLIENTS_SAMPLES=ON \ -DBUILD_VERBOSE=OFF \ - -DBUILD_SHARED_LIBS=ON + -DBUILD_SHARED_LIBS=ON # Compile rocSPARSE library make -j$(nproc) @@ -2028,7 +2028,7 @@ Common build problems #. **Issue:** HCC RUNTIME ERROR: Failed to find compatible kernel - **Solution:** Add the following to the cmake command when configuring: -DCMAKE_CXX_FLAGS=”–amdgpu-target=gfx803,gfx900,gfx906,gfx908” + **Solution:** Add the following to the cmake command when configuring: -DCMAKE_CXX_FLAGS="--amdgpu-target=gfx803,gfx900,gfx906,gfx908" #. **Issue:** Could not find a package configuration file provided by "ROCM" with any of the following names: ROCMConfig.cmake |br| @@ -2046,7 +2046,7 @@ You can test the installation by running one of the rocSPARSE examples, after su # Navigate to clients binary directory $ cd rocSPARSE/build/release/clients/staging - + # Execute rocSPARSE example $ ./example_csrmv 1000 @@ -2056,7 +2056,7 @@ Supported Targets Currently, rocSPARSE is supported under the following operating systems - + * Ubuntu 16.04 * Ubuntu 18.04 @@ -2093,7 +2093,7 @@ The above is a HIP (and CUDA) device management approach and has nothing to do w Once users set the device, they create a handle with `rocsparse_create_handle() `_. -Subsequent rocSPARSE routines take this handle as an input parameter. rocSPARSE ONLY queries (by hipGetDevice()) the user’s device; rocSPARSE does NOT set the device for users. If rocSPARSE does not see a valid device, it returns an error message. It is the users’ responsibility to provide a valid device to rocSPARSE and ensure the device safety. +Subsequent rocSPARSE routines take this handle as an input parameter. rocSPARSE ONLY queries (by hipGetDevice()) the user's device; rocSPARSE does NOT set the device for users. If rocSPARSE does not see a valid device, it returns an error message. It is the users' responsibility to provide a valid device to rocSPARSE and ensure the device safety.
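A minimal host-side sketch of this handle lifecycle is shown below (assuming the rocsparse.h and hip_runtime_api.h headers are available and a single GPU is visible as device 0; return-status checks are omitted for brevity):

::

  #include <hip/hip_runtime_api.h>
  #include <rocsparse.h>

  int main()
  {
      // Set the device for this host thread first; the handle is created for it
      hipSetDevice(0);

      // Create the rocSPARSE handle that subsequent routines take as input
      rocsparse_handle handle;
      rocsparse_create_handle(&handle);

      // ... call rocSPARSE routines with this handle ...

      // Destroy the handle before changing devices or exiting
      rocsparse_destroy_handle(handle);
      return 0;
  }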
Users CANNOT switch devices between `rocsparse_create_handle() `_ and `rocsparse_destroy_handle() `_. If users want to change device, they must destroy the current handle and create another rocSPARSE handle. @@ -2693,7 +2693,7 @@ rocsparse_hybmv() :project: rocSPARSE .. doxygenfunction:: rocsparse_chybmv - :project: rocSPARSE + :project: rocSPARSE .. doxygenfunction:: rocsparse_zhybmv :project: rocSPARSE @@ -3033,31 +3033,31 @@ rocSOLVER *************** .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: Introduction ############## -An implementation of Lapack routines on top of AMD’s Radeon Open Compute Platform (ROCm) runtime and toolchains. -rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS -implementation for AMD’s latest discrete GPUs. More information about rocBLAS can be found +An implementation of Lapack routines on top of AMD's Radeon Open Compute Platform (ROCm) runtime and toolchains. +rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS +implementation for AMD's latest discrete GPUs. More information about rocBLAS can be found `here `_. Build and install ################### -rocSOLVER requires `cmake `_ -and `ROCm `_, including -`hip `_ and -`rocBLAS `_, to be installed. +rocSOLVER requires `cmake `_ +and `ROCm `_, including -`hip `_ and +`rocBLAS `_, to be installed. Once these requirements are satisfied, the following instructions will build and install rocSOLVER: .. code-block:: bash - mkdir build && cd build CXX=/opt/rocm/bin/hcc cmake .. make @@ -3066,56 +3066,56 @@ instructions will build and install rocSOLVER: Brief description and functionality ###################################### -rocSolver Library is in early stages of active development. New features and functionality is being continuosly added. New -functionality is documented at each release of the ROCm platform. +The rocSOLVER library is in the early stages of active development. New features and functionality are being continuously added. New +functionality is documented at each release of the ROCm platform. The following table summarizes the LAPACK functionality implemented in rocSOLVER's last release.
=============================== ====== ====== ============== ============== Lapack Auxiliary Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_laswp** x x x x -**rocsolver_larfg** x x +**rocsolver_laswp** x x x x +**rocsolver_larfg** x x **rocsolver_larft** x x **rocsolver_larf** x x -**rocsolver_larfb** x x -**rocsolver_org2r** x x -**rocsolver_orgqr** x x -**rocsolver_orgl2** x x -**rocsolver_orglq** x x -**rocsolver_orgbr** x x -**rocsolver_orm2r** x x -**rocsolver_ormqr** x x +**rocsolver_larfb** x x +**rocsolver_org2r** x x +**rocsolver_orgqr** x x +**rocsolver_orgl2** x x +**rocsolver_orglq** x x +**rocsolver_orgbr** x x +**rocsolver_orm2r** x x +**rocsolver_ormqr** x x =============================== ====== ====== ============== ============== =============================== ====== ====== ============== ============== Lapack Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_potf2** x x -rocsolver_potf2_batched x x -rocsolver_potf2_strided_batched x x -**rocsolver_potrf** x x -rocsolver_potrf_batched x x -rocsolver_potrf_strided_batched x x +**rocsolver_potf2** x x +rocsolver_potf2_batched x x +rocsolver_potf2_strided_batched x x +**rocsolver_potrf** x x +rocsolver_potrf_batched x x +rocsolver_potrf_strided_batched x x **rocsolver_getf2** x x x x rocsolver_getf2_batched x x x x rocsolver_getf2_strided_batched x x x x -**rocsolver_getrf** x x x x +**rocsolver_getrf** x x x x rocsolver_getrf_batched x x x x rocsolver_getrf_strided_batched x x x x -**rocsolver_geqr2** x x +**rocsolver_geqr2** x x rocsolver_geqr2_batched x x rocsolver_geqr2_strided_batched x x -**rocsolver_geqrf** x x -rocsolver_geqrf_batched x x +**rocsolver_geqrf** x x +rocsolver_geqrf_batched x x rocsolver_geqrf_strided_batched x x -**rocsolver_gelq2** x x +**rocsolver_gelq2** x x rocsolver_gelq2_batched x x rocsolver_gelq2_strided_batched x x -**rocsolver_gelqf** x x -rocsolver_gelqf_batched x x +**rocsolver_gelqf** x x +rocsolver_gelqf_batched x x rocsolver_gelqf_strided_batched x x -**rocsolver_getrs** x x x x +**rocsolver_getrs** x x x x rocsolver_getrs_batched x x x x rocsolver_getrs_strided_batched x x x x =============================== ====== ====== ============== ============== @@ -3123,38 +3123,38 @@ rocsolver_getrs_strided_batched x x x x Benchmarking and Testing ########################## -Additionaly, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. +Additionaly, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. -On a normal installation, clients should be located in the directory **/build/clients/staging**. +On a normal installation, clients should be located in the directory **/build/clients/staging**. **rocsolver-test** executes a suite of `Google tests `_ (*gtest*) that verifies the correct -functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by +functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by `NETLib LAPACK `_ on the CPU. Calling the rocSOLVER gtest client with the --help flag .. code-block:: bash - + ./rocsolver-test --help -returns information on different flags that control the behavior of the gtests. 
+returns information on different flags that control the behavior of the gtests. **rocsolver-bench** allows to run any rocSOLVER function with random data of the specified dimensions; it compares the computed results, and provides basic -performance information (as for now, execution times). +performance information (as for now, execution times). -Similarly, +Similarly, .. code-block:: bash - + ./rocsolver-bench --help -returns information on how to use the rocSOLVER benchmark client. - +returns information on how to use the rocSOLVER benchmark client. + rocSOLVER API ############### -This section provides details of the rocSOLVER library API as in release +This section provides details of the rocSOLVER library API as in release `ROCm 2.10 `_. @@ -3162,7 +3162,7 @@ This section provides details of the rocSOLVER library API as in release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -3567,7 +3567,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. rocSOLVER handle auxiliaries @@ -3666,7 +3666,7 @@ The hipSPARSE interface is compatible with rocSPARSE and cuSPARSE-v2 APIs. Porti CSRMV API ########### - + :: hipsparseStatus_t @@ -3903,7 +3903,7 @@ Common build problems ROCBLAS.cmake |br| rocblas-config.cmake - **Solution:** Install `rocBLAS `_ either from source or from 'AMD ROCm repository `_ + **Solution:** Install `rocBLAS `_ either from source or from 'AMD ROCm repository `_ Simple Test ########### @@ -3918,7 +3918,7 @@ You can test the installation by running a CG solver on a Laplace matrix. After ./clients/staging/cg gr_30_30.mtx -For more information regarding rocALUTION library and corresponding API documentation, refer +For more information regarding rocALUTION library and corresponding API documentation, refer `rocALUTION `_ @@ -4891,7 +4891,7 @@ Tensile is a **tool** for creating a benchmark-driven backend library for GEMMs, Overview for creating a custom TensileLib backend library for your application: -1. Install the `PyYAML and cmake dependency`_ (mandatory), ``git clone and cd Tensile`` +1. Install the `PyYAML and cmake dependency`_ (mandatory), ``git clone and cd Tensile`` 2. Create a `benchmark config.yaml`_ file in ``./Tensile/Configs/`` 3. `Run the benchmark`_. After the benchmark is finished. Tensile will dump 4 directories: 1 & 2 is about benchmarking. 3 & 4 is the summarized results from your library (like rocBLAS) viewpoints. @@ -4950,7 +4950,7 @@ Tensile uses an incremental and "programmable" `benchmarking protocol`_. Example Benchmark config.yaml as input file to Tensile ------------------------------------------------------- -:: +:: GlobalParameters: PrintLevel: 1 @@ -5160,18 +5160,18 @@ Each step of the benchmark can override what problem sizes will be benchmarked. 1. [1968] * Benchmark only size 1968; n = 1. - + 2. [16, 1920] * Benchmark sizes 16 to 1968 using the default step size (=16); n = 123. - + 3. [16, 32, 1968] * Benchmark sizes 16 to 1968 using a step size of 32; n = 61. - + 4. [64, 32, 16, 1968] * Benchmark sizes from 64 to 1968 with a step size of 32. Also, increase the step size by 16 each iteration. 
* This causes fewer sizes to be benchmarked when the sizes are large, and more benchmarks where the sizes are small; this is typically desired behavior. * n = 16 (64, 96, 144, 208, 288, 384, 496, 624, 768, 928, 1104, 1296, 1504, 1728, 1968). The stride at the beginning is 32, but the stride at the end is 256. - + 5. 0 * The size of this index is just whatever size index 0 is. For a 3-dimensional ProblemType, this allows benchmarking only a 2- dimensional or 1-dimensional slice of problem sizes. @@ -5255,12 +5255,12 @@ Compilers -------------- * For Tensile_BACKEND = OpenCL1.2 *(untested)* - + * Visual Studio 14 (2015). (VS 2012 may also be supported; c++11 should no longer be required by Tensile. Need to verify.) * GCC 4.8 and above * For Tensile_BACKEND = HIP - + * Public ROCm @@ -5273,7 +5273,7 @@ Tensile can be installed via: 1. Download repo and don't install; install PyYAML dependency manually and call python scripts manually: :: - + git clone https://github.com/ROCmSoftwarePlatform/Tensile.git python Tensile/Tensile/Tensile.py your_custom_config.yaml your_benchmark_path @@ -5329,7 +5329,7 @@ The kernel parameters affect many aspects of performance. Changing a parameter m .. image:: img1.png :align: center - + How N-Dimensional Tensor Contractions Are Mapped to Finite-Dimensional GPU Kernels -------------------------------------------------------------------------------------- @@ -5372,7 +5372,7 @@ The device languages Tensile supports for the gpu kernels is * HIP * Assembly - * gfx803 + * gfx803 * gfx900 Library Logic @@ -5455,7 +5455,7 @@ After running the `benchmark`_ and generating `library config files`_, you're re ) target_link_libraries( TARGET Tensile ) -TODO: Where is the Tensile include directory? +TODO: Where is the Tensile include directory? .. _benchmark: https://rocm-documentation.readthedocs.io/en/latest/ROCm_Libraries/ROCm_Libraries.html#id39 .. _library config files: https://rocm-documentation.readthedocs.io/en/latest/ROCm_Libraries/ROCm_Libraries.html#id46 @@ -5668,7 +5668,7 @@ In order to verify the build and capability of ROCm SMI on your system and to se $ cmake -DROCM_DIR= /tests/rocm_smi_test $ make -To run the test, execute the program rsmitst that is built from the steps above. +To run the test, execute the program rsmitst that is built from the steps above. Usage Basics ############## @@ -5691,25 +5691,25 @@ A simple "Hello World" type program that displays the device ID of detected devi #include #include "rocm_smi/rocm_smi.h" int main() { - rsmi_status_t ret; - uint32_t num_devices; - uint64_t dev_id; - - // We will skip return code checks for this example, but it + rsmi_status_t ret; + uint32_t num_devices; + uint64_t dev_id; + + // We will skip return code checks for this example, but it // is recommended to always check this as some calls may not // apply for some devices or ROCm releases - + ret = rsmi_init(0); ret = rsmi_num_monitor_devices(&num_devices); - + for (int i=0; i < num_devices; ++i) { ret = rsmi_dev_id_get(i, &dev_id); // dev_id holds the device ID of device i, upon a - // successful call - } + // successful call + } ret = rsmi_shut_down(); return 0; - } + } ***** RCCL @@ -5761,7 +5761,7 @@ To build the library : $ cd rccl $ mkdir build $ cd build - $ CXX=/opt/rocm/bin/hcc cmake + $ CXX=/opt/rocm/bin/hcc cmake $ make -j 8 @@ -5769,7 +5769,7 @@ To build the library : You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, :: - + apt install rocm-cmake. 
@@ -5867,7 +5867,7 @@ Build And Install # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the HCC compiler. # [CXX=hcc] cmake ../. # or cmake-gui ../. - + # Build make -j4 @@ -5886,7 +5886,7 @@ Using hipCUB In A Project ########################### Recommended way of including hipCUB into a CMake project is by using its package configuration files. - + :: # On ROCm hipCUB requires rocPRIM @@ -5997,7 +5997,7 @@ First create a build directory: :: - mkdir build; + mkdir build; cd build; @@ -6119,7 +6119,7 @@ Deprecated Libraries hCRNG ###### -hCRNG has been **deprecated** and has been replaced by `rocRAND `_ +hCRNG has been **deprecated** and has been replaced by `rocRAND `_ The hcRNG library is an implementation of uniform random number generators targeting the AMD heterogeneous hardware via HCC compiler runtime. The computational resources of underlying AMD heterogenous compute gets exposed and exploited through the HCC C++ frontend. Refer `here `_ for more details on HCC compiler. @@ -6145,7 +6145,7 @@ For more information, please refer :ref:`CLFF` clBLAS ######## -This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. +This repository houses the code for the OpenCL(TM) BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. @@ -6154,18 +6154,18 @@ For more information, please refer :ref:`CLB` clSPARSE ######### - -an OpenCL™ library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. + +an OpenCL(TM) library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. For more information, please refer :ref:`CLS` clRNG ######## - + A library for uniform random number generation in OpenCL. -Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4×32-10 generators. 
+Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4x32-10 generators. For more information, please refer :ref:`CLR` diff --git a/ROCm_Libraries/dep-lib.rst b/ROCm_Libraries/dep-lib.rst index b8fb719d..39e6623e 100644 --- a/ROCm_Libraries/dep-lib.rst +++ b/ROCm_Libraries/dep-lib.rst @@ -4,7 +4,7 @@ hcRNG ********** -hCRNG has been deprecated and has been replaced by `rocRAND `_ +hCRNG has been deprecated and has been replaced by `rocRAND `_ ################################################################################################################# Introduction @@ -30,7 +30,7 @@ file: Randomarray.cpp :: #!c++ - + //This example is a simple random array generation and it compares host output with device output //Random number generator Mrg31k3p #include @@ -43,7 +43,7 @@ file: Randomarray.cpp #include #include using namespace hc; - + int main() { hcrngStatus status = HCRNG_SUCCESS; @@ -53,7 +53,7 @@ file: Randomarray.cpp size_t streamCount = 10; //Number of random numbers to be generated //numberCount must be a multiple of streamCount - size_t numberCount = 100; + size_t numberCount = 100; //Enumerate the list of accelerators std::vectoracc = hc::accelerator::get_all(); accelerator_view accl_view = (acc[1].create_view()); @@ -61,21 +61,21 @@ file: Randomarray.cpp float *Random1 = (float*) malloc(sizeof(float) * numberCount); float *Random2 = (float*) malloc(sizeof(float) * numberCount); float *outBufferDevice = hc::am_alloc(sizeof(float) * numberCount, acc[1], 0); - + //Create streams hcrngMrg31k3pStream *streams = hcrngMrg31k3pCreateStreams(NULL, streamCount, &streamBufferSize, NULL); hcrngMrg31k3pStream *streams_buffer = hc::am_alloc(sizeof(hcrngMrg31k3pStream) * streamCount, acc[1], 0); accl_view.copy(streams, streams_buffer, streamCount* sizeof(hcrngMrg31k3pStream)); - - //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) + + //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) status = hcrngMrg31k3pDeviceRandomU01Array_single(accl_view, streamCount, streams_buffer, numberCount, outBufferDevice); - + if(status) std::cout << "TEST FAILED" << std::endl; accl_view.copy(outBufferDevice, Random1, numberCount * sizeof(float)); - + //Invoke random number generators in host for (size_t i = 0; i < numberCount; i++) - Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); + Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); // Compare host and device outputs for(int i =0; i < numberCount; i++) { if (Random1[i] != Random2[i]) { @@ -87,7 +87,7 @@ file: Randomarray.cpp continue; } if(!ispassed) std::cout << "TEST FAILED" << std::endl; - + //Free host resources free(Random1); free(Random2); @@ -95,8 +95,8 @@ file: Randomarray.cpp hc::am_free(outBufferDevice); hc::am_free(streams_buffer); return 0; - } - + } + * Compiling the example code: @@ -141,8 +141,8 @@ and **Reboot the system** Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: :: - cd /opt/rocm/hsa/sample - make + cd /opt/rocm/hsa/sample + make ./vector_copy **Library 
Installation** @@ -150,14 +150,14 @@ Once Reboot, to verify that the ROCm stack completed successfully you can execut **a. Install using Prebuilt debian** :: - + wget https://github.com/ROCmSoftwarePlatform/hcRNG/blob/master/pre-builds/hcrng-master-184472e-Linux.deb sudo dpkg -i hcrng-master-184472e-Linux.deb **b. Build debian from source** :: - + git clone https://github.com/ROCmSoftwarePlatform/hcRNG.git && cd hcRNG chmod +x build.sh && ./build.sh @@ -286,7 +286,7 @@ AMD is hosting both debian and rpm repositories for the ROCm 2.7 packages. The p Complete installation steps of ROCm can be found `Here `_ -or +or For Debian based systems, like Ubuntu, configure the Debian ROCm repository as follows: @@ -519,7 +519,7 @@ Build dependencies To develop the clFFT library code on a Windows operating system, ensure to install the following packages on your system: - * Windows® 7/8.1 + * Windows(R) 7/8.1 * Visual Studio 2012 or later @@ -548,7 +548,7 @@ To test the developed clFFT library code, ensure to install the following packag * Googletest v1.6 * Latest FFTW - + * Latest Boost Performance infrastructure @@ -565,7 +565,7 @@ clBLAS For Github repository `clBLAS `_ -This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. +This repository houses the code for the OpenCL(TM) BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. @@ -716,7 +716,7 @@ Build dependencies ########################################## **Library for Windows** - * Windows® 7/8 + * Windows(R) 7/8 * Visual Studio 2010 SP1, 2012 * An OpenCL SDK, such as APP SDK 2.8 * Latest CMake @@ -749,10 +749,10 @@ Python ************** clSPARSE ************** - + For Github repository `clSPARSE `_ -an OpenCL™ library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. +an OpenCL(TM) library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. 
What's new in clSPARSE v0.10.1 ################################### @@ -779,7 +779,7 @@ clSPARSE features * Dense to CSR conversions (& converse) * COO to CSR conversions (& converse) * Functions to read matrix market files in COO or CSR format -True in spirit with the other clMath libraries, clSPARSE exports a “C” interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API’s to make them less ‘cluttered’ compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 +True in spirit with the other clMath libraries, clSPARSE exports a "C" interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API's to make them less 'cluttered' compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 API semantic versioning ############################## @@ -808,7 +808,7 @@ clSPARSE is licensed under the `Apache License `_) * Solution (.sln) or @@ -850,12 +850,12 @@ clSPARSE is licensed under the `Apache License `_ A library for uniform random number generation in OpenCL. -Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4×32-10 generators. +Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4x32-10 generators. What's New @@ -873,7 +873,7 @@ Building ############## 1. Install the runtime dependency: * An OpenCL SDK, such as APP SDK. - + 2. Install the build dependencies: * The CMake cross-platform build system. Visual Studio users can use CMake Tools for Visual Studio. @@ -906,7 +906,7 @@ On a 64-bit Linux platform, steps 3 through 9 from above, executed in a Bash-com export CLRNG_ROOT=$PWD/package export LD_LIBRARY_PATH=$CLRNG_ROOT/lib64:$LD_LIBRARY_PATH $CLRNG_ROOT/bin/CTest - + **Examples** Examples can be found in src/client. The compiled client program examples can be found under the bin subdirectory of the installation package ($CLRNG_ROOT/bin under Linux). Note that the examples expect an OpenCL GPU device to be available. @@ -1047,7 +1047,7 @@ The following are the steps to use the library **ROCM 2.7 Installation** -To Know more about ROCM refer +To Know more about ROCM refer https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md **a. 
Installing Debian ROCM repositories** @@ -1083,8 +1083,8 @@ and Reboot the system Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: - * cd /opt/rocm/hsa/sample - * make + * cd /opt/rocm/hsa/sample + * make * ./vector_copy **Library Installation** @@ -1129,7 +1129,7 @@ The following are the sub-routines that are implemented KeyFeature ############# - + * Support 1D, 2D and 3D Fast Fourier Transforms * Supports R2C, C2R, C2C, D2Z, Z2D and Z2Z Transforms * Support Out-Of-Place data storage @@ -1145,7 +1145,7 @@ This section lists the known set of hardware and software requirements to build **Hardware** - * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU + * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU * System Memory >= 4GB (Better if >10GB for NN application over multiple GPUs) * Hard Drive > 200GB (Better if SSD or NVMe driver for NN application over multiple GPUs) * Minimum GPU Memory (Global) > 2GB @@ -1197,7 +1197,7 @@ file: hcfft_1D_R2C.cpp :: #!c++ - + #include #include #include "hcfft.h" @@ -1239,9 +1239,9 @@ file: hcfft_1D_R2C.cpp free(input); free(output); hc::am_free(idata); - hc::am_free(odata); + hc::am_free(odata); } - + * Compiling the example code: Assuming the library and compiler installation is followed as in installation. @@ -1264,9 +1264,9 @@ This sections enumerates the list of tested combinations of Hardware and system **GPU Cards** - * Radeon R9 Nano + * Radeon R9 Nano * Radeon R9 FuryX - * Radeon R9 Fury + * Radeon R9 Fury * Kaveri and Carizo APU **Server System** diff --git a/ROCm_Libraries/hipsparse_wiki.rst b/ROCm_Libraries/hipsparse_wiki.rst index 02aef504..76dced9c 100644 --- a/ROCm_Libraries/hipsparse_wiki.rst +++ b/ROCm_Libraries/hipsparse_wiki.rst @@ -272,7 +272,7 @@ Exported sparse BLAS functions hipSPARSE includes the following auxiliary functions +------------------------------+ - | Function name | + | Function name | +==============================+ | hipsparseCreate | +------------------------------+ @@ -322,86 +322,86 @@ hipSPARSE includes the following auxiliary functions +------------------------------+ | hipsparseCreateCsrilu02Info | +------------------------------+ - - - + + + hipSPARSE includes the following Level 1, 2 and conversion functions ####################################################################### - + **Level 1** -================ ========== ========= ================ ================= ====== +================ ========== ========= ================ ================= ====== Function single double single complex double complex half ================ ========== ========= ================ ================= ====== -hipsparseXaxpyi x x -hipsparseXdoti x x -hipsparseXgthr x x -hipsparseXgthrz x x -hipsparseXroti x x -hipsparseXsctr x x +hipsparseXaxpyi x x +hipsparseXdoti x x +hipsparseXgthr x x +hipsparseXgthrz x x +hipsparseXroti x x +hipsparseXsctr x x ================ ========== ========= ================ ================= ====== **Level 2** -================================ ========== ========= ================ ================= ====== +================================ ========== ========= ================ ================= ====== Function single double single complex double complex half ================================ ========== ========= ================ ================= ====== -hipsparseXcsrmv x x -hipsparseXcsrsv2_bufferSize x x -hipsparseXcsrsv2_bufferSizeExt x x -hipsparseXcsrsv2_analysis x x 
-hipsparseXcsrsv2_solve x x -hipsparseXhybmv x x +hipsparseXcsrmv x x +hipsparseXcsrsv2_bufferSize x x +hipsparseXcsrsv2_bufferSizeExt x x +hipsparseXcsrsv2_analysis x x +hipsparseXcsrsv2_solve x x +hipsparseXhybmv x x ================================ ========== ========= ================ ================= ====== **Level 3** -================================ ========== ========= ================ ================= ====== +================================ ========== ========= ================ ================= ====== Function single double single complex double complex half ================================ ========== ========= ================ ================= ====== -hipsparseXcsrmm x x -hipsparseXcsrmm2 x x +hipsparseXcsrmm x x +hipsparseXcsrmm2 x x ================================ ========== ========= ================ ================= ====== **Extra** -================================ ========== ========= ================ ================= ====== +================================ ========== ========= ================ ================= ====== Function single double single complex double complex halfy ================================ ========== ========= ================ ================= ====== -hipsparseXcsrgemmNnz -hipsparseXcsrgemm x x -hipsparseXcsrgemm2_bufferSizeExt +hipsparseXcsrgemmNnz +hipsparseXcsrgemm x x +hipsparseXcsrgemm2_bufferSizeExt hipsparseXcsrgemm2Nnz hipsparseXcsrgemm2 ================================ ========== ========= ================ ================= ====== **Preconditioners** -================================= ========== ========= ================ ================= ====== +================================= ========== ========= ================ ================= ====== Function single double single complex double complex half ================================= ========== ========= ================ ================= ====== -hipsparseXcsrilu02_bufferSize x x -hipsparseXcsrilu02_bufferSizeExt x x -hipsparseXcsrilu02_analysis x x -hipsparseXcsrilu02 x x +hipsparseXcsrilu02_bufferSize x x +hipsparseXcsrilu02_bufferSizeExt x x +hipsparseXcsrilu02_analysis x x +hipsparseXcsrilu02 x x ================================= ========== ========= ================ ================= ====== **Conversion** -==================================== ========== ========= ================ ================= ====== +==================================== ========== ========= ================ ================= ====== Function single double single complex double complex half ==================================== ========== ========= ================ ================= ====== -hipsparseXcsr2coo -hipsparseXcsr2csc x x -hipsparseXcsr2hyb x x -hipsparseXcoo2csr -hipsparseCreateIdentityPermutation -hipsparseXcsrsort_bufferSizeExt -hipsparseXcsrsort -hipsparseXcoosort_bufferSizeExt -hipsparseXcoosortByRow -hipsparseXcoosortByColumn +hipsparseXcsr2coo +hipsparseXcsr2csc x x +hipsparseXcsr2hyb x x +hipsparseXcoo2csr +hipsparseCreateIdentityPermutation +hipsparseXcsrsort_bufferSizeExt +hipsparseXcsrsort +hipsparseXcoosort_bufferSizeExt +hipsparseXcoosortByRow +hipsparseXcoosortByColumn ==================================== ========== ========= ================ ================= ====== Additional notes diff --git a/ROCm_Libraries/rocALUTION/Doxyfile b/ROCm_Libraries/rocALUTION/Doxyfile index d8cad4ba..adf10b2e 100644 --- a/ROCm_Libraries/rocALUTION/Doxyfile +++ b/ROCm_Libraries/rocALUTION/Doxyfile @@ -163,7 +163,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. 
# This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -172,7 +172,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -239,13 +239,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -292,7 +292,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -642,7 +642,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -684,7 +684,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -697,7 +697,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -767,7 +767,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -797,7 +797,7 @@ INPUT = ROCm_Libraries/rocALUTION/src/modules.dox \ ROCm_Libraries/rocALUTION/src/solvers/direct \ ROCm_Libraries/rocALUTION/src/solvers/multigrid \ ROCm_Libraries/rocALUTION/src/solvers/preconditioners - + # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -878,7 +878,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -894,7 +894,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -905,13 +905,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -931,7 +931,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -948,7 +948,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -957,7 +957,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -972,7 +972,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1084,7 +1084,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1110,7 +1110,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1155,7 +1155,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard @@ -1165,7 +1165,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1177,7 +1177,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1190,7 +1190,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1200,7 +1200,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1329,7 +1329,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1337,7 +1337,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1350,7 +1350,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1381,7 +1381,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1406,7 +1406,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1414,21 +1414,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. 
-QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1561,7 +1561,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1569,7 +1569,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1629,7 +1629,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1645,7 +1645,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1655,7 +1655,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1719,7 +1719,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1735,7 +1735,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1746,7 +1746,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1757,7 +1757,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. 
-LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1765,7 +1765,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1865,14 +1865,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1917,7 +1917,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1936,7 +1936,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2030,7 +2030,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2071,7 +2071,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2079,7 +2079,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2089,7 +2089,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2098,7 +2098,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2127,13 +2127,13 @@ SKIP_FUNCTION_MACROS = YES # the path). 
If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2182,14 +2182,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2238,7 +2238,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2382,26 +2382,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2409,12 +2409,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. 
If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp b/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp index 8f3506fc..022a3195 100644 --- a/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp +++ b/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp @@ -394,7 +394,7 @@ class BaseMatrix BaseMatrix* prolong, BaseMatrix* restrict) const; - /// Ruge Stüben coarsening + /// Ruge Stuben coarsening virtual bool RugeStueben(ValueType eps, BaseMatrix* prolong, BaseMatrix* restrict) const; diff --git a/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt b/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt index 88780086..ba13da22 100644 --- a/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt +++ b/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt @@ -31,7 +31,7 @@ set(HOST_SOURCES base/host/host_matrix_hyb.cpp base/host/host_matrix_dense.cpp base/host/host_vector.cpp - base/host/host_conversion.cpp + base/host/host_conversion.cpp base/host/host_affinity.cpp base/host/host_io.cpp base/host/host_stencil_laplace2d.cpp diff --git a/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp b/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp index 2de7f288..692070be 100644 --- a/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp +++ b/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp @@ -4308,7 +4308,7 @@ bool HostMatrixCSR::RugeStueben(ValueType eps, set_to_zero_host(this->nrow_ + 1, S_row_offset); set_to_zero_host(this->nnz_, S_val); -// Determine strong influences in matrix (Ruge Stüben approach) +// Determine strong influences in matrix (Ruge Stuben approach) #ifdef _OPENMP #pragma omp parallel for schedule(dynamic, 1024) #endif diff --git a/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp b/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp index 973af6da..33064b58 100644 --- a/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp +++ b/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp @@ -56,7 +56,7 @@ void RugeStuebenAMG::Print(void) const { LOG_INFO("AMG solver"); LOG_INFO("AMG number of levels " << this->levels_); - LOG_INFO("AMG using Ruge-Stüben coarsening"); + LOG_INFO("AMG using Ruge-Stuben coarsening"); LOG_INFO("AMG coarsest operator size = " << this->op_level_[this->levels_ - 2]->GetM()); LOG_INFO("AMG coarsest level nnz = " << this->op_level_[this->levels_ - 2]->GetNnz()); LOG_INFO("AMG with smoother:"); @@ -70,7 +70,7 @@ void RugeStuebenAMG::PrintStart_(void) cons LOG_INFO("AMG solver starts"); LOG_INFO("AMG number of levels " << this->levels_); - LOG_INFO("AMG using Ruge-Stüben coarsening"); + LOG_INFO("AMG using Ruge-Stuben coarsening"); LOG_INFO("AMG coarsest operator size = " << this->op_level_[this->levels_ - 2]->GetM()); LOG_INFO("AMG coarsest level nnz = " << this->op_level_[this->levels_ - 2]->GetNnz()); LOG_INFO("AMG with smoother:"); diff --git a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp index eec8a31f..eeb4e2d6 100644 --- a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp +++ b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp @@ -291,7 +291,7 @@ class IC : public Preconditioner * \details * The Variable Preconditioner can hold a selection of preconditioners. Thus, any type * of preconditioners can be combined. 
As example, the variable preconditioner can - * combine Jacobi, GS and ILU – then, the first iteration of the iterative solver will + * combine Jacobi, GS and ILU - then, the first iteration of the iterative solver will * apply Jacobi, the second iteration will apply GS and the third iteration will apply * ILU. After that, the solver will start again with Jacobi, GS, ILU. * diff --git a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp index b4bda8f5..f924aae9 100644 --- a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp +++ b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp @@ -72,7 +72,7 @@ class AIChebyshev : public Preconditioner * \brief Factorized Approximate Inverse Preconditioner * \details * The Factorized Sparse Approximate Inverse preconditioner computes a direct - * approximation of \f$M^{-1}\f$ by minimizing the Frobenius norm \f$||I − GL||_{F}\f$, + * approximation of \f$M^{-1}\f$ by minimizing the Frobenius norm \f$||I - GL||_{F}\f$, * where \f$L\f$ denotes the exact lower triangular part of \f$A\f$ and \f$G:=M^{-1}\f$. * The FSAI preconditioner is initialized by \f$q\f$, based on the sparsity pattern of * \f$|A^{q}|\f$. However, it is also possible to supply external sparsity patterns in form @@ -134,7 +134,7 @@ class FSAI : public Preconditioner * The SParse Approximate Inverse algorithm is an explicitly computed preconditioner for * general sparse linear systems. In its current implementation, only the sparsity * pattern of the system matrix is supported. The SPAI computation is based on the - * minimization of the Frobenius norm \f$||AM − I||_{F}\f$. + * minimization of the Frobenius norm \f$||AM - I||_{F}\f$. * \cite grote * * \tparam OperatorType - can be LocalMatrix diff --git a/ROCm_Libraries/rocBLAS/Doxyfile b/ROCm_Libraries/rocBLAS/Doxyfile index 196cfa0e..1cd2a76e 100644 --- a/ROCm_Libraries/rocBLAS/Doxyfile +++ b/ROCm_Libraries/rocBLAS/Doxyfile @@ -164,7 +164,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -173,7 +173,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -240,13 +240,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -295,7 +295,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. 
-EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -649,7 +649,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -691,7 +691,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -704,7 +704,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -773,7 +773,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -786,7 +786,7 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. INPUT = ROCm_Libraries/rocBLAS/src/ \ - ROCm_Libraries/rocBLAS/src/src/ + ROCm_Libraries/rocBLAS/src/src/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -867,7 +867,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -883,7 +883,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -894,13 +894,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -920,7 +920,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -937,7 +937,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. 
If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -946,7 +946,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -961,7 +961,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1073,7 +1073,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1099,7 +1099,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1144,7 +1144,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1154,7 +1154,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1166,7 +1166,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1179,7 +1179,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1189,7 +1189,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1318,7 +1318,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
-CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1326,7 +1326,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1339,7 +1339,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1370,7 +1370,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1395,7 +1395,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1403,21 +1403,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1550,7 +1550,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1558,7 +1558,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1618,7 +1618,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. 
With the @@ -1634,7 +1634,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1644,7 +1644,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1708,7 +1708,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1724,7 +1724,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1735,7 +1735,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1746,7 +1746,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1754,7 +1754,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1854,14 +1854,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1906,7 +1906,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2018,7 +2018,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. 
-PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2059,7 +2059,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2067,7 +2067,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2077,7 +2077,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2086,7 +2086,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2115,13 +2115,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2170,14 +2170,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2226,7 +2226,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2370,26 +2370,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). 
# This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2397,12 +2397,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h b/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h index 954f6136..a4245df5 100644 --- a/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h +++ b/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h @@ -115,7 +115,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal(rocblas_handle handle, /*! \brief BLAS Level 1 API \details - scal_batched scales each element of vector x_i with scalar alpha, for i = 1, … , batch_count. + scal_batched scales each element of vector x_i with scalar alpha, for i = 1, ... , batch_count. x_i := alpha * x_i @@ -182,7 +182,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal_batched(rocblas_handle /*! \brief BLAS Level 1 API \details - scal_strided_batched scales each element of vector x_i with scalar alpha, for i = 1, … , batch_count. + scal_strided_batched scales each element of vector x_i with scalar alpha, for i = 1, ... , batch_count. x_i := alpha * x_i , @@ -262,7 +262,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal_strided_batched(rocblas_handle /*! \brief BLAS Level 1 API \details - copy copies each element x[i] into y[i], for i = 1 , … , n + copy copies each element x[i] into y[i], for i = 1 , ... , n y := x, @@ -316,7 +316,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zcopy(rocblas_handle handle /*! \brief BLAS Level 1 API \details - copy_batched copies each element x_i[j] into y_i[j], for j = 1 , … , n; i = 1 , … , batch_count + copy_batched copies each element x_i[j] into y_i[j], for j = 1 , ... , n; i = 1 , ... , batch_count y_i := x_i, @@ -380,7 +380,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zcopy_batched(rocblas_handle /*! \brief BLAS Level 1 API \details - copy_strided_batched copies each element x_i[j] into y_i[j], for j = 1 , … , n; i = 1 , … , batch_count + copy_strided_batched copies each element x_i[j] into y_i[j], for j = 1 , ... , n; i = 1 , ... , batch_count y_i := x_i, @@ -4561,7 +4561,7 @@ rocblas_zsyr(rocblas_handle handle, A[i] := A[i] + alpha*x[i]*x[i]**T where alpha is a scalar, x is an array of vectors, and A is an array of - n by n symmetric matrices, for i = 1 , … , batch_count + n by n symmetric matrices, for i = 1 , ... 
, batch_count @param[in] handle [rocblas_handle] @@ -4621,7 +4621,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_dsyr_batched(rocblas_handle handle, A[i] := A[i] + alpha*x[i]*x[i]**T where alpha is a scalar, vectors, and A is an array of - n by n symmetric matrices, for i = 1 , … , batch_count + n by n symmetric matrices, for i = 1 , ... , batch_count @param[in] handle [rocblas_handle] diff --git a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp index cc108c2d..cecedfba 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp @@ -38,7 +38,7 @@ constexpr char rocblas_copy_name[] = "rocblas_zcopy"; /*! \brief BLAS Level 1 API \details - copy copies the vector x[i] into the vector y[i], for i = 1 , … , n + copy copies the vector x[i] into the vector y[i], for i = 1 , ... , n y := x, diff --git a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp index 7508e7ca..ad880070 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp @@ -39,7 +39,7 @@ constexpr char rocblas_scal_name[] = "rocblas_zscal"; /*! \brief BLAS Level 1 API \details - scal scal the vector x[i] with scalar alpha, for i = 1 , … , n + scal scal the vector x[i] with scalar alpha, for i = 1 , ... , n x := alpha * x , diff --git a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp index 712b2e3d..8dc1f9c6 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp @@ -41,7 +41,7 @@ constexpr char rocblas_swap_name[] = "rocblas_zswap"; /*! \brief BLAS Level 1 API \details - swap interchange vector x[i] and y[i], for i = 1 , … , n + swap interchange vector x[i] and y[i], for i = 1 , ... 
, n y := x; x := y diff --git a/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp b/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp index 07b33167..11352bf8 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp @@ -296,13 +296,13 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, if((trans_a == rocblas_operation_none) && (trans_b == rocblas_operation_none)) { - t_status = tensile_Cijk_Ailk_Bljk_B(static_cast(d), - static_cast(c_in), - static_cast(a), + t_status = tensile_Cijk_Ailk_Bljk_B(static_cast(d), + static_cast(c_in), + static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), @@ -313,13 +313,13 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, else if((trans_a == rocblas_operation_none) && (trans_b == rocblas_operation_transpose || trans_b == rocblas_operation_conjugate_transpose)) { - t_status = tensile_Cijk_Ailk_Bjlk_B(static_cast(d), - static_cast(c_in), - static_cast(a), + t_status = tensile_Cijk_Ailk_Bjlk_B(static_cast(d), + static_cast(c_in), + static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), @@ -331,12 +331,12 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, (trans_b == rocblas_operation_none)) { t_status = tensile_Cijk_Alik_Bljk_B(static_cast(d), - static_cast(c_in), + static_cast(c_in), static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), @@ -348,12 +348,12 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, (trans_b == rocblas_operation_transpose || trans_b == rocblas_operation_conjugate_transpose)) { t_status = tensile_Cijk_Alik_Bjlk_B(static_cast(d), - static_cast(c_in), + static_cast(c_in), static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), diff --git a/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp b/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp index 93e87a70..9a80f7af 100644 --- a/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp @@ -1,41 +1,41 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. - * - * ************************************************************************ */ - -#include -#include -#include -#include "definitions.h" -#include "rocblas-types.h" -#include "rocblas-functions.h" -#include "rocblas-version.h" - -#define TO_STR2(x) #x -#define TO_STR(x) TO_STR2(x) -// clang-format off -#define VERSION_STRING \ - (TO_STR(ROCBLAS_VERSION_MAJOR) "." \ - TO_STR(ROCBLAS_VERSION_MINOR) "." \ - TO_STR(ROCBLAS_VERSION_PATCH) "." \ - TO_STR(ROCBLAS_VERSION_TWEAK) "-" \ - TO_STR(ROCBLAS_VERSION_COMMIT_ID)) -// clang-format on -/******************************************************************************* - *! \brief loads char* buf with the rocblas library version. 
size_t len - is the maximum length of char* buf. - ******************************************************************************/ -extern "C" rocblas_status rocblas_get_version_string(char* buf, size_t len) -{ - std::string v(VERSION_STRING); - strcpy(buf, v.c_str()); - - if(buf == NULL) - return rocblas_status_internal_error; - - size_t count = std::min(len - 1, v.length()); - memcpy(buf, v.c_str(), count); - *(buf + count) = '\0'; - - return rocblas_status_success; -} +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +#include +#include +#include +#include "definitions.h" +#include "rocblas-types.h" +#include "rocblas-functions.h" +#include "rocblas-version.h" + +#define TO_STR2(x) #x +#define TO_STR(x) TO_STR2(x) +// clang-format off +#define VERSION_STRING \ + (TO_STR(ROCBLAS_VERSION_MAJOR) "." \ + TO_STR(ROCBLAS_VERSION_MINOR) "." \ + TO_STR(ROCBLAS_VERSION_PATCH) "." \ + TO_STR(ROCBLAS_VERSION_TWEAK) "-" \ + TO_STR(ROCBLAS_VERSION_COMMIT_ID)) +// clang-format on +/******************************************************************************* + *! \brief loads char* buf with the rocblas library version. size_t len + is the maximum length of char* buf. + ******************************************************************************/ +extern "C" rocblas_status rocblas_get_version_string(char* buf, size_t len) +{ + std::string v(VERSION_STRING); + strcpy(buf, v.c_str()); + + if(buf == NULL) + return rocblas_status_internal_error; + + size_t count = std::min(len - 1, v.length()); + memcpy(buf, v.c_str(), count); + *(buf + count) = '\0'; + + return rocblas_status_success; +} diff --git a/ROCm_Libraries/rocFFT/Doxyfile b/ROCm_Libraries/rocFFT/Doxyfile index d7cd8a71..ab5ce1b3 100644 --- a/ROCm_Libraries/rocFFT/Doxyfile +++ b/ROCm_Libraries/rocFFT/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. 
-EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -858,7 +858,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -885,13 +885,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -928,7 +928,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -937,7 +937,7 @@ INPUT_FILTER = # filters are used. 
If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1136,7 +1136,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1146,7 +1146,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1158,7 +1158,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1171,7 +1171,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1181,7 +1181,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1310,7 +1310,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1318,7 +1318,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
-HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1331,7 +1331,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1362,7 +1362,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1387,7 +1387,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1395,21 +1395,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1542,7 +1542,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1550,7 +1550,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1610,7 +1610,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1626,7 +1626,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. 
-EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1636,7 +1636,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1700,7 +1700,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1716,7 +1716,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1727,7 +1727,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1738,7 +1738,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1746,7 +1746,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1846,14 +1846,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1898,7 +1898,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1917,7 +1917,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. 
If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2011,7 +2011,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2052,7 +2052,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2060,7 +2060,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2070,7 +2070,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2079,7 +2079,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2108,13 +2108,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2163,14 +2163,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2219,7 +2219,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2363,26 +2363,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. 
# This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2390,12 +2390,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocSOLVER/API.rst b/ROCm_Libraries/rocSOLVER/API.rst index bdfb6ff3..bf80aac6 100644 --- a/ROCm_Libraries/rocSOLVER/API.rst +++ b/ROCm_Libraries/rocSOLVER/API.rst @@ -1,12 +1,12 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* rocSOLVER API ************* -This section provides details of the rocSOLVER library API as in release +This section provides details of the rocSOLVER library API as in release `ROCm 2.10 `_. @@ -14,7 +14,7 @@ This section provides details of the rocSOLVER library API as in release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -312,7 +312,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. rocSOLVER handle auxiliaries diff --git a/ROCm_Libraries/rocSOLVER/Doxyfile b/ROCm_Libraries/rocSOLVER/Doxyfile index de295523..45b8d873 100644 --- a/ROCm_Libraries/rocSOLVER/Doxyfile +++ b/ROCm_Libraries/rocSOLVER/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = ./rocmlogo.png # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -#OUTPUT_DIRECTORY = +#OUTPUT_DIRECTORY = # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
-STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -706,7 +706,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -858,7 +858,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) 
that should be excluded from the @@ -885,13 +885,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -928,7 +928,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -937,7 +937,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1144,7 +1144,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1156,7 +1156,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. 
-HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1169,7 +1169,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1316,7 +1316,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1329,7 +1329,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1360,7 +1360,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. 
To @@ -1549,7 +1549,7 @@ MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1609,7 +1609,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1625,7 +1625,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1635,7 +1635,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1699,7 +1699,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1715,7 +1715,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1726,7 +1726,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1737,7 +1737,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1745,7 +1745,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1845,14 +1845,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. 
-RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1897,7 +1897,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2010,7 +2010,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2051,7 +2051,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2059,7 +2059,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2069,7 +2069,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2078,7 +2078,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2107,13 +2107,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2162,14 +2162,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. 
@@ -2218,7 +2218,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2362,26 +2362,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2389,12 +2389,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocSOLVER/Introduction.rst b/ROCm_Libraries/rocSOLVER/Introduction.rst index 5d75fcda..a98d401a 100644 --- a/ROCm_Libraries/rocSOLVER/Introduction.rst +++ b/ROCm_Libraries/rocSOLVER/Introduction.rst @@ -1,14 +1,14 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* Introduction ************* -rocSOLVER is a library of Lapack routines on top of AMD’s Radeon Open Compute Platform (ROCm) runtime and toolchains. -rocSOLVER is implemented in the HIP programming language and based on an optimized BLAS -implementation for AMD’s latest discrete GPUs. +rocSOLVER is a library of Lapack routines on top of AMD's Radeon Open Compute Platform (ROCm) runtime and toolchains. +rocSOLVER is implemented in the HIP programming language and based on an optimized BLAS +implementation for AMD's latest discrete GPUs. For more information about rocBLAS, see `rocBLAS `_. @@ -19,9 +19,9 @@ Build and Install Prerequisites -------------- -For installation, rocSOLVER requires `cmake `_ -and `ROCm `_, including -`hip `_ and +For installation, rocSOLVER requires `cmake `_ +and `ROCm `_, including +`hip `_ and `rocBLAS `_ @@ -31,7 +31,7 @@ Installation Follow the instructions below to build and install rocSOLVER: .. code-block:: bash - + mkdir build && cd build CXX=/opt/rocm/bin/hcc cmake .. 
make @@ -48,48 +48,48 @@ The following table summarizes the LAPACK functionality implemented in rocSOLVER =============================== ====== ====== ============== ============== Lapack Auxiliary Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_laswp** x x x x -**rocsolver_larfg** x x +**rocsolver_laswp** x x x x +**rocsolver_larfg** x x **rocsolver_larft** x x **rocsolver_larf** x x -**rocsolver_larfb** x x -**rocsolver_org2r** x x -**rocsolver_orgqr** x x -**rocsolver_orgl2** x x -**rocsolver_orglq** x x -**rocsolver_orgbr** x x -**rocsolver_orm2r** x x -**rocsolver_ormqr** x x +**rocsolver_larfb** x x +**rocsolver_org2r** x x +**rocsolver_orgqr** x x +**rocsolver_orgl2** x x +**rocsolver_orglq** x x +**rocsolver_orgbr** x x +**rocsolver_orm2r** x x +**rocsolver_ormqr** x x =============================== ====== ====== ============== ============== =============================== ====== ====== ============== ============== Lapack Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_potf2** x x -rocsolver_potf2_batched x x -rocsolver_potf2_strided_batched x x -**rocsolver_potrf** x x -rocsolver_potrf_batched x x -rocsolver_potrf_strided_batched x x +**rocsolver_potf2** x x +rocsolver_potf2_batched x x +rocsolver_potf2_strided_batched x x +**rocsolver_potrf** x x +rocsolver_potrf_batched x x +rocsolver_potrf_strided_batched x x **rocsolver_getf2** x x x x rocsolver_getf2_batched x x x x rocsolver_getf2_strided_batched x x x x -**rocsolver_getrf** x x x x +**rocsolver_getrf** x x x x rocsolver_getrf_batched x x x x rocsolver_getrf_strided_batched x x x x -**rocsolver_geqr2** x x +**rocsolver_geqr2** x x rocsolver_geqr2_batched x x rocsolver_geqr2_strided_batched x x -**rocsolver_geqrf** x x -rocsolver_geqrf_batched x x +**rocsolver_geqrf** x x +rocsolver_geqrf_batched x x rocsolver_geqrf_strided_batched x x -**rocsolver_gelq2** x x +**rocsolver_gelq2** x x rocsolver_gelq2_batched x x rocsolver_gelq2_strided_batched x x -**rocsolver_gelqf** x x -rocsolver_gelqf_batched x x +**rocsolver_gelqf** x x +rocsolver_gelqf_batched x x rocsolver_gelqf_strided_batched x x -**rocsolver_getrs** x x x x +**rocsolver_getrs** x x x x rocsolver_getrs_batched x x x x rocsolver_getrs_strided_batched x x x x =============================== ====== ====== ============== ============== @@ -97,30 +97,30 @@ rocsolver_getrs_strided_batched x x x x Benchmarking and Testing ========================== -For testing and benchmarking, rocSOLVER has a basic/preliminary infrastructure similar to rocBLAS. +For testing and benchmarking, rocSOLVER has a basic/preliminary infrastructure similar to rocBLAS. -On a normal installation, clients are located in the directory **/build/clients/staging**. +On a normal installation, clients are located in the directory **/build/clients/staging**. **rocsolver-test** executes a suite of `Google tests `_ (*gtest*) that verifies the correct -functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by +functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by `NETLib LAPACK `_ on the CPU. Calling the rocSOLVER gtest client with the --help flag .. code-block:: bash - + ./rocsolver-test --help -returns information on different flags that control the behavior of the gtests. 
+returns information on different flags that control the behavior of the gtests. **rocsolver-bench** allows to run any rocSOLVER function with random data of the specified dimensions; it compares the computed results, and provides basic -performance information (as for now, execution times). +performance information (as for now, execution times). -Similarly, +Similarly, .. code-block:: bash - + ./rocsolver-bench --help -returns information on how to use the rocSOLVER benchmark client. - +returns information on how to use the rocSOLVER benchmark client. + diff --git a/ROCm_Libraries/rocSOLVER/Jenkinsfile b/ROCm_Libraries/rocSOLVER/Jenkinsfile index e8d0d1de..7c9b42b0 100644 --- a/ROCm_Libraries/rocSOLVER/Jenkinsfile +++ b/ROCm_Libraries/rocSOLVER/Jenkinsfile @@ -26,8 +26,8 @@ rocSOLVERCI: { def rocsolver = new rocProject('rocSOLVER') - - def nodes = new dockerNodes(['internal && gfx900 && ubuntu16', 'internal && gfx906 && ubuntu16', 'internal && gfx906 && centos7', + + def nodes = new dockerNodes(['internal && gfx900 && ubuntu16', 'internal && gfx906 && ubuntu16', 'internal && gfx906 && centos7', 'internal && gfx900 && centos7','internal && gfx900 && ubuntu16 && hip-clang', 'internal && gfx906 && ubuntu16 && hip-clang', 'internal && gfx900 && sles', 'internal && gfx906 && sles'], rocsolver) @@ -43,7 +43,7 @@ rocSOLVERCI: String compiler = platform.jenkinsLabel.contains('hip-clang') ? 'hipcc' : 'hcc' String branch = platform.jenkinsLabel.contains('hip-clang') ? 'hip-clang' : 'develop' String build_command = "${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/${compiler} -Damd_comgr_DIR=/opt/rocm/lib/cmake/amd_comgr .." - + def getRocBLAS = auxiliary.getLibrary('rocBLAS',platform.jenkinsLabel,branch) def command = """#!/usr/bin/env bash set -x @@ -81,7 +81,7 @@ rocSOLVERCI: finally { junit "${project.paths.project_build_prefix}/build/clients/staging/*.xml" - } + } } def packageCommand = @@ -90,7 +90,7 @@ rocSOLVERCI: String branch = platform.jenkinsLabel.contains('hip-clang') ? 'hip-clang' : 'develop' def getRocBLAS = auxiliary.getLibrary('rocBLAS',platform.jenkinsLabel,branch) - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build",false,getRocBLAS) + def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build",false,getRocBLAS) platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) diff --git a/ROCm_Libraries/rocSOLVER/LICENSE.md b/ROCm_Libraries/rocSOLVER/LICENSE.md index 6f3eab60..22991b38 100644 --- a/ROCm_Libraries/rocSOLVER/LICENSE.md +++ b/ROCm_Libraries/rocSOLVER/LICENSE.md @@ -1,4 +1,4 @@ -Copyright © 2018 Advanced Micro Devices, Inc. +Copyright (C) 2018 Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/ROCm_Libraries/rocSOLVER/bump_develop_version.sh b/ROCm_Libraries/rocSOLVER/bump_develop_version.sh index 6d6f9f44..f30aa8b5 100644 --- a/ROCm_Libraries/rocSOLVER/bump_develop_version.sh +++ b/ROCm_Libraries/rocSOLVER/bump_develop_version.sh @@ -2,8 +2,8 @@ # This script needs to be edited to bump new master version to new develop for new release. 
# - run this script after running bump_master_version.sh and merging develop into master -# - run this script in master branch -# - after running this script merge master into develop +# - run this script in master branch +# - after running this script merge master into develop OLD_ROCSOLVER_VERSION="0.0.1" NEW_ROCSOLVER_VERSION="0.0.2" diff --git a/ROCm_Libraries/rocSOLVER/bump_master_version.sh b/ROCm_Libraries/rocSOLVER/bump_master_version.sh index d6da7160..9c42e6b2 100644 --- a/ROCm_Libraries/rocSOLVER/bump_master_version.sh +++ b/ROCm_Libraries/rocSOLVER/bump_master_version.sh @@ -1,7 +1,7 @@ #!/bin/sh # This script needs to be edited to bump old develop version to new master version for new release. -# - run this script in develop branch +# - run this script in develop branch # - after running this script merge develop into master # - after running this script and merging develop into master, run bump_develop_version.sh in master and # merge master into develop diff --git a/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake b/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake index 110bcfaa..5483d094 100644 --- a/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake +++ b/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake @@ -22,4 +22,4 @@ function( append_cmake_cli_arguments initial_cli_args return_cli_args ) # message( STATUS "get_command_line_arguments: ${cli_args}") set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE ) -endfunction( ) \ No newline at end of file +endfunction( ) diff --git a/ROCm_Libraries/rocSOLVER/debian/postinst b/ROCm_Libraries/rocSOLVER/debian/postinst index 8675688f..36acd581 100644 --- a/ROCm_Libraries/rocSOLVER/debian/postinst +++ b/ROCm_Libraries/rocSOLVER/debian/postinst @@ -1,4 +1,3 @@ echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocsolver-alt.conf ldconfig - \ No newline at end of file diff --git a/ROCm_Libraries/rocSOLVER/debian/prerm b/ROCm_Libraries/rocSOLVER/debian/prerm index 0d084f2c..748f5a80 100644 --- a/ROCm_Libraries/rocSOLVER/debian/prerm +++ b/ROCm_Libraries/rocSOLVER/debian/prerm @@ -1,4 +1,3 @@ rm /etc/ld.so.conf.d/rocsolver-alt.conf ldconfig - \ No newline at end of file diff --git a/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake b/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake index 7355eb98..6dc43477 100644 --- a/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake +++ b/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake @@ -39,7 +39,7 @@ ExternalProject_Add( ) # The fortran flag '-fno-optimize-sibling-calls' has been added as a workaround for a known bug # that causes incompatibility issues between gfortran and C lapack calls for gfortran versions 7,8 and 9 -# The ticket can be tracked at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90329 +# The ticket can be tracked at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90329 ExternalProject_Get_Property( lapack source_dir ) diff --git a/ROCm_Libraries/rocSOLVER/docs/Doxyfile b/ROCm_Libraries/rocSOLVER/docs/Doxyfile index c41190c8..d9539384 100644 --- a/ROCm_Libraries/rocSOLVER/docs/Doxyfile +++ b/ROCm_Libraries/rocSOLVER/docs/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
-STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -639,7 +639,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -681,7 +681,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -694,7 +694,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -704,7 +704,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -763,7 +763,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -856,7 +856,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -872,7 +872,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -883,13 +883,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -909,7 +909,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -926,7 +926,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -935,7 +935,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -950,7 +950,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1062,7 +1062,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1088,7 +1088,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1132,7 +1132,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard @@ -1142,7 +1142,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1154,7 +1154,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1167,7 +1167,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1177,7 +1177,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1306,7 +1306,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1314,7 +1314,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1327,7 +1327,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1358,7 +1358,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1383,7 +1383,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1391,21 +1391,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. 
-QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1538,7 +1538,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1546,7 +1546,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1606,7 +1606,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1622,7 +1622,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1632,7 +1632,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1696,7 +1696,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1712,7 +1712,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1723,7 +1723,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1734,7 +1734,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. 
-LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1742,7 +1742,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1842,14 +1842,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1894,7 +1894,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2007,7 +2007,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2048,7 +2048,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2056,7 +2056,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2066,7 +2066,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2075,7 +2075,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2104,13 +2104,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2159,14 +2159,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2215,7 +2215,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2359,26 +2359,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2386,12 +2386,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h index cd388512..3fbbfaf4 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h +++ b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h @@ -42,7 +42,7 @@ extern "C" { n rocsolver_int. n >= 0.\n The number of columns of the matrix A. @param[inout] - A pointer to type. Array on the GPU of dimension lda*n. \n + A pointer to type. Array on the GPU of dimension lda*n. \n On entry, the matrix of column dimension n to which the row interchanges will be applied. On exit, the permuted matrix. 
@param[in] @@ -59,7 +59,7 @@ extern "C" { @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension at least k1 + (k2 - k1) * abs(incx).\n The vector of pivot indices. Only the elements in positions - k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. + k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. Elements of ipiv are considered 1-based. @param[in] incx rocsolver_int. incx != 0.\n @@ -67,92 +67,92 @@ extern "C" { is negative, the pivots are applied in reverse order. *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, const rocsolver_int n, - float *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + float *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, const rocsolver_int n, - double *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + double *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_float_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_double_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -/*! \brief LARFG generates an orthogonal Householder reflector H of order n. +/*! \brief LARFG generates an orthogonal Householder reflector H of order n. \details Householder reflector H is such that - + H * [alpha] = [beta] [ x ] [ 0 ] - where x is an n-1 vector and alpha and beta are scalars. Matrix H can be + where x is an n-1 vector and alpha and beta are scalars. Matrix H can be generated as - + H = I - tau * [1] * [1 v'] [v] - with v an n-1 vector and tau a scalar. + with v an n-1 vector and tau a scalar. @param[in] handle rocsolver_handle @param[in] n rocsolver_int. n >= 0.\n - The order (size) of reflector H. + The order (size) of reflector H. @param[inout] alpha pointer to type. A scalar on the GPU.\n - On input the scalar alpha, + On input the scalar alpha, on output it is overwritten with beta. - @param[inout] + @param[inout] x pointer to type. Array on the GPU of size at least n-1.\n - On input it is the vector x, + On input it is the vector x, on output it is overwritten with vector v. @param[in] incx rocsolver_int. incx > 0.\n - The increment between consecutive elements of x. 
+ The increment between consecutive elements of x. @param[out] tau pointer to type. A scalar on the GPU.\n The scalar tau. *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, + const rocsolver_int n, float *alpha, - float *x, - const rocsolver_int incx, + float *x, + const rocsolver_int incx, float *tau); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, + const rocsolver_int n, double *alpha, - double *x, - const rocsolver_int incx, + double *x, + const rocsolver_int incx, double *tau); @@ -164,9 +164,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = H(1) * H(2) * ... * H(k) (forward direction), or H = H(k) * ... * H(2) * H(1) (backward direction) - depending on the value of direct. + depending on the value of direct. - The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. + The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. If storev is column-wise, then H = I - V * T * V' @@ -175,7 +175,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i). + where the i-th row of matrix V contains the Householder vector associated to H(i). @param[in] handle rocsolver_handle. @@ -188,10 +188,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n The order (size) of the block reflector. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] + @param[in] V pointer to type. Array on the GPU of size ldv*k if column-wise, or ldv*n if row-wise.\n The matrix of Householder vectors. @param[in] @@ -203,44 +203,44 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[out] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor. T is upper triangular is forward operation, otherwise it is lower triangular. - The rest of the array is not used. - @param[in] + The rest of the array is not used. + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. - **************************************************************************/ + **************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarft(rocsolver_handle handle, - const rocsolver_direct direct, + const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, float *tau, - float *T, - const rocsolver_int ldt); + float *T, + const rocsolver_int ldt); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_storev storev, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, double *tau, - double *T, - const rocsolver_int ldt); + double *T, + const rocsolver_int ldt); /*! \brief LARF applies a Householder reflector H to a general matrix A. 
\details The Householder reflector H, of order m (or n), is to be applied to a m-by-n matrix A - from the left (or the right). H is given by + from the left (or the right). H is given by H = I - alpha * x * x' - + where alpha is a scalar and x a Householder vector. H is never actually computed. @param[in] @@ -254,16 +254,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, Number of rows of A. @param[in] n rocsolver_int. n >= 0.\n - Number of columns of A. + Number of columns of A. @param[in] - x pointer to type. Array on the GPU of + x pointer to type. Array on the GPU of size at least (1 + (m-1)*abs(incx)) if left side, or at least (1 + (n-1)*abs(incx)) if right side.\n The Householder vector x. @param[in] incx rocsolver_int. incx != 0.\n - Increment between to consecutive elements of x. - If incx < 0, the elements of x are used in reverse order. + Increment between to consecutive elements of x. + If incx < 0, the elements of x are used in reverse order. @param[in] alpha pointer to type. A scalar on the GPU.\n If alpha = 0, then H = I (A will remain the same, x is never used) @@ -273,35 +273,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, H*A (or A*H). @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. - + Leading dimension of A. + *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - float* x, - const rocsolver_int incx, + const rocsolver_int n, + float* x, + const rocsolver_int incx, const float* alpha, - float* A, + float* A, const rocsolver_int lda); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - double* x, - const rocsolver_int incx, + const rocsolver_int n, + double* x, + const rocsolver_int incx, const double* alpha, - double* A, + double* A, const rocsolver_int lda); /*! \brief LARFB applies a block reflector H to a general m-by-n matrix A. \details - The block reflector H is applied in one of the following forms, depending on + The block reflector H is applied in one of the following forms, depending on the values of side and trans: H * A (No transpose from the left) @@ -322,7 +322,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. + where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. T is the associated triangular factor as computed by LARFT. @param[in] @@ -345,11 +345,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix A. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] - V pointer to type. Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, + @param[in] + V pointer to type. 
Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, or ldv*m if row-wise and applying from the left.\n The matrix of Householder vectors. @param[in] @@ -359,16 +359,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor of the block reflector. - @param[in] + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. @param[inout] A pointer to type. Array on the GPU of size lda*n.\n On input, the matrix A. On output it is overwritten with - H*A, A*H, H'*A, or A*H'. + H*A, A*H, H'*A, or A*H'. @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. + Leading dimension of A. ****************************************************************************/ @@ -376,31 +376,31 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, - float *T, + float *T, const rocsolver_int ldt, float *A, - const rocsolver_int lda); + const rocsolver_int lda); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, - double *T, + double *T, const rocsolver_int ldt, double *A, - const rocsolver_int lda); + const rocsolver_int lda); /*! \brief ORG2R generates a m-by-n Matrix Q with orthonormal columns. @@ -409,17 +409,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -433,7 +433,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. 
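A minimal host-side sketch of driving the LARFG routine documented in the hunks above, included for orientation only. It assumes the handle is created and destroyed with rocsolver_create_handle / rocsolver_destroy_handle (auxiliary calls not shown in this header), that the public header is <rocsolver.h>, and it uses the HIP runtime for device memory; adjust those assumptions to the release in use.

    #include <hip/hip_runtime.h>
    #include <rocsolver.h>                       /* assumed header name for this release */

    int main(void) {
        /* Reflector of order n = 4: a scalar alpha plus an (n-1)-vector x. */
        double h_alpha = 3.0;
        double h_x[3]  = { 1.0, 2.0, 2.0 };

        rocsolver_handle handle;
        rocsolver_create_handle(&handle);        /* assumed auxiliary API */

        double *d_alpha, *d_x, *d_tau;
        hipMalloc((void**)&d_alpha, sizeof(double));
        hipMalloc((void**)&d_x, 3 * sizeof(double));
        hipMalloc((void**)&d_tau, sizeof(double));
        hipMemcpy(d_alpha, &h_alpha, sizeof(double), hipMemcpyHostToDevice);
        hipMemcpy(d_x, h_x, 3 * sizeof(double), hipMemcpyHostToDevice);

        /* On exit d_alpha holds beta, d_x holds the Householder vector v,
           and d_tau holds tau, as described for LARFG above. */
        rocsolver_dlarfg(handle, 4, d_alpha, d_x, 1, d_tau);

        hipFree(d_alpha); hipFree(d_x); hipFree(d_tau);
        rocsolver_destroy_handle(handle);        /* assumed auxiliary API */
        return 0;
    }

All arguments to rocsolver_dlarfg live on the GPU, matching the parameter descriptions above; only the setup and teardown happen on the host.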
@@ -442,16 +442,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -463,17 +463,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -487,7 +487,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. @@ -496,16 +496,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -517,17 +517,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -541,7 +541,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. 
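The ORG2R/ORGQR descriptions above refer to the GEQRF family; as a rough illustration of that pattern, the sketch below factors a small column-major matrix with the unblocked GEQR2 routine documented later in this header and then overwrites it with the explicit Q. The same assumptions as the previous sketch apply (handle helpers, header name, LAPACK-style column-major storage with lda = m).

    #include <hip/hip_runtime.h>
    #include <rocsolver.h>                       /* assumed header name */

    int main(void) {
        rocsolver_handle handle;
        rocsolver_create_handle(&handle);        /* assumed auxiliary API */

        /* A is 3x3, column-major, lda = m = 3. */
        const rocsolver_int m = 3, n = 3, lda = 3;
        double hA[9] = { 12, 6, -4,   -51, 167, 24,   4, -68, -41 };

        double *dA, *dIpiv;                      /* dIpiv: min(m,n) Householder scalars */
        hipMalloc((void**)&dA, sizeof(double) * lda * n);
        hipMalloc((void**)&dIpiv, sizeof(double) * 3);
        hipMemcpy(dA, hA, sizeof(double) * lda * n, hipMemcpyHostToDevice);

        rocsolver_dgeqr2(handle, m, n, dA, lda, dIpiv);     /* R above the diagonal, v(i) below */
        rocsolver_dorgqr(handle, m, n, n, dA, lda, dIpiv);  /* overwrite A with the explicit Q, k = min(m,n) */

        hipMemcpy(hA, dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);
        hipFree(dA); hipFree(dIpiv);
        rocsolver_destroy_handle(handle);        /* assumed auxiliary API */
        return 0;
    }

The two calls reuse the same device array: GEQR2 leaves the Householder factors in A and their scalars in dIpiv, and ORGQR consumes exactly that layout, as the parameter descriptions above state.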
@@ -550,16 +550,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -572,17 +572,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -596,7 +596,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. @@ -605,16 +605,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -622,9 +622,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, /*! \brief ORGBR generates a m-by-n Matrix Q with orthonormal rows or columns. \details - If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first + If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) If m < k, Q is defined as the product of Householder reflectors of order m @@ -635,12 +635,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, first m rows of the product of k Householder reflectors of order n Q = H(k) * H(k-1) * ... * H(1) - + If n <= k, Q is defined as the product of Householder reflectors of order n Q = H(n-1) * H(n-2) * ... * H(1) - The Householder matrices H(i) are never stored, they are computed from its corresponding + The Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEBRD. 
@param[in] @@ -650,12 +650,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, Specifies whether to work column-wise or row-wise. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. If row-wise, then min(n,k) <= m <= n. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix Q. - If column-wise, then min(m,k) <= n <= m. + The number of colums of the matrix Q. + If column-wise, then min(m,k) <= n <= m. @param[in] k rocsolver_int. k >= 0.\n The number of columns (if storev is colum-wise) or rows (if row-wise) of the @@ -667,7 +667,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension min(m,k) if column-wise, or min(n,k) if row-wise.\n The scalar factors of the Householder matrices H(i) as returned by GEBRD. @@ -677,8 +677,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); @@ -686,8 +686,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -696,8 +696,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, \details (This is the unblocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -709,7 +709,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -726,10 +726,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -742,19 +742,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. 
+ ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -765,8 +765,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -777,8 +777,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, \details (This is the blocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -790,7 +790,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -807,10 +807,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -823,19 +823,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. + ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -846,8 +846,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -880,10 +880,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. 
Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -891,7 +891,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -900,14 +900,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. - + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -915,7 +915,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -923,7 +923,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -931,7 +931,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -968,8 +968,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -981,17 +981,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1001,7 +1001,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1011,7 +1011,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1021,7 +1021,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1034,7 +1034,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand \details (This is the right-looking Level 2 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1064,8 +1064,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1077,17 +1077,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1098,7 +1098,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1109,7 +1109,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1120,7 +1120,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1147,10 +1147,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -1158,7 +1158,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -1167,14 +1167,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. 
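One possible call sequence for the GETRF routine described above, with the pivot vector and info flag copied back to the host to check for a singular U. The info argument is placed last, following the documented parameter order; the handle helpers and header name are again assumptions rather than part of this header.

    #include <stdio.h>
    #include <hip/hip_runtime.h>
    #include <rocsolver.h>                        /* assumed header name */

    int main(void) {
        rocsolver_handle handle;
        rocsolver_create_handle(&handle);         /* assumed auxiliary API */

        /* 3x3 column-major matrix, lda = m. */
        const rocsolver_int m = 3, n = 3, lda = 3;
        double hA[9] = { 2, -3, -1,   1, -1, 2,   3, 2, 1 };
        rocsolver_int hIpiv[3], hInfo;

        double *dA; rocsolver_int *dIpiv, *dInfo;
        hipMalloc((void**)&dA, sizeof(double) * lda * n);
        hipMalloc((void**)&dIpiv, sizeof(rocsolver_int) * 3);
        hipMalloc((void**)&dInfo, sizeof(rocsolver_int));
        hipMemcpy(dA, hA, sizeof(double) * lda * n, hipMemcpyHostToDevice);

        /* A <- P * L * U with partial pivoting; ipiv holds 1-based pivot rows. */
        rocsolver_dgetrf(handle, m, n, dA, lda, dIpiv, dInfo);

        hipMemcpy(hA, dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);
        hipMemcpy(hIpiv, dIpiv, sizeof(rocsolver_int) * 3, hipMemcpyDeviceToHost);
        hipMemcpy(&hInfo, dInfo, sizeof(rocsolver_int), hipMemcpyDeviceToHost);
        if (hInfo > 0) printf("U(%d,%d) is zero; U is singular\n", hInfo, hInfo);

        hipFree(dA); hipFree(dIpiv); hipFree(dInfo);
        rocsolver_destroy_handle(handle);         /* assumed auxiliary API */
        return 0;
    }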
- + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1182,7 +1182,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1190,7 +1190,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1198,7 +1198,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1235,8 +1235,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1248,17 +1248,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1268,7 +1268,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1278,7 +1278,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1288,7 +1288,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1301,7 +1301,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand \details (This is the right-looking Level 3 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1331,8 +1331,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1344,17 +1344,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1365,7 +1365,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1376,7 +1376,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1387,7 +1387,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1406,7 +1406,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - where R is upper triangular (upper trapezoidal if m < n), and Q is + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1414,8 +1414,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1428,30 +1428,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). 
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GEQR2_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1464,7 +1464,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, A_j = Q_j * [ R_j ] [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1473,7 +1473,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1486,19 +1486,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1507,22 +1507,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQR2_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1533,9 +1533,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1544,7 +1544,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1557,23 +1557,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1582,24 +1582,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2 computes a LQ factorization of a general m-by-n matrix A. @@ -1610,8 +1610,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -1619,8 +1619,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1633,30 +1633,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i).
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQ2_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1666,9 +1666,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1677,7 +1677,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1690,19 +1690,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1711,22 +1711,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1736,9 +1736,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1747,7 +1747,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1760,23 +1760,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1785,24 +1785,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); @@ -1815,8 +1815,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - - where R is upper triangular (upper trapezoidal if m < n), and Q is + + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1824,8 +1824,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1838,30 +1838,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! 
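For orientation, here is a minimal, illustrative sketch of calling the unbatched QR factorization declared just above (the LQ routines further down follow the same calling pattern); it assumes the rocsolver.h header name and omits error checking and data transfers.

/* Illustrative sketch only -- assumes the dgeqrf signature declared just above:
   (handle, m, n, A, lda, ipiv), with A and ipiv resident on the GPU. */
#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* header name assumed */

void qr_factor(rocsolver_handle handle, rocsolver_int m, rocsolver_int n)
{
    const rocsolver_int lda  = m;
    const rocsolver_int kmin = (m < n) ? m : n;

    double *dA, *dTau;
    hipMalloc(&dA,   sizeof(double) * lda * n);
    hipMalloc(&dTau, sizeof(double) * kmin);   /* Householder scalars, length min(m,n) */

    /* ... copy A into dA ... */

    rocsolver_dgeqrf(handle, m, n, dA, lda, dTau);

    /* On exit dA holds R on and above the diagonal and the Householder
       vectors v(i) below it, as described in the comment above. */
    hipFree(dA); hipFree(dTau);
}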
\brief GEQRF_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1872,9 +1872,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1883,7 +1883,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1896,19 +1896,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1917,22 +1917,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQRF_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. 
@@ -1943,9 +1943,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1954,7 +1954,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1967,23 +1967,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1992,24 +1992,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! 
\brief GELQF computes a LQ factorization of a general m-by-n matrix A. @@ -2020,8 +2020,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -2029,8 +2029,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2043,30 +2043,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQF_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2076,9 +2076,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2087,7 +2087,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2100,19 +2100,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored.
- On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2121,22 +2121,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQF_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2146,9 +2146,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2157,7 +2157,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2170,23 +2170,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. 
+ Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2195,46 +2195,46 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GETRS solves a system of n linear equations on n variables using the LU factorization computed by GETRF. \details - It solves one of the following systems: + It solves one of the following systems: - A * X = B (no transpose), - A' * X = B (transpose), or + A * X = B (no transpose), + A' * X = B (transpose), or A* * X = B (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations. + Specifies the form of the system of equations. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of A. + The order of the system, i.e. the number of columns and rows of A. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2244,7 +2244,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_han The factors L and U of the factorization A = P*L*U returned by GETRF. @param[in] lda rocsolver_int. lda >= n.\n - The leading dimension of A. + The leading dimension of A. @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension n.\n The pivot indices returned by GETRF. 
@@ -2278,26 +2278,26 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int *ipiv, rocblas_double_complex *B, const rocsolver_int ldb); -/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables +/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2312,7 +2312,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n Contains the vectors ipiv_j of pivot indices returned by GETRF_BATCHED. @param[in,out] - B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n + B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n On entry, the right hand side matrices B_j. On exit, the solution matrix X_j of each system in the batch. @param[in] @@ -2320,7 +2320,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( The leading dimension of matrices B_j. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2337,35 +2337,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); -/*! \brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables +/*! 
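To tie GETRF and GETRS together, here is a minimal, illustrative sketch of solving A*X = B on the GPU with the unbatched routines; the batched and strided-batched variants documented above follow the same pattern with the extra stride/batch arguments, and the helper name and wrapping are hypothetical.

/* Illustrative sketch only -- assumes the unbatched getrs signature shown above:
   (handle, trans, n, nrhs, A, lda, ipiv, B, ldb), all arrays on the GPU. */
void lu_solve(rocsolver_handle handle, rocsolver_int n, rocsolver_int nrhs,
              double *dA, rocsolver_int lda,   /* LU factors returned by GETRF      */
              rocsolver_int *dIpiv,            /* pivot indices returned by GETRF   */
              double *dB, rocsolver_int ldb)   /* RHS on entry, solution X on exit  */
{
    /* Solve A * X = B; pass rocblas_operation_transpose for A' * X = B instead. */
    rocsolver_dgetrs(handle, rocblas_operation_none, n, nrhs,
                     dA, lda, dIpiv, dB, ldb);
}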
\brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_STRIDED_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2378,7 +2378,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices A_j. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[in] ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n @@ -2392,11 +2392,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices B_j. @param[in] strideB rocsolver_int.\n - Stride from the start of one matrix B_j and the next one B_(j+1). + Stride from the start of one matrix B_j and the next one B_(j+1). There is no restriction for the value of strideB. Normal use case is strideB >= ldb*nrhs. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2413,13 +2413,13 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_strided_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); @@ -2427,7 +2427,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( positive definite matrix A. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). 
The factorization has the form: @@ -2453,8 +2453,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, successful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2472,11 +2472,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTF2_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2496,24 +2496,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2521,17 +2521,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2551,28 +2551,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored.
On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2581,7 +2581,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2592,7 +2592,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han positive definite matrix A. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization has the form: @@ -2618,8 +2618,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, successful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2637,11 +2637,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTRF_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2661,24 +2661,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i.
- If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2686,17 +2686,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2716,28 +2716,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch.
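For completeness, a minimal, illustrative sketch of the unbatched Cholesky call documented earlier in this family; uplo selects which triangle of A is referenced, a nonzero info reports the first non-positive-definite leading minor, and the header name is assumed.

/* Illustrative sketch only -- assumes the unbatched potrf signature shown above:
   (handle, uplo, n, A, lda, info), with A and info resident on the GPU. */
#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* header name assumed */

void cholesky_factor(rocsolver_handle handle, rocsolver_int n)
{
    const rocsolver_int lda = n;
    double      *dA;
    rocblas_int *dInfo;
    hipMalloc(&dA,    sizeof(double) * lda * n);
    hipMalloc(&dInfo, sizeof(rocblas_int));

    /* ... copy the symmetric positive definite matrix A into dA ... */

    rocsolver_dpotrf(handle, rocblas_fill_lower, n, dA, lda, dInfo);

    /* *dInfo == 0 on success; *dInfo == i > 0 means the leading minor of
       order i is not positive definite, as described above. */
    hipFree(dA); hipFree(dInfo);
}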
********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2746,7 +2746,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h index 55d3e42a..e8cf8251 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h +++ b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h @@ -11,8 +11,8 @@ #include -/*! \brief Used to specify int32 or int64. - \details rocsolver_int is a rocblas_int +/*! \brief Used to specify int32 or int64. + \details rocsolver_int is a rocblas_int ******************************************************************/ typedef rocblas_int rocsolver_int; @@ -20,12 +20,12 @@ typedef rocblas_float_complex rocsolver_float_complex; typedef rocblas_double_complex rocsolver_double_complex; typedef rocblas_half rocsolver_half; -/*! \brief A structure holding the rocsolver library context. - \details +/*! \brief A structure holding the rocsolver library context. + \details It must be initialized using rocsolver_create_handle() - and the returned handle must be passed to all subsequent library + and the returned handle must be passed to all subsequent library function calls. It should be destroyed at the end using rocsolver_destroy_handle().\n - rocsolver_handle is a rocblas_handle. + rocsolver_handle is a rocblas_handle. *************************************************************************/ typedef rocblas_handle rocsolver_handle; @@ -56,16 +56,16 @@ typedef rocblas_status rocsolver_status; typedef rocblas_layer_mode rocsolver_layer_mode; -/*! \brief Used to specify the order in which multiple elementary matrices are applied together - ********************************************************************************/ +/*! \brief Used to specify the order in which multiple elementary matrices are applied together + ********************************************************************************/ typedef enum rocsolver_direct_ { rocsolver_forward_direction = 171, /**< Elementary matrices applied from the right. */ rocsolver_backward_direction = 172, /**< Elementary matrices applied from the left. */ } rocsolver_direct; -/*! \brief Used to specify how householder vectors are stored in a matrix of vectors - ********************************************************************************/ +/*! \brief Used to specify how householder vectors are stored in a matrix of vectors + ********************************************************************************/ typedef enum rocsolver_storev_ { rocsolver_column_wise = 181, /**< Householder vectors are stored in the columns of a matrix. 
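Since every routine in this header takes the library context as its first argument, a short, illustrative sketch of the handle lifecycle described in the comment above; the pointer out-parameter convention of rocsolver_create_handle is an assumption here, and error handling is reduced to a single check.

/* Illustrative sketch only -- rocsolver_create_handle/rocsolver_destroy_handle
   are the entry points named in the comment above; the &handle out-parameter
   convention is assumed. */
#include <rocsolver.h>   /* header name assumed */

int run(void)
{
    rocsolver_handle handle;
    rocsolver_status status = rocsolver_create_handle(&handle);
    if (status != rocblas_status_success)
        return -1;                     /* rocsolver_status is a rocblas_status */

    /* ... pass 'handle' to the factorization and solver calls above ... */

    rocsolver_destroy_handle(handle);
    return 0;
}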
*/ diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt b/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt index cbf3d10d..4a435950 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt @@ -82,7 +82,7 @@ add_library( rocsolver ${rocsolver_lapack_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} - ${rocsolver_common_source} + ${rocsolver_common_source} ) add_library( roc::rocsolver ALIAS rocsolver ) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp index 9c52fd62..8c4e0c70 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_larf.hpp" template -rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, T* x, const rocsolver_int incx, const T* alpha, T* A, const rocsolver_int lda) { @@ -24,7 +24,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side rocblas_int stridep = 0; rocblas_int batch_count=1; - return rocsolver_larf_template(handle,side, + return rocsolver_larf_template(handle,side, m,n, x,0, //vector shifted 0 entries incx, @@ -33,7 +33,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side stridep, A,0, //matrix shifted 0 entries lda, - stridea, + stridea, batch_count); } @@ -46,14 +46,14 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side extern "C" { -ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, float* x, const rocsolver_int incx, const float* alpha, float* A, const rocsolver_int lda) { return rocsolver_larf_impl(handle, side, m, n, x, incx, alpha, A, lda); } -ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, double* x, const rocsolver_int incx, const double* alpha, double* A, const rocsolver_int lda) { diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp index 27a5a0d4..3755ea14 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp @@ -19,8 +19,8 @@ template rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, - const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, + const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, + const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, const rocsolver_int lda, const 
rocblas_int stridea, const rocblas_int batch_count) { // quick return @@ -40,7 +40,7 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ T* zeroInt; //constant 0 in device hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -66,16 +66,16 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ // OF A AND X, AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + //memory in GPU (workspace) T *workvec; hipMalloc(&workvec, sizeof(T)*order*batch_count); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute the matrix vector product (W=tau*A'*X or W=tau*A*X) for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp index 12ed4e92..d28b4a03 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp @@ -5,10 +5,10 @@ #include "rocauxiliary_larfb.hpp" template -rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* F, const rocsolver_int ldf, T* A, const rocsolver_int lda) { @@ -22,7 +22,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid if (storev == rocsolver_row_wise) { if (ldv < k) return rocblas_status_invalid_size; - } else { + } else { if ((side == rocblas_side_left && ldv < m) || (side == rocblas_side_right && ldv < n)) return rocblas_status_invalid_size; } @@ -34,7 +34,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid rocblas_int stridef = 0; rocblas_int batch_count=1; - return rocsolver_larfb_template(handle,side,trans,direct,storev, + return rocsolver_larfb_template(handle,side,trans,direct,storev, m,n,k, V,0, //shifted 0 entries ldv, @@ -44,7 +44,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid stridef, A,0, //shifted 0 entries lda, - stridea, + stridea, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp index 5214e29a..dc4ee469 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp @@ -19,7 +19,7 @@ template -__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int 
strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -38,7 +38,7 @@ __global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U } template -__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -52,18 +52,18 @@ __global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A Wp = work + b*strideW; Ap = load_ptr_batch(A,shiftA,b,strideA); - Ap[i + j*lda] -= Wp[i + j*ldw]; + Ap[i + j*lda] -= Wp[i + j*ldw]; } } template -rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, T *F, const rocsolver_int shiftF, - const rocsolver_int ldf, const rocsolver_int strideF, + const rocsolver_int ldf, const rocsolver_int strideF, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, const rocsolver_int batch_count) { @@ -100,14 +100,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver //determine the side, size of workspace //and whether V is trapezoidal - rocsolver_operation transp; + rocsolver_operation transp; rocsolver_fill uploV; bool trap; rocblas_int order, ldw; - bool colwise = (storev == rocsolver_column_wise); + bool colwise = (storev == rocsolver_column_wise); bool leftside = (side == rocblas_side_left); size_t offsetV; - + if (leftside) { order = n; ldw = k; @@ -120,16 +120,16 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver if (colwise) { uploV = rocblas_fill_lower; offsetV = idx2D(k,0,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_transpose; - else + else transp = rocblas_operation_none; } else { uploV = rocblas_fill_upper; offsetV = idx2D(0,k,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_none; - else + else transp = rocblas_operation_transpose; } @@ -146,15 +146,15 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver rocblas_int blocksx = (order - 1)/32 + 1; rocblas_int blocksy = (ldw - 1)/32 + 1; hipLaunchKernelGGL(copymatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + // BACKWARD DIRECTION TO BE IMPLEMENTED... 
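For reference, rocsolver_larfb_template applies a block Householder reflector H = I - V*T*V' to the matrix A; copymatA1 stages the A1 block into the workspace and addmatA1 folds the final correction back into A. The sketch below is a plain host-side illustration of the forward, column-wise, left-side case only, using naive loops on real double data and a hypothetical helper name; the actual routine dispatches on side/trans/direct/storev, uses rocBLAS gemm/trmm, and runs batched on the GPU.

    #include <cstddef>
    #include <vector>

    // Illustrative only: apply H = I - V * T * V^T from the left to an m x n
    // matrix A (all column-major, leading dimension = number of rows).
    // V is m x k (Householder vectors as columns), T is k x k upper triangular.
    static void apply_block_reflector_left(int m, int n, int k,
                                           const std::vector<double>& V,
                                           const std::vector<double>& T,
                                           std::vector<double>& A)
    {
        // W = V^T * A  (k x n): the "V1'*A1 + V2'*A2" accumulation the template builds in stages
        std::vector<double> W(static_cast<std::size_t>(k) * n, 0.0);
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < k; ++i)
                for (int p = 0; p < m; ++p)
                    W[i + j * k] += V[p + i * m] * A[p + j * m];

        // W <- T * W: the triangular factor applied with trmm in the template
        std::vector<double> TW(static_cast<std::size_t>(k) * n, 0.0);
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < k; ++i)
                for (int p = 0; p < k; ++p)
                    TW[i + j * k] += T[i + p * k] * W[p + j * k];

        // A <- A - V * (T * V^T * A): the gemm updates plus the addmatA1 correction
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i)
                for (int p = 0; p < k; ++p)
                    A[i + j * m] -= V[i + p * m] * TW[p + j * k];
    }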
rocsolver_fill uploT = rocblas_fill_upper; if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - + //compute: // V1' * A1, or - // or + // or // A1 * V1 for (int b=0;b(VV,shiftV,b,strideV); @@ -162,14 +162,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } // compute: - // V1' * A1 + V2' * A2 - // or + // V1' * A1 + V2' * A2 + // or // A1 * V1 + A2 * V2 - if (trap) { + if (trap) { for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,ldw,order,m-k,oneInt, (Vp + offsetV),ldv, (Ap + idx2D(k,0,lda)),lda, @@ -183,10 +183,10 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } - // compute: + // compute: // trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) + // (A1 * V1 + A2 * V2) * trans(T) for (int b=0;b(FF,shiftF,b,strideF); rocblas_trmm(handle,side,uploT,trans,rocblas_diagonal_non_unit,ldw,order,oneInt,Fp,ldf,(work + b*strideW),ldw); @@ -195,7 +195,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver // compute: // A2 - V2 * trans(T) * (V1' * A1 + V2' * A2) // or - // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' + // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' if (transp == rocblas_operation_transpose) transp = rocblas_operation_none; else @@ -205,7 +205,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,m-k,order,ldw,minoneInt, (Vp + offsetV),ldv, (work + b*strideW),ldw, @@ -218,22 +218,22 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } } - + // compute: // V1 * trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) * V1' + // (A1 * V1 + A2 * V2) * trans(T) * V1' for (int b=0;b(VV,shiftV,b,strideV); rocblas_trmm(handle,side,uploV,transp,rocblas_diagonal_unit,ldw,order,oneInt,Vp,ldv,(work + b*strideW),ldw); } - + // compute: // A1 - V1 * trans(T) * (V1' * A1 + V2' * A2) // or // A1 - (A1 * V1 + A2 * V2) * trans(T) * V1' hipLaunchKernelGGL(addmatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + hipFree(minoneInt); hipFree(oneInt); hipFree(work); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp index 4b1e00fa..8e651066 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp @@ -26,7 +26,7 @@ rocblas_status rocsolver_larfg_impl(rocblas_handle handle, const rocblas_int n, incx, stridex, tau, - strideP, + strideP, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp index f4fc193c..38683f5d 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp @@ -42,7 +42,7 @@ __global__ void set_taubeta(T *tau, const rocblas_int strideP, T *norms, U alpha template -rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int 
shifta, +rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, U x, const rocblas_int shiftx, const rocblas_int incx, const rocblas_int stridex, T *tau, const rocblas_int strideP, const rocblas_int batch_count) { @@ -54,11 +54,11 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int hipStream_t stream; rocblas_get_stream(handle, &stream); dim3 gridReset(1, batch_count, 1); - dim3 threads(1, 1, 1); + dim3 threads(1, 1, 1); if (n == 1) { hipLaunchKernelGGL(reset_batch_info,gridReset,threads,0,stream,tau,strideP,1,0); - return rocblas_status_success; - } + return rocblas_status_success; + } T *xp; @@ -73,12 +73,12 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *norms; - hipMalloc(&norms, sizeof(T)*batch_count); + hipMalloc(&norms, sizeof(T)*batch_count); // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute norm of x for (int b=0;b(xx,shiftx,b,stridex); @@ -87,9 +87,9 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //set value of tau and beta and scalling factor for vector x //alpha <- beta - //norms <- scalling + //norms <- scalling hipLaunchKernelGGL(set_taubeta,dim3(batch_count),dim3(1),0,stream,tau,strideP,norms,alpha,shifta,stridex); - + //compute vector v=x*norms for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp index 5ab79a92..10915015 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_larft.hpp" template -rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, const rocsolver_int n, +rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* tau, T* F, const rocsolver_int ldf) { @@ -38,7 +38,7 @@ rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_dir stridet, F, ldf, - stridef, + stridef, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp index ee2add09..8a38ac3f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp @@ -17,8 +17,8 @@ #include "common_device.hpp" template -__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, - T* tau, const rocsolver_int strideT, +__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, + T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_storev storev) { const auto blocksize = hipBlockDim_x; @@ -51,20 +51,20 @@ __global__ void set_tau(const rocsolver_int k, T* tau, const rocsolver_int strid const auto blocksize = 
hipBlockDim_x; const auto b = hipBlockIdx_x; const auto i = hipBlockIdx_y * blocksize + hipThreadIdx_x; - + if (i < k) { T *tp; tp = tau + b*strideT; tp[i] = -tp[i]; } } - + template -rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, +rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, - const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_int batch_count) { // quick return @@ -84,7 +84,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipMemcpy(oneInt, &one, sizeof(T), hipMemcpyHostToDevice); hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -98,26 +98,26 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - //Fix diagonal of T, make zero the non used triangular part, + //Fix diagonal of T, make zero the non used triangular part, //setup tau (changing signs) and account for the non-stored 1's on the householder vectors rocblas_int blocks = (k - 1)/32 + 1; hipLaunchKernelGGL(set_triangular,dim3(blocks,blocks,batch_count),dim3(32,32),0,stream, k,V,shiftV,ldv,strideV,tau,strideT,F,ldf,strideF,storev); hipLaunchKernelGGL(set_tau,dim3(batch_count,blocks),dim3(32,1),0,stream,k,tau,strideT); - // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS + // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS // AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - - rocblas_operation trans; - - for (int i = 1; i < k; ++i) { + rocblas_operation trans; + + + for (int i = 1; i < k; ++i) { //compute the matrix vector product, using the householder vectors for (int b=0;b(VV,shiftV,b,strideV); Fp = F + b*strideF; - rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, + rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, (Fp + idx2D(0,i,ldf)), 1, zeroInt, (Fp + idx2D(0,i,ldf)), 1); - } + } } //restore tau @@ -151,7 +151,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipFree(oneInt); hipFree(zeroInt); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp index e79f652f..360fef79 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp @@ -54,14 +54,14 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dlaswp(rocsolver_handle handle, const } ROCSOLVER_EXPORT rocblas_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); } ROCSOLVER_EXPORT rocblas_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp index 0dc74205..4615a7ec 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp @@ -51,10 +51,10 @@ __global__ void laswp_kernel(const rocblas_int n, U AA, const rocblas_int shiftA template rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int k1, const rocblas_int k2, - const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, + const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, const rocblas_int batch_count) { // quick return - if (n == 0 || !batch_count) + if (n == 0 || !batch_count) return rocblas_status_success; rocblas_int start, end, inc; @@ -63,7 +63,7 @@ rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int end = k1 - 1; inc = -1; incx = -incx; - } + } else { start = k1; end = k2 + 1; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp index 102fd83e..465b3635 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp +++ 
b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_org2r.hpp" template -rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp index 08d072aa..2dbcc11e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j > i) + else if (j > i) Ap[i + j*lda] = 0.0; else if (j >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j,j+1,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th column -corresponding to H(i)- if (j < m - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), - (M + idx2D(j + 1, j, lda)), 1); - } + rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), + (M + idx2D(j + 1, j, lda)), 1); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp index bd3e4714..eb4f0bb6 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orgbr.hpp" template -rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp index a1315b6e..deec30a8 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp @@ -23,7 +23,7 @@ #define BS 32 //blocksize for kernels template -__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -33,17 +33,17 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && j <= i) { rocblas_int offset = j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy columns - Wp[i + j*ldw - offset] = (j == 0 ? 0.0 : Ap[i+1 + (j-1)*lda]); - + Wp[i + j*ldw - offset] = (j == 0 ? 
0.0 : Ap[i+1 + (j-1)*lda]); + } else { - // shift columns to the right + // shift columns to the right Ap[i+1 + j*lda] = Wp[i + j*ldw - offset]; - + // make first row the identity if (i == j) { Ap[(j+1)*lda] = 0.0; @@ -55,7 +55,7 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const } template -__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -65,17 +65,17 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && i <= j) { rocblas_int offset = j*ldw - j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy rows - Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); - + Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); + } else { - // shift rows downward + // shift rows downward Ap[i + (j+1)*lda] = Wp[i + j*ldw - offset]; - + // make first column the identity if (i == j) { Ap[i+1] = 0.0; @@ -87,9 +87,9 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const } template -rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -99,11 +99,11 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization + // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization // of a m-by-k matrix A (given by gebrd) if (storev == rocsolver_column_wise) { if (m >= k) { - rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); + rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); } else { // shift the householder vectors provided by gebrd as they come below the first subdiagonal // workspace @@ -115,21 +115,21 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (m - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); - + 
hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + // result - rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); + hipFree(W); - } + } } - - // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization + + // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization // of a k-by-n matrix A (given by gebrd) else { if (n > k) { @@ -145,19 +145,19 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (n - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // result rocsolver_orglq_template(handle, n-1, n-1, n-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + hipFree(W); } - } + } return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp index 27e3d8ed..ec38dc16 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgl2.hpp" template -rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp index 202a4fc3..35475070 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j < i) + else if (j < i) Ap[i + j*lda] = 0.0; else if (i >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const 
rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j+1,j,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th row -corresponding to H(i)- if (j < n - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), - (M + idx2D(j, j + 1, lda)), lda); - } + rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), + (M + idx2D(j, j + 1, lda)), lda); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp index 35b17482..e3039734 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orglq.hpp" template -rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp index 97886fce..39f77a46 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp @@ -32,16 +32,16 @@ __global__ void set_zero_row(const rocblas_int m, const rocblas_int kk, U A, if (i < m && j < kk) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int 
strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -50,9 +50,9 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_orgl2_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -64,34 +64,34 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding left submatrix if (kk < m) { blocksx = (m - kk - 1)/32 + 1; blocksy = (kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, m,kk,A,shiftA,lda,strideA); - - rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < m) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -110,13 +110,13 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_orgl2_template(handle, jb, n - j, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_orgl2_template(handle, jb, n - j, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp index ef11bd5e..7b1aceec 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgqr.hpp" template -rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp index 86386317..8079413c 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp @@ 
-32,15 +32,15 @@ __global__ void set_zero_col(const rocblas_int n, const rocblas_int kk, U A, if (i < kk && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -49,9 +49,9 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_org2r_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -63,34 +63,34 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding top submatrix if (kk < n) { blocksx = (kk - 1)/32 + 1; blocksy = (n- kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, n,kk,A,shiftA,lda,strideA); - - rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < n) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -109,13 +109,13 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_org2r_template(handle, m - j, jb, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_org2r_template(handle, m - j, jb, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp index 34ee185b..fdaa1724 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orm2r.hpp" template 
-rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp index 10522f08..dd83c375 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp @@ -18,10 +18,10 @@ #include "../auxiliary/rocauxiliary_larf.hpp" template -rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -72,14 +72,14 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver ncol = n - i; jc = i; } - - // insert one in A(i,i) tobuild/apply the householder matrix + + // insert one in A(i,i) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); - // Apply current Householder reflector + // Apply current Householder reflector rocsolver_larf_template(handle,side, //side nrow, //number of rows of matrix to modify - ncol, //number of columns of matrix to modify + ncol, //number of columns of matrix to modify A, shiftA + idx2D(i,i,lda), //householder vector x 1, strideA, //inc of x (ipiv + i), strideP, //householder scalar (alpha) @@ -90,7 +90,7 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver // restore original value of A(i,i) hipLaunchKernelGGL(restore_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp index 7d11d5e6..820f4a46 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_ormqr.hpp" template -rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + 
const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp index fd0b523c..b24d77cd 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp @@ -20,10 +20,10 @@ #include "../auxiliary/rocauxiliary_larft.hpp" template -rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -35,14 +35,14 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked variant of the algorithm - if (k <= ORMQR_ORM2R_BLOCKSIZE) + if (k <= ORMQR_ORM2R_BLOCKSIZE) return rocsolver_orm2r_template(handle, side, trans, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, C, shiftC, ldc, strideC, batch_count); //memory in GPU (workspace) T* work; rocblas_int ldw = ORMQR_ORM2R_BLOCKSIZE; rocblas_int strideW = ldw *ldw; - hipMalloc(&work, sizeof(T)*strideW*batch_count); + hipMalloc(&work, sizeof(T)*strideW*batch_count); // determine limits and indices bool left = (side == rocblas_side_left); @@ -100,7 +100,7 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver C, shiftC + idx2D(ic,jc,ldc),ldc,strideC, batch_count); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp index 2d57c7d9..65dd0697 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp @@ -104,7 +104,7 @@ rocblas_status rocblas_iamax(rocblas_handle handle, rocblas_int n, return rocblas_izamax(handle, n, x, incx, result); } -//ger +//ger template <> rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp index 1aaaab61..d28acb79 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp @@ -36,16 +36,16 @@ __forceinline__ __device__ __host__ T* load_ptr_batch(T *const p[], rocblas_int } template -__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) +__forceinline__ __global__ void get_array(T** out, T* in, 
rocblas_int stride, rocblas_int batch) { int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - + if (b < batch) out[b] = in + b*stride; } template -__forceinline__ __global__ void setdiag(const rocblas_int j, U A, +__forceinline__ __global__ void setdiag(const rocblas_int j, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, T *ipiv, const rocblas_int strideP) { @@ -54,7 +54,7 @@ __forceinline__ __global__ void setdiag(const rocblas_int j, U A, T *tau = ipiv + b*strideP; T t = -tau[j]; - tau[j] = t; + tau[j] = t; Ap[j + j*lda] = 1.0 + t; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp index 5d9cf574..260d9d1f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp @@ -8,7 +8,7 @@ // IDEAL SIZES ARE DEFINED FOR NOW AS IN CPU-LAPACK // BENCHMARKING OF ROCSOLVER WILL BE NEEDED TO DETERMINE -// MORE SUITABLE VALUES +// MORE SUITABLE VALUES diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp index 185d1690..b7e34f6b 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp @@ -1,24 +1,24 @@ -/* ************************************************************************ - * Copyright 2019-2020 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP -#define GUARD_ROCBLAS_MANAGE_PTR_HPP - -#include - -namespace rocsolver { -// device_malloc wraps hipMalloc and provides same API as malloc -static void *device_malloc(size_t byte_size) { - void *pointer; - PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); - return pointer; -} - -// device_free wraps hipFree and provides same API as free -static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } -} // namespace rocsolver - -using rocsolver_unique_ptr = std::unique_ptr; - -#endif +/* ************************************************************************ + * Copyright 2019-2020 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP +#define GUARD_ROCBLAS_MANAGE_PTR_HPP + +#include + +namespace rocsolver { +// device_malloc wraps hipMalloc and provides same API as malloc +static void *device_malloc(size_t byte_size) { + void *pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } +} // namespace rocsolver + +using rocsolver_unique_ptr = std::unique_ptr; + +#endif diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp index d412d69a..f5f6d466 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp index 29c4266f..81ec19ae 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on row j @@ -45,18 +45,18 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int n - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(j,min(j+1,n-1),lda), //vector x to work on - lda, strideA, //inc of x + lda, strideA, //inc of x (ipiv + j), 
strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the right + // Apply Householder reflector to the rest of matrix from the right if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp index 027572df..35fe7af5 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp index 9eefcb03..569facbb 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
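The gelq2 path above generates one Householder reflector per row (larfg) and applies it from the right to the rows below it (larf). As a compact host-side reference for what one such sweep computes, assuming real double data, naive loops, and a hypothetical function name rather than the library API, the unblocked LQ step can be written as:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Illustrative only: unblocked LQ of an m x n column-major matrix A (lda = m).
    // For each row j it builds a reflector from A(j, j:n-1) and applies it from
    // the right to the remaining rows, mirroring the gelq2 loop above.
    static void gelq2_reference(int m, int n, std::vector<double>& A, int lda,
                                std::vector<double>& tau)
    {
        auto at = [&](int i, int j) -> double& {
            return A[i + static_cast<std::size_t>(j) * lda];
        };
        const int dim = std::min(m, n);
        tau.assign(dim, 0.0);

        for (int j = 0; j < dim; ++j) {
            // larfg on the row: alpha = A(j,j), x = A(j, j+1:n-1)
            double alpha = at(j, j), xnorm = 0.0;
            for (int c = j + 1; c < n; ++c) xnorm += at(j, c) * at(j, c);
            xnorm = std::sqrt(xnorm);
            if (xnorm == 0.0) { tau[j] = 0.0; continue; }
            const double beta = -std::copysign(std::sqrt(alpha * alpha + xnorm * xnorm), alpha);
            tau[j] = (beta - alpha) / beta;
            for (int c = j + 1; c < n; ++c) at(j, c) /= (alpha - beta);
            at(j, j) = beta;

            // larf from the right on rows j+1..m-1, with v = [1, A(j, j+1:n-1)]
            for (int i = j + 1; i < m; ++i) {
                double w = at(i, j);                     // v[0] == 1
                for (int c = j + 1; c < n; ++c) w += at(i, c) * at(j, c);
                at(i, j) -= tau[j] * w;
                for (int c = j + 1; c < n; ++c) at(i, c) -= tau[j] * w * at(j, c);
            }
        }
    }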
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp index a29c5b0f..f75a0da7 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
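For orientation while reading the gelq2/gelqf hunks around this point: a minimal host-side sketch of how the rocsolver_dgelqf entry point (whose wrapper appears in the next hunk) might be called on a single device-resident matrix. The helper name lq_factor_sketch, the buffer names dA/dIpiv, and the <rocsolver.h> include are illustrative assumptions, and all error checking is elided.

  #include <hip/hip_runtime.h>
  #include <rocsolver.h> // assumed public rocSOLVER header; adjust to the installed layout

  // Sketch only: LQ-factorize one m x n double matrix stored column-major on the GPU.
  // On return dA holds L plus the Householder vectors, and dIpiv holds the
  // min(m,n) Householder scalars (the "ipiv"/tau argument of the wrappers below).
  void lq_factor_sketch(rocblas_handle handle, rocblas_int m, rocblas_int n,
                        double* dA, rocblas_int lda, double* dIpiv)
  {
      rocblas_status st = rocsolver_dgelqf(handle, m, n, dA, lda, dIpiv);
      (void)st; // a real caller would verify st == rocblas_status_success
  }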
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp index b0e15bef..d40b9dd5 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_gelq2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of rows in the block rocsolver_gelq2_template(handle, jb, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < m) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -76,9 +76,9 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_gelq2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp index 91631008..cee74932 100644 --- 
a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp index 13e0312f..a5581819 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp index 0cae47b0..249784a0 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp index 668fc8a0..485550d7 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on column j @@ -45,18 +45,18 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(min(j+1,m-1),j,lda), //vector x to work on - 1, strideA, //inc of x + 1, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the left + // Apply Householder reflector to the rest of matrix from the left if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp index ef67a2eb..70e765e8 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas_int m, const 
rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp index 26816634..e468de7e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp index d941c762..b91aa412 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp index fcdb4935..e1a3adaf 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_geqr2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of columns in the block rocsolver_geqr2_template(handle, m-j, jb, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < n) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -75,9 +75,9 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_geqr2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp index 3ae16e6a..41bb01e6 100644 --- 
a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp index b3e3809d..bd670e1f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
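As a hedged usage note for the geqrf wrappers shown above (the helper name and the buffers dA/dTau are placeholders, and no error handling is shown): the blocked QR entry point takes the same arguments as the unblocked geqr2 one, and the *_strided_batched variants only add strideA, the ipiv stride, and batch_count.

  #include <rocsolver.h> // assumed public rocSOLVER header

  // Sketch only: QR-factorize one m x n double matrix resident on the GPU.
  // dA is overwritten with R and the Householder vectors; the min(m,n)
  // Householder scalars are written to dTau (the "ipiv" parameter above).
  rocblas_status qr_factor_sketch(rocblas_handle handle, rocblas_int m, rocblas_int n,
                                  double* dA, rocblas_int lda, double* dTau)
  {
      return rocsolver_dgeqrf(handle, m, n, dA, lda, dTau);
  }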
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp index 9b01a5af..d74da116 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int *ipiv, rocblas_int* info) -{ + rocblas_int *ipiv, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || lda < 1) @@ -41,25 +41,25 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp index 727a76c3..5630004e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp @@ -44,14 +44,14 @@ inline __global__ void getf2_check_singularity(U AA, const rocblas_int shiftA, c template rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -69,7 +69,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipMemcpy(minoneInt, &minone, sizeof(T), hipMemcpyHostToDevice); //pivoting info in device (to avoid continuous synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -84,7 +84,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int //info=0 (starting with a nonsingular matrix) hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,info,batch_count,0); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** @@ -93,7 +93,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int // find pivot. 
Use Fortran 1-based indexing for the ipiv array as iamax does that as well! for (int b=0;b(AA,shiftA,b,strideA); - rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, + rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, (ipiv + shiftP + b*strideP + j)); } @@ -101,14 +101,14 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipLaunchKernelGGL(getf2_check_singularity, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, ipiv, shiftP, strideP, j, lda, pivotGPU, info); - // Swap pivot row and j-th row + // Swap pivot row and j-th row rocsolver_laswp_template(handle, n, A, shiftA, lda, strideA, j+1, j+1, ipiv, shiftP, strideP, 1, batch_count); // Compute elements J+1:M of J'th column for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (pivotGPU + b), - (M + idx2D(j + 1, j, lda)), oneInt); + rocblas_scal(handle, (m-j-1), (pivotGPU + b), + (M + idx2D(j + 1, j, lda)), oneInt); } // update trailing submatrix @@ -116,7 +116,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int for (int b=0;b(AA,shiftA,b,strideA); rocblas_ger(handle, m - j - 1, n - j - 1, minoneInt, - (M + idx2D(j + 1, j, lda)), oneInt, + (M + idx2D(j + 1, j, lda)), oneInt, (M + idx2D(j, j + 1, lda)), lda, (M + idx2D(j + 1, j + 1, lda)), lda); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp index bd9e7240..462e932d 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,25 +40,25 @@ rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp index ccb2d252..b3ea05e9 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp @@ -7,19 +7,19 @@ template rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) return rocblas_status_invalid_size; - + return rocsolver_getf2_template(handle,m,n, A,0, //the matrix is shifted 0 entries (will work on the entire matrix) @@ -39,25 +39,25 @@ rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp index 4a1c1b91..9b3bdf70 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, rocblas_int *ipiv, rocblas_int* info) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -40,25 +40,25 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp index f19138bb..395fd187 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp @@ -41,13 +41,13 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int *info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) + if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) return rocsolver_getf2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. 
**** @@ -92,14 +92,14 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** for (int j = 0; j < dim; j += GETRF_GETF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GETRF_GETF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_getf2_template(handle, m - j, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, ipiv, shiftP + j, strideP, iinfo, batch_count); - + // adjust pivot indices and check singularity sizePivot = min(m - j, jb); //number of pivots in the block - blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; + blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; gridPivot = dim3(blocksPivot, batch_count, 1); hipLaunchKernelGGL(getrf_check_singularity, gridPivot, threads, 0, stream, sizePivot, j, ipiv, shiftP + j, strideP, iinfo, info); @@ -131,7 +131,7 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int (M + idx2D(j + jb, j + jb, lda)), lda); } } - } + } } hipFree(pivotGPU); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp index 5ed946d0..44317213 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m, - rocblas_int n, U A, rocblas_int lda, + rocblas_int n, U A, rocblas_int lda, rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -39,25 +39,25 @@ rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp index c1ef590b..35443146 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -36,25 +36,25 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp index 255e306c..435339c1 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T *A, const rocblas_int lda, - const rocblas_int *ipiv, T *B, const rocblas_int ldb) + const rocblas_int *ipiv, T *B, const rocblas_int ldb) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? 
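To make the getrf/getrs pairing in these hunks concrete, here is a minimal sketch of factoring one n x n double system and solving it for a single right-hand side, using the signatures shown above. dA, dB, dIpiv and dInfo are assumed to be valid device allocations, and every status should be checked in real code.

  #include <rocsolver.h> // assumed public rocSOLVER header

  // Sketch only: solve A*x = b on the GPU by LU factorization with partial pivoting.
  // Step 1: rocsolver_dgetrf factors dA in place; pivots go to dIpiv, singularity flag to dInfo.
  // Step 2: rocsolver_dgetrs reuses the factors to solve for the right-hand side in dB
  //         (dB is overwritten with the solution x).
  void lu_solve_sketch(rocblas_handle handle, rocblas_int n, double* dA, rocblas_int lda,
                       rocblas_int* dIpiv, rocblas_int* dInfo, double* dB, rocblas_int ldb)
  {
      rocsolver_dgetrf(handle, n, n, dA, lda, dIpiv, dInfo);
      rocsolver_dgetrs(handle, rocblas_operation_none, n, 1, dA, lda, dIpiv, dB, ldb);
  }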
- if (n < 0 || nrhs < 0 || lda < n || ldb < n) + if (n < 0 || nrhs < 0 || lda < n || ldb < n) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -45,7 +45,7 @@ rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operati extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, - const rocblas_int *ipiv, float *B, const rocblas_int ldb) + const rocblas_int *ipiv, float *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } @@ -53,21 +53,21 @@ rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const roc extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, - const rocblas_int *ipiv, double *B, const rocblas_int ldb) + const rocblas_int *ipiv, double *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_float_complex *A, const rocsolver_int lda, - const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) + const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp index 1209770f..e18816df 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp @@ -19,7 +19,7 @@ template rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int *ipiv, const rocblas_int strideP, U B, - const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { // quick return if (n == 0 || nrhs == 0 || batch_count == 0) { @@ -56,7 +56,7 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve L*X = B, overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, rocblas_diagonal_unit, n, nrhs, @@ -67,13 +67,13 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope trans, rocblas_diagonal_non_unit, n, nrhs, oneInt, Ap, lda, Bp, ldb); } - + } else { for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve U**T *X = B or U**H *X = B, 
overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_upper, trans, rocblas_diagonal_non_unit, n, nrhs, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp index dd2dbe6a..43d48ac5 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -44,7 +44,7 @@ rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } @@ -52,26 +52,26 @@ rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, c extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int 
*ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp index 49ced525..e42302d3 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -40,7 +40,7 @@ rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } @@ -48,26 +48,26 @@ rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return 
rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp index 1ed3f0ee..0127cbe0 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp @@ -5,14 +5,14 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp index 4e1c3c91..518d202e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp @@ -18,9 +18,9 @@ #include "common_device.hpp" #include "ideal_sizes.hpp" -template -__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, - const rocblas_int j, T *res, rocblas_int *info) +template +__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, + const rocblas_int j, T *res, rocblas_int *info) { int id = hipBlockIdx_x; @@ -45,10 +45,10 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; #ifdef batched @@ -70,7 +70,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //diagonal info in device (device memory workspace to avoid synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -95,7 +95,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(0, j, lda)), 1, (M + idx2D(0, j, lda)), 1, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + 
hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -103,9 +103,9 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_transpose, j, n - j - 1, - d_minone, (M + idx2D(0, j + 1, lda)), lda, + d_minone, (M + idx2D(0, j + 1, lda)), lda, (M + idx2D(0, j, lda)), 1, d_one, (M + idx2D(j, j + 1, lda)), lda); - } + } for (int b=0;b(AA,shiftA,b,strideA); rocblas_scal(handle, n - j - 1, (pivotGPU + b), @@ -122,7 +122,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(j, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -130,7 +130,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_none, n - j - 1, j, - d_minone, (M + idx2D(j + 1, 0, lda)), lda, + d_minone, (M + idx2D(j + 1, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, d_one, (M + idx2D(j + 1, j, lda)), 1); } for (int b=0;b -rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp index 4988f364..4e88e448 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp index e0512eed..b8be605f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp @@ -5,14 +5,14 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp index 1f1c6650..aef657d4 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp @@ -19,12 +19,12 @@ #include "ideal_sizes.hpp" #include "roclapack_potf2.hpp" -inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) +inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) { int id = hipBlockIdx_x; if (info[id] == 0 && iinfo[id] > 0) - info[id] = iinfo[id] + j; + info[id] = iinfo[id] + j; } template @@ -32,14 +32,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (n < POTRF_POTF2_SWITCHSIZE) + if (n < POTRF_POTF2_SWITCHSIZE) return rocsolver_potf2_template(handle, uplo, n, A, shiftA, lda, strideA, info, batch_count); #ifdef batched @@ -61,7 +61,7 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //info in device (device memory workspace to avoid synchronization with CPU) - rocblas_int *iinfo; + rocblas_int *iinfo; hipMalloc(&iinfo, sizeof(rocblas_int)*batch_count); hipStream_t stream; @@ -81,14 +81,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, if (uplo == rocblas_fill_upper) { // Compute the Cholesky factorization A = U'*U. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. 
hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_transpose, rocblas_operation_none, @@ -112,14 +112,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, } else { // Compute the Cholesky factorization A = L'*L. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_none, rocblas_operation_transpose, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp index 7ac5061e..06dda30c 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp @@ -6,15 +6,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp index 2e49ab4b..6c081fc4 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in b/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in index 970adc43..8b6304e0 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in @@ -1,6 +1,6 @@ @PACKAGE_INIT@ - + set_and_check(rocsolver_INCLUDE_DIR @PACKAGE_INCLUDE_INSTALL_DIR@) set_and_check(rocsolver_INCLUDE_DIRS @PACKAGE_INCLUDE_INSTALL_DIR@) diff --git a/ROCm_Libraries/rocSOLVER/docs/source/api.rst b/ROCm_Libraries/rocSOLVER/docs/source/api.rst index 4068d267..690a60a8 100644 --- a/ROCm_Libraries/rocSOLVER/docs/source/api.rst +++ b/ROCm_Libraries/rocSOLVER/docs/source/api.rst @@ -1,12 +1,12 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* rocSOLVER API - (Documentation in progress....) ************* -This section provides details of the rocSOLVER library API as of Release +This section provides details of the rocSOLVER library API as of Release `ROCm 2.10 `_. @@ -14,7 +14,7 @@ This section provides details of the rocSOLVER library API as of Release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -312,7 +312,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. rocSOLVER handle auxiliaries diff --git a/ROCm_Libraries/rocSOLVER/docs/source/index.rst b/ROCm_Libraries/rocSOLVER/docs/source/index.rst index 91296248..b586bf8e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/source/index.rst +++ b/ROCm_Libraries/rocSOLVER/docs/source/index.rst @@ -4,9 +4,9 @@ Welcome to rocSOLVER's documentation! ======================================= .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: - library + library api diff --git a/ROCm_Libraries/rocSOLVER/docs/source/library.rst b/ROCm_Libraries/rocSOLVER/docs/source/library.rst index 7bbf839d..202bd844 100644 --- a/ROCm_Libraries/rocSOLVER/docs/source/library.rst +++ b/ROCm_Libraries/rocSOLVER/docs/source/library.rst @@ -1,30 +1,30 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* Introduction ************* -An implementation of Lapack routines on top of AMD’s Radeon Open Compute Platform (ROCm) runtime and toolchains. -rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS -implementation for AMD’s latest discrete GPUs. More information about rocBLAS can be found +An implementation of Lapack routines on top of AMD's Radeon Open Compute Platform (ROCm) runtime and toolchains. +rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS +implementation for AMD's latest discrete GPUs. More information about rocBLAS can be found `here `_. Build and install =================== -rocSOLVER requires `cmake `_ -and `ROCm `_, including -`hip `_ and -`rocBLAS `_, to be installed. +rocSOLVER requires `cmake `_ +and `ROCm `_, including +`hip `_ and +`rocBLAS `_, to be installed. Once these requirements are satisfied, the following instructions will build and install rocSOLVER: .. 
code-block:: bash - + mkdir build && cd build CXX=/opt/rocm/bin/hcc cmake .. make @@ -33,56 +33,56 @@ instructions will build and install rocSOLVER: Brief description and functionality ==================================== -rocSolver Library is in early stages of active development. New features and functionality is being continuosly added. New -functionality is documented at each release of the ROCm platform. +rocSolver Library is in early stages of active development. New features and functionality is being continuosly added. New +functionality is documented at each release of the ROCm platform. The following table summarizes the LAPACK functionality implemented in rocSOLVER's last release. =============================== ====== ====== ============== ============== Lapack Auxiliary Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_laswp** x x x x -**rocsolver_larfg** x x +**rocsolver_laswp** x x x x +**rocsolver_larfg** x x **rocsolver_larft** x x **rocsolver_larf** x x -**rocsolver_larfb** x x -**rocsolver_org2r** x x -**rocsolver_orgqr** x x -**rocsolver_orgl2** x x -**rocsolver_orglq** x x -**rocsolver_orgbr** x x -**rocsolver_orm2r** x x -**rocsolver_ormqr** x x +**rocsolver_larfb** x x +**rocsolver_org2r** x x +**rocsolver_orgqr** x x +**rocsolver_orgl2** x x +**rocsolver_orglq** x x +**rocsolver_orgbr** x x +**rocsolver_orm2r** x x +**rocsolver_ormqr** x x =============================== ====== ====== ============== ============== =============================== ====== ====== ============== ============== Lapack Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_potf2** x x -rocsolver_potf2_batched x x -rocsolver_potf2_strided_batched x x -**rocsolver_potrf** x x -rocsolver_potrf_batched x x -rocsolver_potrf_strided_batched x x +**rocsolver_potf2** x x +rocsolver_potf2_batched x x +rocsolver_potf2_strided_batched x x +**rocsolver_potrf** x x +rocsolver_potrf_batched x x +rocsolver_potrf_strided_batched x x **rocsolver_getf2** x x x x rocsolver_getf2_batched x x x x rocsolver_getf2_strided_batched x x x x -**rocsolver_getrf** x x x x +**rocsolver_getrf** x x x x rocsolver_getrf_batched x x x x rocsolver_getrf_strided_batched x x x x -**rocsolver_geqr2** x x +**rocsolver_geqr2** x x rocsolver_geqr2_batched x x rocsolver_geqr2_strided_batched x x -**rocsolver_geqrf** x x -rocsolver_geqrf_batched x x +**rocsolver_geqrf** x x +rocsolver_geqrf_batched x x rocsolver_geqrf_strided_batched x x -**rocsolver_gelq2** x x +**rocsolver_gelq2** x x rocsolver_gelq2_batched x x rocsolver_gelq2_strided_batched x x -**rocsolver_gelqf** x x -rocsolver_gelqf_batched x x +**rocsolver_gelqf** x x +rocsolver_gelqf_batched x x rocsolver_gelqf_strided_batched x x -**rocsolver_getrs** x x x x +**rocsolver_getrs** x x x x rocsolver_getrs_batched x x x x rocsolver_getrs_strided_batched x x x x =============================== ====== ====== ============== ============== @@ -90,30 +90,30 @@ rocsolver_getrs_strided_batched x x x x Benchmarking and testing ========================== -Additionally, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. +Additionally, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. -On a normal installation, clients should be located in the directory **/build/clients/staging**. 
+On a normal installation, clients should be located in the directory **/build/clients/staging**. **rocsolver-test** executes a suite of `Google tests `_ (*gtest*) that verifies the correct -functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by +functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by `NETLib LAPACK `_ on the CPU. Calling the rocSOLVER gtest client with the --help flag .. code-block:: bash - + ./rocsolver-test --help -returns information on different flags that control the behavior of the gtests. +returns information on different flags that control the behavior of the gtests. **rocsolver-bench** allows to run any rocSOLVER function with random data of the specified dimensions; it compares the computed results, and provides basic -performance information (as for now, execution times). +performance information (as for now, execution times). -Similarly, +Similarly, .. code-block:: bash - + ./rocsolver-bench --help -returns information on how to use the rocSOLVER benchmark client. - +returns information on how to use the rocSOLVER benchmark client. + diff --git a/ROCm_Libraries/rocSOLVER/index.rst b/ROCm_Libraries/rocSOLVER/index.rst index 91296248..b586bf8e 100644 --- a/ROCm_Libraries/rocSOLVER/index.rst +++ b/ROCm_Libraries/rocSOLVER/index.rst @@ -4,9 +4,9 @@ Welcome to rocSOLVER's documentation! ======================================= .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: - library + library api diff --git a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h index cd388512..3fbbfaf4 100644 --- a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h +++ b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h @@ -42,7 +42,7 @@ extern "C" { n rocsolver_int. n >= 0.\n The number of columns of the matrix A. @param[inout] - A pointer to type. Array on the GPU of dimension lda*n. \n + A pointer to type. Array on the GPU of dimension lda*n. \n On entry, the matrix of column dimension n to which the row interchanges will be applied. On exit, the permuted matrix. @param[in] @@ -59,7 +59,7 @@ extern "C" { @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension at least k1 + (k2 - k1) * abs(incx).\n The vector of pivot indices. Only the elements in positions - k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. + k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. Elements of ipiv are considered 1-based. @param[in] incx rocsolver_int. incx != 0.\n @@ -67,92 +67,92 @@ extern "C" { is negative, the pivots are applied in reverse order. 
*************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, const rocsolver_int n, - float *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + float *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, const rocsolver_int n, - double *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + double *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_float_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_double_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -/*! \brief LARFG generates an orthogonal Householder reflector H of order n. +/*! \brief LARFG generates an orthogonal Householder reflector H of order n. \details Householder reflector H is such that - + H * [alpha] = [beta] [ x ] [ 0 ] - where x is an n-1 vector and alpha and beta are scalars. Matrix H can be + where x is an n-1 vector and alpha and beta are scalars. Matrix H can be generated as - + H = I - tau * [1] * [1 v'] [v] - with v an n-1 vector and tau a scalar. + with v an n-1 vector and tau a scalar. @param[in] handle rocsolver_handle @param[in] n rocsolver_int. n >= 0.\n - The order (size) of reflector H. + The order (size) of reflector H. @param[inout] alpha pointer to type. A scalar on the GPU.\n - On input the scalar alpha, + On input the scalar alpha, on output it is overwritten with beta. - @param[inout] + @param[inout] x pointer to type. Array on the GPU of size at least n-1.\n - On input it is the vector x, + On input it is the vector x, on output it is overwritten with vector v. @param[in] incx rocsolver_int. incx > 0.\n - The increment between consecutive elements of x. + The increment between consecutive elements of x. @param[out] tau pointer to type. A scalar on the GPU.\n The scalar tau. 
*************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, + const rocsolver_int n, float *alpha, - float *x, - const rocsolver_int incx, + float *x, + const rocsolver_int incx, float *tau); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, + const rocsolver_int n, double *alpha, - double *x, - const rocsolver_int incx, + double *x, + const rocsolver_int incx, double *tau); @@ -164,9 +164,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = H(1) * H(2) * ... * H(k) (forward direction), or H = H(k) * ... * H(2) * H(1) (backward direction) - depending on the value of direct. + depending on the value of direct. - The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. + The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. If storev is column-wise, then H = I - V * T * V' @@ -175,7 +175,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i). + where the i-th row of matrix V contains the Householder vector associated to H(i). @param[in] handle rocsolver_handle. @@ -188,10 +188,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n The order (size) of the block reflector. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] + @param[in] V pointer to type. Array on the GPU of size ldv*k if column-wise, or ldv*n if row-wise.\n The matrix of Householder vectors. @param[in] @@ -203,44 +203,44 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[out] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor. T is upper triangular is forward operation, otherwise it is lower triangular. - The rest of the array is not used. - @param[in] + The rest of the array is not used. + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. - **************************************************************************/ + **************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarft(rocsolver_handle handle, - const rocsolver_direct direct, + const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, float *tau, - float *T, - const rocsolver_int ldt); + float *T, + const rocsolver_int ldt); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_storev storev, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, double *tau, - double *T, - const rocsolver_int ldt); + double *T, + const rocsolver_int ldt); /*! \brief LARF applies a Householder reflector H to a general matrix A. \details The Householder reflector H, of order m (or n), is to be applied to a m-by-n matrix A - from the left (or the right). 
H is given by + from the left (or the right). H is given by H = I - alpha * x * x' - + where alpha is a scalar and x a Householder vector. H is never actually computed. @param[in] @@ -254,16 +254,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, Number of rows of A. @param[in] n rocsolver_int. n >= 0.\n - Number of columns of A. + Number of columns of A. @param[in] - x pointer to type. Array on the GPU of + x pointer to type. Array on the GPU of size at least (1 + (m-1)*abs(incx)) if left side, or at least (1 + (n-1)*abs(incx)) if right side.\n The Householder vector x. @param[in] incx rocsolver_int. incx != 0.\n - Increment between to consecutive elements of x. - If incx < 0, the elements of x are used in reverse order. + Increment between to consecutive elements of x. + If incx < 0, the elements of x are used in reverse order. @param[in] alpha pointer to type. A scalar on the GPU.\n If alpha = 0, then H = I (A will remain the same, x is never used) @@ -273,35 +273,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, H*A (or A*H). @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. - + Leading dimension of A. + *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - float* x, - const rocsolver_int incx, + const rocsolver_int n, + float* x, + const rocsolver_int incx, const float* alpha, - float* A, + float* A, const rocsolver_int lda); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - double* x, - const rocsolver_int incx, + const rocsolver_int n, + double* x, + const rocsolver_int incx, const double* alpha, - double* A, + double* A, const rocsolver_int lda); /*! \brief LARFB applies a block reflector H to a general m-by-n matrix A. \details - The block reflector H is applied in one of the following forms, depending on + The block reflector H is applied in one of the following forms, depending on the values of side and trans: H * A (No transpose from the left) @@ -322,7 +322,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. + where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. T is the associated triangular factor as computed by LARFT. @param[in] @@ -345,11 +345,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix A. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] - V pointer to type. Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, + @param[in] + V pointer to type. Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, or ldv*m if row-wise and applying from the left.\n The matrix of Householder vectors. 
@param[in] @@ -359,16 +359,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor of the block reflector. - @param[in] + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. @param[inout] A pointer to type. Array on the GPU of size lda*n.\n On input, the matrix A. On output it is overwritten with - H*A, A*H, H'*A, or A*H'. + H*A, A*H, H'*A, or A*H'. @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. + Leading dimension of A. ****************************************************************************/ @@ -376,31 +376,31 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, - float *T, + float *T, const rocsolver_int ldt, float *A, - const rocsolver_int lda); + const rocsolver_int lda); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, - double *T, + double *T, const rocsolver_int ldt, double *A, - const rocsolver_int lda); + const rocsolver_int lda); /*! \brief ORG2R generates a m-by-n Matrix Q with orthonormal columns. @@ -409,17 +409,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -433,7 +433,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. 
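The ORG2R/ORGQR descriptions above (and the LARFG section earlier in this header) repeatedly rely on the Householder reflector H = I - tau * [1; v] * [1; v]' that maps [alpha; x] to [beta; 0]. As a purely editorial aid, the following host-only sketch reproduces that construction with the conventional LAPACK formulas and verifies the mapping numerically; it calls no rocSOLVER API, and the helper name householder_ref is made up for illustration.

.. code-block:: cpp

   #include <cmath>
   #include <cstdio>
   #include <vector>

   // Reference generation of one reflector, mirroring the documented in/out
   // behaviour of LARFG: alpha is overwritten with beta, x with v, and tau is set.
   void householder_ref(double &alpha, std::vector<double> &x, double &tau) {
       double xnorm = 0.0;
       for (double xi : x) xnorm += xi * xi;
       xnorm = std::sqrt(xnorm);
       if (xnorm == 0.0) { tau = 0.0; return; }                 // H is the identity
       const double beta = -std::copysign(std::hypot(alpha, xnorm), alpha);
       tau = (beta - alpha) / beta;
       for (double &xi : x) xi /= (alpha - beta);               // v = x / (alpha - beta)
       alpha = beta;
   }

   int main() {
       double alpha = 3.0, tau = 0.0;
       std::vector<double> x = {1.0, -2.0, 2.0};                // entries below alpha
       const std::vector<double> x0 = x;
       const double alpha0 = alpha;
       householder_ref(alpha, x, tau);

       // Apply H = I - tau*u*u' with u = [1; v] to the original [alpha0; x0]:
       // the first entry should equal beta (now stored in alpha), the rest ~0.
       double udoty = alpha0;
       for (std::size_t i = 0; i < x.size(); ++i) udoty += x[i] * x0[i];
       std::printf("beta = %g, (H*y)[0] = %g\n", alpha, alpha0 - tau * udoty);
       for (std::size_t i = 0; i < x.size(); ++i)
           std::printf("(H*y)[%zu] = %g\n", i + 1, x0[i] - tau * udoty * x[i]);
       return 0;
   }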
@@ -442,16 +442,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -463,17 +463,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -487,7 +487,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. @@ -496,16 +496,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -517,17 +517,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -541,7 +541,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. 
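For the ORGQR generator declared in the preceding hunk, a minimal host-side call sequence looks roughly as follows. This is a sketch, not part of the patch: the header name rocsolver.h is assumed, the handle is created through the rocBLAS auxiliaries (the API notes above state that rocSOLVER types and handle auxiliaries are aliases of the rocBLAS ones), the device arrays are assumed to already hold the Householder vectors and scalar factors produced by the GEQRF routine referenced in the description, and error checking is omitted.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <rocsolver.h>            // assumed installed header name
   #include <vector>

   int main() {
       const rocsolver_int m = 4, n = 3, k = 3, lda = m;

       rocsolver_handle handle;      // alias of rocblas_handle
       rocblas_create_handle(&handle);

       double *dA, *dTau;
       hipMalloc(reinterpret_cast<void**>(&dA),   sizeof(double) * lda * n);
       hipMalloc(reinterpret_cast<void**>(&dTau), sizeof(double) * k);
       // ... dA and dTau are assumed to already contain a QR factorization here ...

       // Overwrite dA with the m-by-n matrix Q = H(1) * H(2) * ... * H(k).
       rocsolver_dorgqr(handle, m, n, k, dA, lda, dTau);

       std::vector<double> Q(lda * n);
       hipMemcpy(Q.data(), dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);

       hipFree(dA);
       hipFree(dTau);
       rocblas_destroy_handle(handle);
       return 0;
   }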
@@ -550,16 +550,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -572,17 +572,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -596,7 +596,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. @@ -605,16 +605,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -622,9 +622,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, /*! \brief ORGBR generates a m-by-n Matrix Q with orthonormal rows or columns. \details - If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first + If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) If m < k, Q is defined as the product of Householder reflectors of order m @@ -635,12 +635,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, first m rows of the product of k Householder reflectors of order n Q = H(k) * H(k-1) * ... * H(1) - + If n <= k, Q is defined as the product of Householder reflectors of order n Q = H(n-1) * H(n-2) * ... * H(1) - The Householder matrices H(i) are never stored, they are computed from its corresponding + The Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEBRD. 
@param[in] @@ -650,12 +650,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, Specifies whether to work column-wise or row-wise. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. If row-wise, then min(n,k) <= m <= n. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix Q. - If column-wise, then min(m,k) <= n <= m. + The number of colums of the matrix Q. + If column-wise, then min(m,k) <= n <= m. @param[in] k rocsolver_int. k >= 0.\n The number of columns (if storev is colum-wise) or rows (if row-wise) of the @@ -667,7 +667,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension min(m,k) if column-wise, or min(n,k) if row-wise.\n The scalar factors of the Householder matrices H(i) as returned by GEBRD. @@ -677,8 +677,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); @@ -686,8 +686,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -696,8 +696,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, \details (This is the unblocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -709,7 +709,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -726,10 +726,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -742,19 +742,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. 
+ ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -765,8 +765,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -777,8 +777,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, \details (This is the blocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -790,7 +790,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -807,10 +807,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -823,19 +823,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. + ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -846,8 +846,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -880,10 +880,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. 
Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -891,7 +891,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -900,14 +900,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. - + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -915,7 +915,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -923,7 +923,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -931,7 +931,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -968,8 +968,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -981,17 +981,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1001,7 +1001,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1011,7 +1011,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1021,7 +1021,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1034,7 +1034,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand \details (This is the right-looking Level 2 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1064,8 +1064,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1077,17 +1077,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1098,7 +1098,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1109,7 +1109,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1120,7 +1120,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1147,10 +1147,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -1158,7 +1158,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -1167,14 +1167,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. 
- + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1182,7 +1182,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1190,7 +1190,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1198,7 +1198,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1235,8 +1235,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1248,17 +1248,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
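The "array of pointers to type" argument used by the *_batched routines can be unfamiliar, so a short setup sketch follows. It assumes the usual rocBLAS-style convention that the pointer array itself must reside in device memory; the helper name is hypothetical and cleanup is omitted.

#include <hip/hip_runtime.h>
#include <stdlib.h>

/* Sketch: one lda*n device buffer per matrix A_i, plus a device-resident
 * copy of the pointer array (assumed rocBLAS-style convention).           */
double **make_batch_pointers(int lda, int n, int batch_count, double ***hA_out)
{
    double **hA = (double **)malloc(sizeof(double *) * batch_count);
    for (int i = 0; i < batch_count; ++i)
        hipMalloc((void **)&hA[i], sizeof(double) * (size_t)lda * n);

    double **dA;
    hipMalloc((void **)&dA, sizeof(double *) * batch_count);
    hipMemcpy(dA, hA, sizeof(double *) * batch_count, hipMemcpyHostToDevice);

    *hA_out = hA;   /* keep the host copy so each A_i can be freed later */
    return dA;      /* pass this as the A argument of *_batched routines */
}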
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1268,7 +1268,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1278,7 +1278,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1288,7 +1288,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1301,7 +1301,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand \details (This is the right-looking Level 3 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1331,8 +1331,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1344,17 +1344,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
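For the *_strided_batched variants, all matrices live in one contiguous allocation, and the strideA/strideP values described above determine where each instance starts. The following sketch (hypothetical helper, assumed rocsolver.h header) simply applies the "normal use case" values from the documentation.

#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* assumed public header name */

/* Sketch: allocate strided-batched storage; matrix i starts at
 * dA + i*strideA and its pivots at dIpiv + i*strideP.                     */
void alloc_strided_batch(int m, int n, int lda, int batch_count,
                         double **dA, rocsolver_int **dIpiv,
                         rocsolver_int *strideA, rocsolver_int *strideP)
{
    *strideA = lda * n;              /* normal use case: strideA >= lda*n     */
    *strideP = (m < n) ? m : n;      /* normal use case: strideP >= min(m,n)  */
    hipMalloc((void **)dA,    sizeof(double)        * (size_t)(*strideA) * batch_count);
    hipMalloc((void **)dIpiv, sizeof(rocsolver_int) * (size_t)(*strideP) * batch_count);
}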
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1365,7 +1365,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1376,7 +1376,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1387,7 +1387,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1406,7 +1406,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - where R is upper triangular (upper trapezoidal if m < n), and Q is + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1414,8 +1414,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1428,30 +1428,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). 
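Because R and the Householder vectors share the output array A, unpacking them on the host is a common follow-up step. The sketch below is illustrative only (not a library routine) and follows the storage scheme documented above: R on and above the diagonal, and the trailing m - i elements of each v(i) below it, with v(i)[i] = 1 implied.

#include <string.h>

/* Host-side sketch: split the factored column-major m-by-n array A
 * (leading dimension lda, copied back from the GPU) into R and the
 * Householder vectors V, with k = min(m,n).                               */
void unpack_qr(int m, int n, int lda, const double *A,
               double *R /* m-by-n */, double *V /* m-by-k */)
{
    int k = (m < n) ? m : n;
    memset(R, 0, sizeof(double) * (size_t)m * n);
    memset(V, 0, sizeof(double) * (size_t)m * k);
    for (int j = 0; j < n; ++j)
        for (int i = 0; i <= j && i < m; ++i)
            R[i + j * m] = A[i + j * lda];      /* on/above diagonal: R    */
    for (int j = 0; j < k; ++j) {
        V[j + j * m] = 1.0;                     /* v(j+1)[j+1] = 1          */
        for (int i = j + 1; i < m; ++i)
            V[i + j * m] = A[i + j * lda];      /* below diagonal: v(j+1)   */
    }
}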
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GEQR2_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1464,7 +1464,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, A_j = Q_j * [ R_j ] [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1473,7 +1473,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1486,19 +1486,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1507,22 +1507,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQR2_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1533,9 +1533,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1544,7 +1544,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1557,23 +1557,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1582,24 +1582,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2 computes a LQ factorization of a general m-by-n matrix A. @@ -1610,8 +1610,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -1619,8 +1619,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1633,30 +1633,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and delow the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). 
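A short call sketch for the unblocked LQ factorization may be useful here. It uses the GELQ2 parameter list shown below (handle, m, n, A, lda, ipiv); the helper name and the rocsolver.h header name are assumptions, and status checking is omitted.

#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* assumed public header name */

/* Sketch: LQ-factor a column-major m-by-n matrix already resident on the GPU. */
void lq_factor(rocsolver_handle handle, rocsolver_int m, rocsolver_int n,
               double *dA, rocsolver_int lda)
{
    rocsolver_int k = (m < n) ? m : n;
    double *dTau;                    /* scalar factors ipiv, length min(m,n) */
    hipMalloc((void **)&dTau, sizeof(double) * k);

    /* On exit dA holds L on/below the diagonal and the Householder
     * vectors v(i) above it, as described in the GELQ2 documentation.      */
    rocsolver_dgelq2(handle, m, n, dA, lda, dTau);

    hipFree(dTau);
}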
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQ2_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1666,9 +1666,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1677,7 +1677,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1690,19 +1690,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1711,22 +1711,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1736,9 +1736,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1747,7 +1747,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1760,23 +1760,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1785,24 +1785,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); @@ -1815,8 +1815,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - - where R is upper triangular (upper trapezoidal if m < n), and Q is + + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1824,8 +1824,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1838,30 +1838,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! 
\brief GEQRF_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1872,9 +1872,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1883,7 +1883,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1896,19 +1896,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1917,22 +1917,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQRF_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. 
@@ -1943,9 +1943,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1954,7 +1954,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1967,23 +1967,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1992,24 +1992,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! 
\brief GELQF computes a LQ factorization of a general m-by-n matrix A. @@ -2020,8 +2020,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -2029,8 +2029,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2043,30 +2043,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and delow the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQF_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2076,9 +2076,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2087,7 +2087,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2100,19 +2100,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. 
- On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2121,22 +2121,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQF_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2146,9 +2146,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2157,7 +2157,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2170,23 +2170,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. 
+ Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2195,46 +2195,46 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GETRS solves a system of n linear equations on n variables using the LU factorization computed by GETRF. \details - It solves one of the following systems: + It solves one of the following systems: - A * X = B (no transpose), - A' * X = B (transpose), or + A * X = B (no transpose), + A' * X = B (transpose), or A* * X = B (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations. + Specifies the form of the system of equations. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of A. + The order of the system, i.e. the number of columns and rows of A. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2244,7 +2244,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_han The factors L and U of the factorization A = P*L*U returned by GETRF. @param[in] lda rocsolver_int. lda >= n.\n - The leading dimension of A. + The leading dimension of A. @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension n.\n The pivot indices returned by GETRF. 
@@ -2278,26 +2278,26 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int *ipiv, rocblas_double_complex *B, const rocsolver_int ldb); -/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables +/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2312,7 +2312,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n Contains the vectors ipiv_j of pivot indices returned by GETRF_BATCHED. @param[in,out] - B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n + B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n On entry, the right hand side matrices B_j. On exit, the solution matrix X_j of each system in the batch. @param[in] @@ -2320,7 +2320,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( The leading dimension of matrices B_j. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2337,35 +2337,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); -/*! \brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables +/*! 
\brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_STRIDED_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2378,7 +2378,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices A_j. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[in] ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n @@ -2392,11 +2392,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices B_j. @param[in] strideB rocsolver_int.\n - Stride from the start of one matrix B_j and the next one B_(j+1). + Stride from the start of one matrix B_j and the next one B_(j+1). There is no restriction for the value of strideB. Normal use case is strideB >= ldb*nrhs. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2413,13 +2413,13 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_strided_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); @@ -2427,7 +2427,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( positive definite matrix A. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). 
The factorization has the form: @@ -2453,8 +2453,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, succesful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2472,11 +2472,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTF2_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2496,24 +2496,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2521,17 +2521,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2551,28 +2551,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. 
On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2581,7 +2581,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2592,7 +2592,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han positive definite matrix A. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization has the form: @@ -2618,8 +2618,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, succesful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2637,11 +2637,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTRF_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2661,24 +2661,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. 
- If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2686,17 +2686,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2716,28 +2716,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. 
********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2746,7 +2746,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, diff --git a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h index 55d3e42a..e8cf8251 100644 --- a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h +++ b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h @@ -11,8 +11,8 @@ #include -/*! \brief Used to specify int32 or int64. - \details rocsolver_int is a rocblas_int +/*! \brief Used to specify int32 or int64. + \details rocsolver_int is a rocblas_int ******************************************************************/ typedef rocblas_int rocsolver_int; @@ -20,12 +20,12 @@ typedef rocblas_float_complex rocsolver_float_complex; typedef rocblas_double_complex rocsolver_double_complex; typedef rocblas_half rocsolver_half; -/*! \brief A structure holding the rocsolver library context. - \details +/*! \brief A structure holding the rocsolver library context. + \details It must be initialized using rocsolver_create_handle() - and the returned handle must be passed to all subsequent library + and the returned handle must be passed to all subsequent library function calls. It should be destroyed at the end using rocsolver_destroy_handle().\n - rocsolver_handle is a rocblas_handle. + rocsolver_handle is a rocblas_handle. *************************************************************************/ typedef rocblas_handle rocsolver_handle; @@ -56,16 +56,16 @@ typedef rocblas_status rocsolver_status; typedef rocblas_layer_mode rocsolver_layer_mode; -/*! \brief Used to specify the order in which multiple elementary matrices are applied together - ********************************************************************************/ +/*! \brief Used to specify the order in which multiple elementary matrices are applied together + ********************************************************************************/ typedef enum rocsolver_direct_ { rocsolver_forward_direction = 171, /**< Elementary matrices applied from the right. */ rocsolver_backward_direction = 172, /**< Elementary matrices applied from the left. */ } rocsolver_direct; -/*! \brief Used to specify how householder vectors are stored in a matrix of vectors - ********************************************************************************/ +/*! \brief Used to specify how householder vectors are stored in a matrix of vectors + ********************************************************************************/ typedef enum rocsolver_storev_ { rocsolver_column_wise = 181, /**< Householder vectors are stored in the columns of a matrix. 
*/ diff --git a/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt b/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt index cbf3d10d..4a435950 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt +++ b/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt @@ -82,7 +82,7 @@ add_library( rocsolver ${rocsolver_lapack_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} - ${rocsolver_common_source} + ${rocsolver_common_source} ) add_library( roc::rocsolver ALIAS rocsolver ) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp index 9c52fd62..8c4e0c70 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_larf.hpp" template -rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, T* x, const rocsolver_int incx, const T* alpha, T* A, const rocsolver_int lda) { @@ -24,7 +24,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side rocblas_int stridep = 0; rocblas_int batch_count=1; - return rocsolver_larf_template(handle,side, + return rocsolver_larf_template(handle,side, m,n, x,0, //vector shifted 0 entries incx, @@ -33,7 +33,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side stridep, A,0, //matrix shifted 0 entries lda, - stridea, + stridea, batch_count); } @@ -46,14 +46,14 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side extern "C" { -ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, float* x, const rocsolver_int incx, const float* alpha, float* A, const rocsolver_int lda) { return rocsolver_larf_impl(handle, side, m, n, x, incx, alpha, A, lda); } -ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, double* x, const rocsolver_int incx, const double* alpha, double* A, const rocsolver_int lda) { diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp index 27a5a0d4..3755ea14 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp @@ -19,8 +19,8 @@ template rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, - const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, + const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, + const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, const rocsolver_int lda, const rocblas_int stridea, const rocblas_int batch_count) { // 
quick return @@ -40,7 +40,7 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ T* zeroInt; //constant 0 in device hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -66,16 +66,16 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ // OF A AND X, AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + //memory in GPU (workspace) T *workvec; hipMalloc(&workvec, sizeof(T)*order*batch_count); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute the matrix vector product (W=tau*A'*X or W=tau*A*X) for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp index 12ed4e92..d28b4a03 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp @@ -5,10 +5,10 @@ #include "rocauxiliary_larfb.hpp" template -rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* F, const rocsolver_int ldf, T* A, const rocsolver_int lda) { @@ -22,7 +22,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid if (storev == rocsolver_row_wise) { if (ldv < k) return rocblas_status_invalid_size; - } else { + } else { if ((side == rocblas_side_left && ldv < m) || (side == rocblas_side_right && ldv < n)) return rocblas_status_invalid_size; } @@ -34,7 +34,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid rocblas_int stridef = 0; rocblas_int batch_count=1; - return rocsolver_larfb_template(handle,side,trans,direct,storev, + return rocsolver_larfb_template(handle,side,trans,direct,storev, m,n,k, V,0, //shifted 0 entries ldv, @@ -44,7 +44,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid stridef, A,0, //shifted 0 entries lda, - stridea, + stridea, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp index 5214e29a..dc4ee469 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp @@ -19,7 +19,7 @@ template -__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ 
-38,7 +38,7 @@ __global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U } template -__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -52,18 +52,18 @@ __global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A Wp = work + b*strideW; Ap = load_ptr_batch(A,shiftA,b,strideA); - Ap[i + j*lda] -= Wp[i + j*ldw]; + Ap[i + j*lda] -= Wp[i + j*ldw]; } } template -rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, T *F, const rocsolver_int shiftF, - const rocsolver_int ldf, const rocsolver_int strideF, + const rocsolver_int ldf, const rocsolver_int strideF, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, const rocsolver_int batch_count) { @@ -100,14 +100,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver //determine the side, size of workspace //and whether V is trapezoidal - rocsolver_operation transp; + rocsolver_operation transp; rocsolver_fill uploV; bool trap; rocblas_int order, ldw; - bool colwise = (storev == rocsolver_column_wise); + bool colwise = (storev == rocsolver_column_wise); bool leftside = (side == rocblas_side_left); size_t offsetV; - + if (leftside) { order = n; ldw = k; @@ -120,16 +120,16 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver if (colwise) { uploV = rocblas_fill_lower; offsetV = idx2D(k,0,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_transpose; - else + else transp = rocblas_operation_none; } else { uploV = rocblas_fill_upper; offsetV = idx2D(0,k,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_none; - else + else transp = rocblas_operation_transpose; } @@ -146,15 +146,15 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver rocblas_int blocksx = (order - 1)/32 + 1; rocblas_int blocksy = (ldw - 1)/32 + 1; hipLaunchKernelGGL(copymatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + // BACKWARD DIRECTION TO BE IMPLEMENTED... 
rocsolver_fill uploT = rocblas_fill_upper; if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - + //compute: // V1' * A1, or - // or + // or // A1 * V1 for (int b=0;b(VV,shiftV,b,strideV); @@ -162,14 +162,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } // compute: - // V1' * A1 + V2' * A2 - // or + // V1' * A1 + V2' * A2 + // or // A1 * V1 + A2 * V2 - if (trap) { + if (trap) { for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,ldw,order,m-k,oneInt, (Vp + offsetV),ldv, (Ap + idx2D(k,0,lda)),lda, @@ -183,10 +183,10 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } - // compute: + // compute: // trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) + // (A1 * V1 + A2 * V2) * trans(T) for (int b=0;b(FF,shiftF,b,strideF); rocblas_trmm(handle,side,uploT,trans,rocblas_diagonal_non_unit,ldw,order,oneInt,Fp,ldf,(work + b*strideW),ldw); @@ -195,7 +195,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver // compute: // A2 - V2 * trans(T) * (V1' * A1 + V2' * A2) // or - // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' + // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' if (transp == rocblas_operation_transpose) transp = rocblas_operation_none; else @@ -205,7 +205,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,m-k,order,ldw,minoneInt, (Vp + offsetV),ldv, (work + b*strideW),ldw, @@ -218,22 +218,22 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } } - + // compute: // V1 * trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) * V1' + // (A1 * V1 + A2 * V2) * trans(T) * V1' for (int b=0;b(VV,shiftV,b,strideV); rocblas_trmm(handle,side,uploV,transp,rocblas_diagonal_unit,ldw,order,oneInt,Vp,ldv,(work + b*strideW),ldw); } - + // compute: // A1 - V1 * trans(T) * (V1' * A1 + V2' * A2) // or // A1 - (A1 * V1 + A2 * V2) * trans(T) * V1' hipLaunchKernelGGL(addmatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + hipFree(minoneInt); hipFree(oneInt); hipFree(work); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp index 4b1e00fa..8e651066 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp @@ -26,7 +26,7 @@ rocblas_status rocsolver_larfg_impl(rocblas_handle handle, const rocblas_int n, incx, stridex, tau, - strideP, + strideP, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp index f4fc193c..38683f5d 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp @@ -42,7 +42,7 @@ __global__ void set_taubeta(T *tau, const rocblas_int strideP, T *norms, U alpha template -rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, +rocblas_status 
rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, U x, const rocblas_int shiftx, const rocblas_int incx, const rocblas_int stridex, T *tau, const rocblas_int strideP, const rocblas_int batch_count) { @@ -54,11 +54,11 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int hipStream_t stream; rocblas_get_stream(handle, &stream); dim3 gridReset(1, batch_count, 1); - dim3 threads(1, 1, 1); + dim3 threads(1, 1, 1); if (n == 1) { hipLaunchKernelGGL(reset_batch_info,gridReset,threads,0,stream,tau,strideP,1,0); - return rocblas_status_success; - } + return rocblas_status_success; + } T *xp; @@ -73,12 +73,12 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *norms; - hipMalloc(&norms, sizeof(T)*batch_count); + hipMalloc(&norms, sizeof(T)*batch_count); // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute norm of x for (int b=0;b(xx,shiftx,b,stridex); @@ -87,9 +87,9 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //set value of tau and beta and scalling factor for vector x //alpha <- beta - //norms <- scalling + //norms <- scalling hipLaunchKernelGGL(set_taubeta,dim3(batch_count),dim3(1),0,stream,tau,strideP,norms,alpha,shifta,stridex); - + //compute vector v=x*norms for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp index 5ab79a92..10915015 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_larft.hpp" template -rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, const rocsolver_int n, +rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* tau, T* F, const rocsolver_int ldf) { @@ -38,7 +38,7 @@ rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_dir stridet, F, ldf, - stridef, + stridef, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp index ee2add09..8a38ac3f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp @@ -17,8 +17,8 @@ #include "common_device.hpp" template -__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, - T* tau, const rocsolver_int strideT, +__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, + T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_storev storev) { const auto blocksize = hipBlockDim_x; @@ -51,20 +51,20 @@ __global__ void set_tau(const rocsolver_int k, T* tau, const rocsolver_int strid const auto blocksize = hipBlockDim_x; const auto b = hipBlockIdx_x; const auto i = hipBlockIdx_y 
* blocksize + hipThreadIdx_x; - + if (i < k) { T *tp; tp = tau + b*strideT; tp[i] = -tp[i]; } } - + template -rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, +rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, - const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_int batch_count) { // quick return @@ -84,7 +84,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipMemcpy(oneInt, &one, sizeof(T), hipMemcpyHostToDevice); hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -98,26 +98,26 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - //Fix diagonal of T, make zero the non used triangular part, + //Fix diagonal of T, make zero the non used triangular part, //setup tau (changing signs) and account for the non-stored 1's on the householder vectors rocblas_int blocks = (k - 1)/32 + 1; hipLaunchKernelGGL(set_triangular,dim3(blocks,blocks,batch_count),dim3(32,32),0,stream, k,V,shiftV,ldv,strideV,tau,strideT,F,ldf,strideF,storev); hipLaunchKernelGGL(set_tau,dim3(batch_count,blocks),dim3(32,1),0,stream,k,tau,strideT); - // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS + // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS // AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - - rocblas_operation trans; - - for (int i = 1; i < k; ++i) { + rocblas_operation trans; + + + for (int i = 1; i < k; ++i) { //compute the matrix vector product, using the householder vectors for (int b=0;b(VV,shiftV,b,strideV); Fp = F + b*strideF; - rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, + rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, (Fp + idx2D(0,i,ldf)), 1, zeroInt, (Fp + idx2D(0,i,ldf)), 1); - } + } } //restore tau @@ -151,7 +151,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipFree(oneInt); hipFree(zeroInt); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp index e79f652f..360fef79 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp @@ -54,14 +54,14 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dlaswp(rocsolver_handle handle, const } ROCSOLVER_EXPORT rocblas_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); } ROCSOLVER_EXPORT rocblas_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp index 0dc74205..4615a7ec 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp @@ -51,10 +51,10 @@ __global__ void laswp_kernel(const rocblas_int n, U AA, const rocblas_int shiftA template rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int k1, const rocblas_int k2, - const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, + const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, const rocblas_int batch_count) { // quick return - if (n == 0 || !batch_count) + if (n == 0 || !batch_count) return rocblas_status_success; rocblas_int start, end, inc; @@ -63,7 +63,7 @@ rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int end = k1 - 1; inc = -1; incx = -incx; - } + } else { start = k1; end = k2 + 1; diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp index 102fd83e..465b3635 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp @@ -5,7 +5,7 @@ #include 
"rocauxiliary_org2r.hpp" template -rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp index 08d072aa..2dbcc11e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j > i) + else if (j > i) Ap[i + j*lda] = 0.0; else if (j >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j,j+1,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th column -corresponding to H(i)- if (j < m - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), - (M + idx2D(j + 1, j, lda)), 1); - } + rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), + (M + idx2D(j + 1, j, lda)), 1); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp index bd3e4714..eb4f0bb6 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orgbr.hpp" template -rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp index a1315b6e..deec30a8 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp @@ -23,7 +23,7 @@ #define BS 32 //blocksize for kernels template -__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -33,17 +33,17 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && j <= i) { rocblas_int offset = j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy columns - Wp[i + j*ldw - offset] = (j == 0 ? 0.0 : Ap[i+1 + (j-1)*lda]); - + Wp[i + j*ldw - offset] = (j == 0 ? 
0.0 : Ap[i+1 + (j-1)*lda]); + } else { - // shift columns to the right + // shift columns to the right Ap[i+1 + j*lda] = Wp[i + j*ldw - offset]; - + // make first row the identity if (i == j) { Ap[(j+1)*lda] = 0.0; @@ -55,7 +55,7 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const } template -__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -65,17 +65,17 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && i <= j) { rocblas_int offset = j*ldw - j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy rows - Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); - + Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); + } else { - // shift rows downward + // shift rows downward Ap[i + (j+1)*lda] = Wp[i + j*ldw - offset]; - + // make first column the identity if (i == j) { Ap[i+1] = 0.0; @@ -87,9 +87,9 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const } template -rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -99,11 +99,11 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization + // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization // of a m-by-k matrix A (given by gebrd) if (storev == rocsolver_column_wise) { if (m >= k) { - rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); + rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); } else { // shift the householder vectors provided by gebrd as they come below the first subdiagonal // workspace @@ -115,21 +115,21 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (m - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); - + 
hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + // result - rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); + hipFree(W); - } + } } - - // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization + + // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization // of a k-by-n matrix A (given by gebrd) else { if (n > k) { @@ -145,19 +145,19 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (n - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // result rocsolver_orglq_template(handle, n-1, n-1, n-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + hipFree(W); } - } + } return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp index 27e3d8ed..ec38dc16 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgl2.hpp" template -rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp index 202a4fc3..35475070 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j < i) + else if (j < i) Ap[i + j*lda] = 0.0; else if (i >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return 
@@ -51,7 +51,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j+1,j,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th row -corresponding to H(i)- if (j < n - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), - (M + idx2D(j, j + 1, lda)), lda); - } + rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), + (M + idx2D(j, j + 1, lda)), lda); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp index 35b17482..e3039734 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orglq.hpp" template -rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp index 97886fce..39f77a46 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp @@ -32,16 +32,16 @@ __global__ void set_zero_row(const rocblas_int m, const rocblas_int kk, U A, if (i < m && j < kk) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // 
quick return @@ -50,9 +50,9 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_orgl2_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -64,34 +64,34 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding left submatrix if (kk < m) { blocksx = (m - kk - 1)/32 + 1; blocksy = (kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, m,kk,A,shiftA,lda,strideA); - - rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < m) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -110,13 +110,13 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_orgl2_template(handle, jb, n - j, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_orgl2_template(handle, jb, n - j, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp index ef11bd5e..7b1aceec 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgqr.hpp" template -rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp index 86386317..8079413c 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp @@ -32,15 +32,15 @@ __global__ void set_zero_col(const rocblas_int n, const rocblas_int kk, U A, if (i < kk && j < n) { T *Ap = 
load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -49,9 +49,9 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_org2r_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -63,34 +63,34 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding top submatrix if (kk < n) { blocksx = (kk - 1)/32 + 1; blocksy = (n- kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, n,kk,A,shiftA,lda,strideA); - - rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < n) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -109,13 +109,13 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_org2r_template(handle, m - j, jb, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_org2r_template(handle, m - j, jb, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp index 34ee185b..fdaa1724 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orm2r.hpp" template -rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, 
const rocsolver_int n, +rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp index 10522f08..dd83c375 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp @@ -18,10 +18,10 @@ #include "../auxiliary/rocauxiliary_larf.hpp" template -rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -72,14 +72,14 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver ncol = n - i; jc = i; } - - // insert one in A(i,i) tobuild/apply the householder matrix + + // insert one in A(i,i) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); - // Apply current Householder reflector + // Apply current Householder reflector rocsolver_larf_template(handle,side, //side nrow, //number of rows of matrix to modify - ncol, //number of columns of matrix to modify + ncol, //number of columns of matrix to modify A, shiftA + idx2D(i,i,lda), //householder vector x 1, strideA, //inc of x (ipiv + i), strideP, //householder scalar (alpha) @@ -90,7 +90,7 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver // restore original value of A(i,i) hipLaunchKernelGGL(restore_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp index 7d11d5e6..820f4a46 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_ormqr.hpp" template -rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status 
rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp index fd0b523c..b24d77cd 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp @@ -20,10 +20,10 @@ #include "../auxiliary/rocauxiliary_larft.hpp" template -rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -35,14 +35,14 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked variant of the algorithm - if (k <= ORMQR_ORM2R_BLOCKSIZE) + if (k <= ORMQR_ORM2R_BLOCKSIZE) return rocsolver_orm2r_template(handle, side, trans, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, C, shiftC, ldc, strideC, batch_count); //memory in GPU (workspace) T* work; rocblas_int ldw = ORMQR_ORM2R_BLOCKSIZE; rocblas_int strideW = ldw *ldw; - hipMalloc(&work, sizeof(T)*strideW*batch_count); + hipMalloc(&work, sizeof(T)*strideW*batch_count); // determine limits and indices bool left = (side == rocblas_side_left); @@ -100,7 +100,7 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver C, shiftC + idx2D(ic,jc,ldc),ldc,strideC, batch_count); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp b/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp index 2d57c7d9..65dd0697 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp @@ -104,7 +104,7 @@ rocblas_status rocblas_iamax(rocblas_handle handle, rocblas_int n, return rocblas_izamax(handle, n, x, incx, result); } -//ger +//ger template <> rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, diff --git a/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp b/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp index 1aaaab61..d28acb79 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp @@ -36,16 +36,16 @@ __forceinline__ __device__ __host__ T* load_ptr_batch(T *const p[], rocblas_int } template -__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) +__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) { int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - + if (b < batch) out[b] = in + b*stride; } template -__forceinline__ __global__ void setdiag(const rocblas_int j, U A, +__forceinline__ __global__ void 
setdiag(const rocblas_int j, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, T *ipiv, const rocblas_int strideP) { @@ -54,7 +54,7 @@ __forceinline__ __global__ void setdiag(const rocblas_int j, U A, T *tau = ipiv + b*strideP; T t = -tau[j]; - tau[j] = t; + tau[j] = t; Ap[j + j*lda] = 1.0 + t; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp b/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp index 5d9cf574..260d9d1f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp @@ -8,7 +8,7 @@ // IDEAL SIZES ARE DEFINED FOR NOW AS IN CPU-LAPACK // BENCHMARKING OF ROCSOLVER WILL BE NEEDED TO DETERMINE -// MORE SUITABLE VALUES +// MORE SUITABLE VALUES diff --git a/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp b/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp index 185d1690..b7e34f6b 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp @@ -1,24 +1,24 @@ -/* ************************************************************************ - * Copyright 2019-2020 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP -#define GUARD_ROCBLAS_MANAGE_PTR_HPP - -#include - -namespace rocsolver { -// device_malloc wraps hipMalloc and provides same API as malloc -static void *device_malloc(size_t byte_size) { - void *pointer; - PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); - return pointer; -} - -// device_free wraps hipFree and provides same API as free -static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } -} // namespace rocsolver - -using rocsolver_unique_ptr = std::unique_ptr; - -#endif +/* ************************************************************************ + * Copyright 2019-2020 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP +#define GUARD_ROCBLAS_MANAGE_PTR_HPP + +#include + +namespace rocsolver { +// device_malloc wraps hipMalloc and provides same API as malloc +static void *device_malloc(size_t byte_size) { + void *pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } +} // namespace rocsolver + +using rocsolver_unique_ptr = std::unique_ptr; + +#endif diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp index d412d69a..f5f6d466 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
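The rocsolver_unique_ptr.hpp header rewritten above wraps hipMalloc and hipFree in malloc/free-style helpers so device buffers can be owned by smart pointers. A minimal sketch of that RAII pattern, using only the HIP runtime API and standard C++; the deleter and helper names below are illustrative and are not the library's exact definitions:

    #include <hip/hip_runtime.h>
    #include <memory>

    // Custom deleter so std::unique_ptr releases device memory with hipFree.
    struct hip_deleter {
        void operator()(void* p) const { (void)hipFree(p); }
    };

    using device_ptr = std::unique_ptr<void, hip_deleter>;

    // Allocate byte_size bytes on the device and hand ownership to a unique_ptr.
    inline device_ptr make_device_buffer(size_t byte_size) {
        void* raw = nullptr;
        if (hipMalloc(&raw, byte_size) != hipSuccess)
            return nullptr;               // caller checks for allocation failure
        return device_ptr(raw);
    }

    int main() {
        // 256 doubles of scratch space; freed automatically when `work` goes out of scope.
        device_ptr work = make_device_buffer(256 * sizeof(double));
        return work ? 0 : 1;
    }

Owning workspace allocations this way means scratch buffers such as the diag and work arrays in the templates above cannot leak on an early return.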
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp index 29c4266f..81ec19ae 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on row j @@ -45,18 +45,18 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int n - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(j,min(j+1,n-1),lda), //vector x to work on - lda, strideA, //inc of x + lda, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the right + // Apply Householder reflector to the rest of matrix from the right if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp index 027572df..35fe7af5 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - 
T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp index 9eefcb03..569facbb 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp index a29c5b0f..f75a0da7 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp index b0e15bef..d40b9dd5 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_gelq2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of rows in the block rocsolver_gelq2_template(handle, jb, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < m) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -76,9 +76,9 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_gelq2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp index 91631008..cee74932 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp +++ 
b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp index 13e0312f..a5581819 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp index 0cae47b0..249784a0 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp index 668fc8a0..485550d7 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on column j @@ -45,18 +45,18 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(min(j+1,m-1),j,lda), //vector x to work on - 1, strideA, //inc of x + 1, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the left + // Apply Householder reflector to the rest of matrix from the left if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp index ef67a2eb..70e765e8 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* 
ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp index 26816634..e468de7e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
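The geqr2 template above is the unblocked Householder QR: larfg generates one reflector per column and larf applies it to the trailing columns, with set_one_diag/restore_diag temporarily placing a unit on the diagonal so the stored vector can be used in place. As a reference for what one batch instance computes, here is a plain host-side sketch of the same loop for a real, column-major matrix; it is illustrative only and follows the usual LAPACK storage conventions rather than the device code above:

    #include <cmath>
    #include <algorithm>

    // Unblocked Householder QR of an m x n column-major matrix A (leading dimension lda).
    // On return, R sits on and above the diagonal, the Householder vectors v_j (with an
    // implicit leading 1) sit below it, and tau[j] holds the reflector scalar, as in GEQR2.
    void geqr2_reference(int m, int n, double* A, int lda, double* tau) {
        int dim = std::min(m, n);
        for (int j = 0; j < dim; ++j) {
            // Generate H_j = I - tau_j * v * v^T that zeroes A(j+1:m-1, j).
            double alpha = A[j + j * lda];
            double xnorm = 0.0;
            for (int i = j + 1; i < m; ++i) xnorm += A[i + j * lda] * A[i + j * lda];
            if (xnorm == 0.0) { tau[j] = 0.0; continue; }          // column already reduced
            double beta = -std::copysign(std::sqrt(alpha * alpha + xnorm), alpha);
            tau[j] = (beta - alpha) / beta;
            double scal = 1.0 / (alpha - beta);
            for (int i = j + 1; i < m; ++i) A[i + j * lda] *= scal; // store v below the diagonal
            A[j + j * lda] = beta;                                  // R(j,j)

            // Apply H_j to the trailing columns: y -= tau_j * v * (v^T y).
            for (int k = j + 1; k < n; ++k) {
                double dot = A[j + k * lda];                        // v_0 is the implicit 1
                for (int i = j + 1; i < m; ++i) dot += A[i + j * lda] * A[i + k * lda];
                dot *= tau[j];
                A[j + k * lda] -= dot;
                for (int i = j + 1; i < m; ++i) A[i + k * lda] -= dot * A[i + j * lda];
            }
        }
    }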
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp index d941c762..b91aa412 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp index fcdb4935..e1a3adaf 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_geqr2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of columns in the block rocsolver_geqr2_template(handle, m-j, jb, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < n) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -75,9 +75,9 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_geqr2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp index 3ae16e6a..41bb01e6 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp +++ 
b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp index b3e3809d..bd670e1f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
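The geqrf template above falls back to the unblocked kernel for small sizes and otherwise factors panel blocks, building the triangular factor with larft and applying the block reflector with larfb. From the host the factorization is reached through the exported C wrappers shown earlier, such as rocsolver_dgeqrf. A minimal usage sketch follows; the include path and the use of rocblas_create_handle are assumptions about the build environment, the matrix contents are arbitrary, and error checking is omitted:

    #include <hip/hip_runtime.h>
    #include <rocsolver.h>   // header name assumed; adjust to your installation
    #include <algorithm>
    #include <vector>

    int main() {
        const rocblas_int m = 4, n = 3, lda = m;

        // Column-major input matrix (contents are arbitrary example data).
        std::vector<double> hA(lda * n, 1.0);
        hA[0] = 4.0; hA[5] = 3.0; hA[10] = 2.0;

        rocblas_handle handle;
        rocblas_create_handle(&handle);

        double *dA, *dTau;
        hipMalloc(&dA, sizeof(double) * lda * n);
        hipMalloc(&dTau, sizeof(double) * std::min(m, n));  // Householder scalars
        hipMemcpy(dA, hA.data(), sizeof(double) * lda * n, hipMemcpyHostToDevice);

        // QR factorization: on return dA holds R on and above the diagonal and the
        // Householder vectors below it; dTau holds the reflector scalars.
        rocsolver_dgeqrf(handle, m, n, dA, lda, dTau);

        hipMemcpy(hA.data(), dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);

        hipFree(dTau);
        hipFree(dA);
        rocblas_destroy_handle(handle);
        return 0;
    }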
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp index 9b01a5af..d74da116 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int *ipiv, rocblas_int* info) -{ + rocblas_int *ipiv, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || lda < 1) @@ -41,25 +41,25 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp index 727a76c3..5630004e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp @@ -44,14 +44,14 @@ inline __global__ void getf2_check_singularity(U AA, const rocblas_int shiftA, c template rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -69,7 +69,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipMemcpy(minoneInt, &minone, sizeof(T), hipMemcpyHostToDevice); //pivoting info in device (to avoid continuous synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -84,7 +84,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int //info=0 (starting with a nonsingular matrix) hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,info,batch_count,0); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** @@ -93,7 +93,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int // find pivot. 
Use Fortran 1-based indexing for the ipiv array as iamax does that as well! for (int b=0;b(AA,shiftA,b,strideA); - rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, + rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, (ipiv + shiftP + b*strideP + j)); } @@ -101,14 +101,14 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipLaunchKernelGGL(getf2_check_singularity, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, ipiv, shiftP, strideP, j, lda, pivotGPU, info); - // Swap pivot row and j-th row + // Swap pivot row and j-th row rocsolver_laswp_template(handle, n, A, shiftA, lda, strideA, j+1, j+1, ipiv, shiftP, strideP, 1, batch_count); // Compute elements J+1:M of J'th column for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (pivotGPU + b), - (M + idx2D(j + 1, j, lda)), oneInt); + rocblas_scal(handle, (m-j-1), (pivotGPU + b), + (M + idx2D(j + 1, j, lda)), oneInt); } // update trailing submatrix @@ -116,7 +116,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int for (int b=0;b(AA,shiftA,b,strideA); rocblas_ger(handle, m - j - 1, n - j - 1, minoneInt, - (M + idx2D(j + 1, j, lda)), oneInt, + (M + idx2D(j + 1, j, lda)), oneInt, (M + idx2D(j, j + 1, lda)), lda, (M + idx2D(j + 1, j + 1, lda)), lda); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp index bd9e7240..462e932d 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
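The getf2 template above is the textbook unblocked LU factorization with partial pivoting: iamax locates the pivot in the current column, laswp swaps the rows, scal divides the subdiagonal entries by the pivot, and ger applies the rank-1 update to the trailing submatrix. As a reference for what one batch instance computes, here is a plain host-side sketch of the same sequence (column-major storage, 1-based pivot indices as in LAPACK); it is illustrative, not the library code:

    #include <cmath>
    #include <algorithm>

    // Factor A (m x n, column-major, leading dimension lda) in place as P*A = L*U.
    // ipiv[j] records the 1-based row chosen as pivot at step j, matching LAPACK.
    // Returns 0 on success, or j+1 if the pivot at step j is exactly zero.
    int getf2_reference(int m, int n, double* A, int lda, int* ipiv) {
        int dim = std::min(m, n);
        for (int j = 0; j < dim; ++j) {
            // Find the pivot: largest-magnitude entry in column j, rows j..m-1.
            int p = j;
            for (int i = j + 1; i < m; ++i)
                if (std::fabs(A[i + j * lda]) > std::fabs(A[p + j * lda])) p = i;
            ipiv[j] = p + 1;                       // 1-based, as iamax/laswp use above
            if (A[p + j * lda] == 0.0) return j + 1;

            // Swap the pivot row and row j across all n columns.
            if (p != j)
                for (int k = 0; k < n; ++k) std::swap(A[j + k * lda], A[p + k * lda]);

            // Scale the subdiagonal of column j by 1/pivot (the L multipliers).
            double inv = 1.0 / A[j + j * lda];
            for (int i = j + 1; i < m; ++i) A[i + j * lda] *= inv;

            // Rank-1 update of the trailing submatrix: A22 -= l21 * u12.
            for (int k = j + 1; k < n; ++k)
                for (int i = j + 1; i < m; ++i)
                    A[i + k * lda] -= A[i + j * lda] * A[j + k * lda];
        }
        return 0;
    }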
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,25 +40,25 @@ rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp index ccb2d252..b3ea05e9 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp @@ -7,19 +7,19 @@ template rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) return rocblas_status_invalid_size; - + return rocsolver_getf2_template(handle,m,n, A,0, //the matrix is shifted 0 entries (will work on the entire matrix) @@ -39,25 +39,25 @@ rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp index 4a1c1b91..9b3bdf70 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, rocblas_int *ipiv, rocblas_int* info) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -40,25 +40,25 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp index f19138bb..395fd187 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp @@ -41,13 +41,13 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int *info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) + if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) return rocsolver_getf2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. 
**** @@ -92,14 +92,14 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** for (int j = 0; j < dim; j += GETRF_GETF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GETRF_GETF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_getf2_template(handle, m - j, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, ipiv, shiftP + j, strideP, iinfo, batch_count); - + // adjust pivot indices and check singularity sizePivot = min(m - j, jb); //number of pivots in the block - blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; + blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; gridPivot = dim3(blocksPivot, batch_count, 1); hipLaunchKernelGGL(getrf_check_singularity, gridPivot, threads, 0, stream, sizePivot, j, ipiv, shiftP + j, strideP, iinfo, info); @@ -131,7 +131,7 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int (M + idx2D(j + jb, j + jb, lda)), lda); } } - } + } } hipFree(pivotGPU); diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp index 5ed946d0..44317213 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m, - rocblas_int n, U A, rocblas_int lda, + rocblas_int n, U A, rocblas_int lda, rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -39,25 +39,25 @@ rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp index c1ef590b..35443146 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -36,25 +36,25 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp index 255e306c..435339c1 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T *A, const rocblas_int lda, - const rocblas_int *ipiv, T *B, const rocblas_int ldb) + const rocblas_int *ipiv, T *B, const rocblas_int ldb) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? 
- if (n < 0 || nrhs < 0 || lda < n || ldb < n) + if (n < 0 || nrhs < 0 || lda < n || ldb < n) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -45,7 +45,7 @@ rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operati extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, - const rocblas_int *ipiv, float *B, const rocblas_int ldb) + const rocblas_int *ipiv, float *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } @@ -53,21 +53,21 @@ rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const roc extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, - const rocblas_int *ipiv, double *B, const rocblas_int ldb) + const rocblas_int *ipiv, double *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_float_complex *A, const rocsolver_int lda, - const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) + const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp index 1209770f..e18816df 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp @@ -19,7 +19,7 @@ template rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int *ipiv, const rocblas_int strideP, U B, - const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { // quick return if (n == 0 || nrhs == 0 || batch_count == 0) { @@ -56,7 +56,7 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve L*X = B, overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, rocblas_diagonal_unit, n, nrhs, @@ -67,13 +67,13 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope trans, rocblas_diagonal_non_unit, n, nrhs, oneInt, Ap, lda, Bp, ldb); } - + } else { for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve U**T *X = B or U**H *X = B, overwriting B with X 
rocblas_trsm(handle, rocblas_side_left, rocblas_fill_upper, trans, rocblas_diagonal_non_unit, n, nrhs, diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp index dd2dbe6a..43d48ac5 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -44,7 +44,7 @@ rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } @@ -52,26 +52,26 @@ rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, c extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, 
rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp index 49ced525..e42302d3 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -40,7 +40,7 @@ rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } @@ -48,26 +48,26 @@ rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, 
strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp index 1ed3f0ee..0127cbe0 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp @@ -5,14 +5,14 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp index 4e1c3c91..518d202e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp @@ -18,9 +18,9 @@ #include "common_device.hpp" #include "ideal_sizes.hpp" -template -__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, - const rocblas_int j, T *res, rocblas_int *info) +template +__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, + const rocblas_int j, T *res, rocblas_int *info) { int id = hipBlockIdx_x; @@ -45,10 +45,10 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; #ifdef batched @@ -70,7 +70,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //diagonal info in device (device memory workspace to avoid synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -95,7 +95,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(0, j, lda)), 1, (M + idx2D(0, j, lda)), 1, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // 
Compute elements J+1:N of row J @@ -103,9 +103,9 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_transpose, j, n - j - 1, - d_minone, (M + idx2D(0, j + 1, lda)), lda, + d_minone, (M + idx2D(0, j + 1, lda)), lda, (M + idx2D(0, j, lda)), 1, d_one, (M + idx2D(j, j + 1, lda)), lda); - } + } for (int b=0;b(AA,shiftA,b,strideA); rocblas_scal(handle, n - j - 1, (pivotGPU + b), @@ -122,7 +122,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(j, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -130,7 +130,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_none, n - j - 1, j, - d_minone, (M + idx2D(j + 1, 0, lda)), lda, + d_minone, (M + idx2D(j + 1, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, d_one, (M + idx2D(j + 1, j, lda)), 1); } for (int b=0;b -rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp index 4988f364..4e88e448 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp index e0512eed..b8be605f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp @@ -5,14 +5,14 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp index 1f1c6650..aef657d4 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp @@ -19,12 +19,12 @@ #include "ideal_sizes.hpp" #include "roclapack_potf2.hpp" -inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) +inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) { int id = hipBlockIdx_x; if (info[id] == 0 && iinfo[id] > 0) - info[id] = iinfo[id] + j; + info[id] = iinfo[id] + j; } template @@ -32,14 +32,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (n < POTRF_POTF2_SWITCHSIZE) + if (n < POTRF_POTF2_SWITCHSIZE) return rocsolver_potf2_template(handle, uplo, n, A, shiftA, lda, strideA, info, batch_count); #ifdef batched @@ -61,7 +61,7 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //info in device (device memory workspace to avoid synchronization with CPU) - rocblas_int *iinfo; + rocblas_int *iinfo; hipMalloc(&iinfo, sizeof(rocblas_int)*batch_count); hipStream_t stream; @@ -81,14 +81,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, if (uplo == rocblas_fill_upper) { // Compute the Cholesky factorization A = U'*U. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. 
hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_transpose, rocblas_operation_none, @@ -112,14 +112,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, } else { // Compute the Cholesky factorization A = L'*L. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_none, rocblas_operation_transpose, diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp index 7ac5061e..06dda30c 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp @@ -6,15 +6,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp index 2e49ab4b..6c081fc4 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in b/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in index 970adc43..8b6304e0 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in +++ b/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in @@ -1,6 +1,6 @@ @PACKAGE_INIT@ - + set_and_check(rocsolver_INCLUDE_DIR @PACKAGE_INCLUDE_INSTALL_DIR@) set_and_check(rocsolver_INCLUDE_DIRS @PACKAGE_INCLUDE_INSTALL_DIR@) diff --git a/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt b/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt index cbf3d10d..4a435950 100644 --- a/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt +++ b/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt @@ -82,7 +82,7 @@ add_library( rocsolver ${rocsolver_lapack_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} - ${rocsolver_common_source} + ${rocsolver_common_source} ) add_library( roc::rocsolver ALIAS rocsolver ) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp index 9c52fd62..8c4e0c70 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_larf.hpp" template -rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, T* x, const rocsolver_int incx, const T* alpha, T* A, const rocsolver_int lda) { @@ -24,7 +24,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side rocblas_int stridep = 0; rocblas_int batch_count=1; - return rocsolver_larf_template(handle,side, + return rocsolver_larf_template(handle,side, m,n, x,0, //vector shifted 0 entries incx, @@ -33,7 +33,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side stridep, A,0, //matrix shifted 0 entries lda, - stridea, + stridea, batch_count); } @@ -46,14 +46,14 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side extern "C" { -ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, float* x, const rocsolver_int incx, const float* alpha, float* A, const rocsolver_int lda) { return rocsolver_larf_impl(handle, side, m, n, x, incx, alpha, A, lda); } -ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, double* x, const rocsolver_int incx, const double* alpha, double* A, const rocsolver_int lda) { diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp index 27a5a0d4..3755ea14 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp @@ -19,8 +19,8 @@ template rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_side side, 
const rocsolver_int m, - const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, - const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, + const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, + const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, const rocsolver_int lda, const rocblas_int stridea, const rocblas_int batch_count) { // quick return @@ -40,7 +40,7 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ T* zeroInt; //constant 0 in device hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -66,16 +66,16 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ // OF A AND X, AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + //memory in GPU (workspace) T *workvec; hipMalloc(&workvec, sizeof(T)*order*batch_count); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute the matrix vector product (W=tau*A'*X or W=tau*A*X) for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp index 12ed4e92..d28b4a03 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp @@ -5,10 +5,10 @@ #include "rocauxiliary_larfb.hpp" template -rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* F, const rocsolver_int ldf, T* A, const rocsolver_int lda) { @@ -22,7 +22,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid if (storev == rocsolver_row_wise) { if (ldv < k) return rocblas_status_invalid_size; - } else { + } else { if ((side == rocblas_side_left && ldv < m) || (side == rocblas_side_right && ldv < n)) return rocblas_status_invalid_size; } @@ -34,7 +34,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid rocblas_int stridef = 0; rocblas_int batch_count=1; - return rocsolver_larfb_template(handle,side,trans,direct,storev, + return rocsolver_larfb_template(handle,side,trans,direct,storev, m,n,k, V,0, //shifted 0 entries ldv, @@ -44,7 +44,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid stridef, A,0, //shifted 0 entries lda, - stridea, + stridea, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp index 5214e29a..dc4ee469 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp @@ -19,7 +19,7 @@ template -__global__ void 
copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -38,7 +38,7 @@ __global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U } template -__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -52,18 +52,18 @@ __global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A Wp = work + b*strideW; Ap = load_ptr_batch(A,shiftA,b,strideA); - Ap[i + j*lda] -= Wp[i + j*ldw]; + Ap[i + j*lda] -= Wp[i + j*ldw]; } } template -rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, T *F, const rocsolver_int shiftF, - const rocsolver_int ldf, const rocsolver_int strideF, + const rocsolver_int ldf, const rocsolver_int strideF, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, const rocsolver_int batch_count) { @@ -100,14 +100,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver //determine the side, size of workspace //and whether V is trapezoidal - rocsolver_operation transp; + rocsolver_operation transp; rocsolver_fill uploV; bool trap; rocblas_int order, ldw; - bool colwise = (storev == rocsolver_column_wise); + bool colwise = (storev == rocsolver_column_wise); bool leftside = (side == rocblas_side_left); size_t offsetV; - + if (leftside) { order = n; ldw = k; @@ -120,16 +120,16 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver if (colwise) { uploV = rocblas_fill_lower; offsetV = idx2D(k,0,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_transpose; - else + else transp = rocblas_operation_none; } else { uploV = rocblas_fill_upper; offsetV = idx2D(0,k,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_none; - else + else transp = rocblas_operation_transpose; } @@ -146,15 +146,15 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver rocblas_int blocksx = (order - 1)/32 + 1; rocblas_int blocksy = (ldw - 1)/32 + 1; hipLaunchKernelGGL(copymatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + // BACKWARD DIRECTION TO BE IMPLEMENTED... 
rocsolver_fill uploT = rocblas_fill_upper; if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - + //compute: // V1' * A1, or - // or + // or // A1 * V1 for (int b=0;b(VV,shiftV,b,strideV); @@ -162,14 +162,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } // compute: - // V1' * A1 + V2' * A2 - // or + // V1' * A1 + V2' * A2 + // or // A1 * V1 + A2 * V2 - if (trap) { + if (trap) { for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,ldw,order,m-k,oneInt, (Vp + offsetV),ldv, (Ap + idx2D(k,0,lda)),lda, @@ -183,10 +183,10 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } - // compute: + // compute: // trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) + // (A1 * V1 + A2 * V2) * trans(T) for (int b=0;b(FF,shiftF,b,strideF); rocblas_trmm(handle,side,uploT,trans,rocblas_diagonal_non_unit,ldw,order,oneInt,Fp,ldf,(work + b*strideW),ldw); @@ -195,7 +195,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver // compute: // A2 - V2 * trans(T) * (V1' * A1 + V2' * A2) // or - // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' + // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' if (transp == rocblas_operation_transpose) transp = rocblas_operation_none; else @@ -205,7 +205,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,m-k,order,ldw,minoneInt, (Vp + offsetV),ldv, (work + b*strideW),ldw, @@ -218,22 +218,22 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } } - + // compute: // V1 * trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) * V1' + // (A1 * V1 + A2 * V2) * trans(T) * V1' for (int b=0;b(VV,shiftV,b,strideV); rocblas_trmm(handle,side,uploV,transp,rocblas_diagonal_unit,ldw,order,oneInt,Vp,ldv,(work + b*strideW),ldw); } - + // compute: // A1 - V1 * trans(T) * (V1' * A1 + V2' * A2) // or // A1 - (A1 * V1 + A2 * V2) * trans(T) * V1' hipLaunchKernelGGL(addmatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + hipFree(minoneInt); hipFree(oneInt); hipFree(work); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp index 4b1e00fa..8e651066 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp @@ -26,7 +26,7 @@ rocblas_status rocsolver_larfg_impl(rocblas_handle handle, const rocblas_int n, incx, stridex, tau, - strideP, + strideP, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp index f4fc193c..38683f5d 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp @@ -42,7 +42,7 @@ __global__ void set_taubeta(T *tau, const rocblas_int strideP, T *norms, U alpha template -rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, +rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const 
rocblas_int shifta, U x, const rocblas_int shiftx, const rocblas_int incx, const rocblas_int stridex, T *tau, const rocblas_int strideP, const rocblas_int batch_count) { @@ -54,11 +54,11 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int hipStream_t stream; rocblas_get_stream(handle, &stream); dim3 gridReset(1, batch_count, 1); - dim3 threads(1, 1, 1); + dim3 threads(1, 1, 1); if (n == 1) { hipLaunchKernelGGL(reset_batch_info,gridReset,threads,0,stream,tau,strideP,1,0); - return rocblas_status_success; - } + return rocblas_status_success; + } T *xp; @@ -73,12 +73,12 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *norms; - hipMalloc(&norms, sizeof(T)*batch_count); + hipMalloc(&norms, sizeof(T)*batch_count); // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute norm of x for (int b=0;b(xx,shiftx,b,stridex); @@ -87,9 +87,9 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //set value of tau and beta and scalling factor for vector x //alpha <- beta - //norms <- scalling + //norms <- scalling hipLaunchKernelGGL(set_taubeta,dim3(batch_count),dim3(1),0,stream,tau,strideP,norms,alpha,shifta,stridex); - + //compute vector v=x*norms for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp index 5ab79a92..10915015 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_larft.hpp" template -rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, const rocsolver_int n, +rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* tau, T* F, const rocsolver_int ldf) { @@ -38,7 +38,7 @@ rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_dir stridet, F, ldf, - stridef, + stridef, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp index ee2add09..8a38ac3f 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp @@ -17,8 +17,8 @@ #include "common_device.hpp" template -__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, - T* tau, const rocsolver_int strideT, +__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, + T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_storev storev) { const auto blocksize = hipBlockDim_x; @@ -51,20 +51,20 @@ __global__ void set_tau(const rocsolver_int k, T* tau, const rocsolver_int strid const auto blocksize = hipBlockDim_x; const auto b = hipBlockIdx_x; const auto i = hipBlockIdx_y * blocksize + hipThreadIdx_x; - + if (i < k) { T *tp; tp = tau + b*strideT; tp[i] = -tp[i]; } } - + template -rocblas_status 
rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, +rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, - const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_int batch_count) { // quick return @@ -84,7 +84,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipMemcpy(oneInt, &one, sizeof(T), hipMemcpyHostToDevice); hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -98,26 +98,26 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - //Fix diagonal of T, make zero the non used triangular part, + //Fix diagonal of T, make zero the non used triangular part, //setup tau (changing signs) and account for the non-stored 1's on the householder vectors rocblas_int blocks = (k - 1)/32 + 1; hipLaunchKernelGGL(set_triangular,dim3(blocks,blocks,batch_count),dim3(32,32),0,stream, k,V,shiftV,ldv,strideV,tau,strideT,F,ldf,strideF,storev); hipLaunchKernelGGL(set_tau,dim3(batch_count,blocks),dim3(32,1),0,stream,k,tau,strideT); - // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS + // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS // AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - - rocblas_operation trans; - - for (int i = 1; i < k; ++i) { + rocblas_operation trans; + + + for (int i = 1; i < k; ++i) { //compute the matrix vector product, using the householder vectors for (int b=0;b(VV,shiftV,b,strideV); Fp = F + b*strideF; - rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, + rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, (Fp + idx2D(0,i,ldf)), 1, zeroInt, (Fp + idx2D(0,i,ldf)), 1); - } + } } //restore tau @@ -151,7 +151,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipFree(oneInt); hipFree(zeroInt); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp index e79f652f..360fef79 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp @@ -54,14 +54,14 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dlaswp(rocsolver_handle handle, const } ROCSOLVER_EXPORT rocblas_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); } ROCSOLVER_EXPORT rocblas_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp index 0dc74205..4615a7ec 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp @@ -51,10 +51,10 @@ __global__ void laswp_kernel(const rocblas_int n, U AA, const rocblas_int shiftA template rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int k1, const rocblas_int k2, - const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, + const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, const rocblas_int batch_count) { // quick return - if (n == 0 || !batch_count) + if (n == 0 || !batch_count) return rocblas_status_success; rocblas_int start, end, inc; @@ -63,7 +63,7 @@ rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int end = k1 - 1; inc = -1; incx = -incx; - } + } else { start = k1; end = k2 + 1; diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp index 102fd83e..465b3635 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_org2r.hpp" template -rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const 
rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp index 08d072aa..2dbcc11e 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j > i) + else if (j > i) Ap[i + j*lda] = 0.0; else if (j >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j,j+1,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th column -corresponding to H(i)- if (j < m - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), - (M + idx2D(j + 1, j, lda)), 1); - } + rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), + (M + idx2D(j + 1, j, lda)), 1); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp index bd3e4714..eb4f0bb6 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orgbr.hpp" template -rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp index a1315b6e..deec30a8 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp @@ -23,7 +23,7 @@ #define BS 32 //blocksize for kernels template -__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -33,17 +33,17 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && j <= i) { rocblas_int offset = j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy columns - Wp[i + j*ldw - offset] = (j == 0 ? 0.0 : Ap[i+1 + (j-1)*lda]); - + Wp[i + j*ldw - offset] = (j == 0 ? 
0.0 : Ap[i+1 + (j-1)*lda]); + } else { - // shift columns to the right + // shift columns to the right Ap[i+1 + j*lda] = Wp[i + j*ldw - offset]; - + // make first row the identity if (i == j) { Ap[(j+1)*lda] = 0.0; @@ -55,7 +55,7 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const } template -__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -65,17 +65,17 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && i <= j) { rocblas_int offset = j*ldw - j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy rows - Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); - + Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); + } else { - // shift rows downward + // shift rows downward Ap[i + (j+1)*lda] = Wp[i + j*ldw - offset]; - + // make first column the identity if (i == j) { Ap[i+1] = 0.0; @@ -87,9 +87,9 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const } template -rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -99,11 +99,11 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization + // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization // of a m-by-k matrix A (given by gebrd) if (storev == rocsolver_column_wise) { if (m >= k) { - rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); + rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); } else { // shift the householder vectors provided by gebrd as they come below the first subdiagonal // workspace @@ -115,21 +115,21 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (m - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); - + 
hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + // result - rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); + hipFree(W); - } + } } - - // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization + + // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization // of a k-by-n matrix A (given by gebrd) else { if (n > k) { @@ -145,19 +145,19 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (n - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // result rocsolver_orglq_template(handle, n-1, n-1, n-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + hipFree(W); } - } + } return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp index 27e3d8ed..ec38dc16 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgl2.hpp" template -rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp index 202a4fc3..35475070 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j < i) + else if (j < i) Ap[i + j*lda] = 0.0; else if (i >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status 
rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j+1,j,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th row -corresponding to H(i)- if (j < n - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), - (M + idx2D(j, j + 1, lda)), lda); - } + rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), + (M + idx2D(j, j + 1, lda)), lda); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp index 35b17482..e3039734 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orglq.hpp" template -rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp index 97886fce..39f77a46 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp @@ -32,16 +32,16 @@ __global__ void set_zero_row(const rocblas_int m, const rocblas_int kk, U A, if (i < m && j < kk) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -50,9 +50,9 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, 
const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_orgl2_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -64,34 +64,34 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding left submatrix if (kk < m) { blocksx = (m - kk - 1)/32 + 1; blocksy = (kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, m,kk,A,shiftA,lda,strideA); - - rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < m) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -110,13 +110,13 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_orgl2_template(handle, jb, n - j, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_orgl2_template(handle, jb, n - j, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp index ef11bd5e..7b1aceec 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgqr.hpp" template -rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp index 86386317..8079413c 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp @@ -32,15 +32,15 @@ __global__ void set_zero_col(const rocblas_int n, const rocblas_int kk, U A, if (i < kk && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int 
m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -49,9 +49,9 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_org2r_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -63,34 +63,34 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding top submatrix if (kk < n) { blocksx = (kk - 1)/32 + 1; blocksy = (n- kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, n,kk,A,shiftA,lda,strideA); - - rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < n) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -109,13 +109,13 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_org2r_template(handle, m - j, jb, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_org2r_template(handle, m - j, jb, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp index 34ee185b..fdaa1724 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orm2r.hpp" template -rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, 
const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp index 10522f08..dd83c375 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp @@ -18,10 +18,10 @@ #include "../auxiliary/rocauxiliary_larf.hpp" template -rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -72,14 +72,14 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver ncol = n - i; jc = i; } - - // insert one in A(i,i) tobuild/apply the householder matrix + + // insert one in A(i,i) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); - // Apply current Householder reflector + // Apply current Householder reflector rocsolver_larf_template(handle,side, //side nrow, //number of rows of matrix to modify - ncol, //number of columns of matrix to modify + ncol, //number of columns of matrix to modify A, shiftA + idx2D(i,i,lda), //householder vector x 1, strideA, //inc of x (ipiv + i), strideP, //householder scalar (alpha) @@ -90,7 +90,7 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver // restore original value of A(i,i) hipLaunchKernelGGL(restore_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp index 7d11d5e6..820f4a46 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_ormqr.hpp" template -rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp 
b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp index fd0b523c..b24d77cd 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp @@ -20,10 +20,10 @@ #include "../auxiliary/rocauxiliary_larft.hpp" template -rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -35,14 +35,14 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked variant of the algorithm - if (k <= ORMQR_ORM2R_BLOCKSIZE) + if (k <= ORMQR_ORM2R_BLOCKSIZE) return rocsolver_orm2r_template(handle, side, trans, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, C, shiftC, ldc, strideC, batch_count); //memory in GPU (workspace) T* work; rocblas_int ldw = ORMQR_ORM2R_BLOCKSIZE; rocblas_int strideW = ldw *ldw; - hipMalloc(&work, sizeof(T)*strideW*batch_count); + hipMalloc(&work, sizeof(T)*strideW*batch_count); // determine limits and indices bool left = (side == rocblas_side_left); @@ -100,7 +100,7 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver C, shiftC + idx2D(ic,jc,ldc),ldc,strideC, batch_count); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp b/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp index 2d57c7d9..65dd0697 100644 --- a/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp +++ b/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp @@ -104,7 +104,7 @@ rocblas_status rocblas_iamax(rocblas_handle handle, rocblas_int n, return rocblas_izamax(handle, n, x, incx, result); } -//ger +//ger template <> rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, diff --git a/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp b/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp index 1aaaab61..d28acb79 100644 --- a/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp +++ b/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp @@ -36,16 +36,16 @@ __forceinline__ __device__ __host__ T* load_ptr_batch(T *const p[], rocblas_int } template -__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) +__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) { int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - + if (b < batch) out[b] = in + b*stride; } template -__forceinline__ __global__ void setdiag(const rocblas_int j, U A, +__forceinline__ __global__ void setdiag(const rocblas_int j, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, T *ipiv, const rocblas_int strideP) { @@ -54,7 +54,7 @@ __forceinline__ __global__ void setdiag(const rocblas_int j, U A, T *tau = ipiv + b*strideP; T t = -tau[j]; - tau[j] = t; + tau[j] = t; Ap[j + 
j*lda] = 1.0 + t; } diff --git a/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp b/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp index 5d9cf574..260d9d1f 100644 --- a/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp +++ b/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp @@ -8,7 +8,7 @@ // IDEAL SIZES ARE DEFINED FOR NOW AS IN CPU-LAPACK // BENCHMARKING OF ROCSOLVER WILL BE NEEDED TO DETERMINE -// MORE SUITABLE VALUES +// MORE SUITABLE VALUES diff --git a/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp b/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp index 185d1690..b7e34f6b 100644 --- a/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp @@ -1,24 +1,24 @@ -/* ************************************************************************ - * Copyright 2019-2020 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP -#define GUARD_ROCBLAS_MANAGE_PTR_HPP - -#include - -namespace rocsolver { -// device_malloc wraps hipMalloc and provides same API as malloc -static void *device_malloc(size_t byte_size) { - void *pointer; - PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); - return pointer; -} - -// device_free wraps hipFree and provides same API as free -static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } -} // namespace rocsolver - -using rocsolver_unique_ptr = std::unique_ptr; - -#endif +/* ************************************************************************ + * Copyright 2019-2020 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP +#define GUARD_ROCBLAS_MANAGE_PTR_HPP + +#include + +namespace rocsolver { +// device_malloc wraps hipMalloc and provides same API as malloc +static void *device_malloc(size_t byte_size) { + void *pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } +} // namespace rocsolver + +using rocsolver_unique_ptr = std::unique_ptr; + +#endif diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp index d412d69a..f5f6d466 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp index 29c4266f..81ec19ae 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on row j @@ -45,18 +45,18 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int n - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(j,min(j+1,n-1),lda), //vector x to work on - lda, strideA, //inc of x + lda, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the right + // Apply Householder reflector to the rest of matrix from the right if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp index 027572df..35fe7af5 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int 
batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp index 9eefcb03..569facbb 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp index a29c5b0f..f75a0da7 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp index b0e15bef..d40b9dd5 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_gelq2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of rows in the block rocsolver_gelq2_template(handle, jb, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < m) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -76,9 +76,9 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_gelq2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp index 91631008..cee74932 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp @@ -8,13 
+8,13 @@ template rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp index 13e0312f..a5581819 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp index 0cae47b0..249784a0 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp index 668fc8a0..485550d7 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on column j @@ -45,18 +45,18 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(min(j+1,m-1),j,lda), //vector x to work on - 1, strideA, //inc of x + 1, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the left + // Apply Householder reflector to the rest of matrix from the left if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp index ef67a2eb..70e765e8 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) 
-{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp index 26816634..e468de7e 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp index d941c762..b91aa412 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp index fcdb4935..e1a3adaf 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_geqr2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of columns in the block rocsolver_geqr2_template(handle, m-j, jb, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < n) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -75,9 +75,9 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_geqr2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp index 3ae16e6a..41bb01e6 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp @@ 
-8,13 +8,13 @@ template rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp index b3e3809d..bd670e1f 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp index 9b01a5af..d74da116 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int *ipiv, rocblas_int* info) -{ + rocblas_int *ipiv, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || lda < 1) @@ -41,25 +41,25 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp index 727a76c3..5630004e 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp @@ -44,14 +44,14 @@ inline __global__ void getf2_check_singularity(U AA, const rocblas_int shiftA, c template rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -69,7 +69,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipMemcpy(minoneInt, &minone, sizeof(T), hipMemcpyHostToDevice); //pivoting info in device (to avoid continuous synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -84,7 +84,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int //info=0 (starting with a nonsingular matrix) hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,info,batch_count,0); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** @@ -93,7 +93,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int // find pivot. Use Fortran 1-based indexing for the ipiv array as iamax does that as well! 
for (int b=0;b(AA,shiftA,b,strideA); - rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, + rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, (ipiv + shiftP + b*strideP + j)); } @@ -101,14 +101,14 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipLaunchKernelGGL(getf2_check_singularity, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, ipiv, shiftP, strideP, j, lda, pivotGPU, info); - // Swap pivot row and j-th row + // Swap pivot row and j-th row rocsolver_laswp_template(handle, n, A, shiftA, lda, strideA, j+1, j+1, ipiv, shiftP, strideP, 1, batch_count); // Compute elements J+1:M of J'th column for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (pivotGPU + b), - (M + idx2D(j + 1, j, lda)), oneInt); + rocblas_scal(handle, (m-j-1), (pivotGPU + b), + (M + idx2D(j + 1, j, lda)), oneInt); } // update trailing submatrix @@ -116,7 +116,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int for (int b=0;b(AA,shiftA,b,strideA); rocblas_ger(handle, m - j - 1, n - j - 1, minoneInt, - (M + idx2D(j + 1, j, lda)), oneInt, + (M + idx2D(j + 1, j, lda)), oneInt, (M + idx2D(j, j + 1, lda)), lda, (M + idx2D(j + 1, j + 1, lda)), lda); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp index bd9e7240..462e932d 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,25 +40,25 @@ rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp index ccb2d252..b3ea05e9 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp @@ -7,19 +7,19 @@ template rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) return rocblas_status_invalid_size; - + return rocsolver_getf2_template(handle,m,n, A,0, //the matrix is shifted 0 entries (will work on the entire matrix) @@ -39,25 +39,25 @@ rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp index 4a1c1b91..9b3bdf70 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, rocblas_int *ipiv, rocblas_int* info) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -40,25 +40,25 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp index f19138bb..395fd187 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp @@ -41,13 +41,13 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int *info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) + if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) return rocsolver_getf2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. 
**** @@ -92,14 +92,14 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** for (int j = 0; j < dim; j += GETRF_GETF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GETRF_GETF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_getf2_template(handle, m - j, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, ipiv, shiftP + j, strideP, iinfo, batch_count); - + // adjust pivot indices and check singularity sizePivot = min(m - j, jb); //number of pivots in the block - blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; + blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; gridPivot = dim3(blocksPivot, batch_count, 1); hipLaunchKernelGGL(getrf_check_singularity, gridPivot, threads, 0, stream, sizePivot, j, ipiv, shiftP + j, strideP, iinfo, info); @@ -131,7 +131,7 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int (M + idx2D(j + jb, j + jb, lda)), lda); } } - } + } } hipFree(pivotGPU); diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp index 5ed946d0..44317213 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m, - rocblas_int n, U A, rocblas_int lda, + rocblas_int n, U A, rocblas_int lda, rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -39,25 +39,25 @@ rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp index c1ef590b..35443146 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -36,25 +36,25 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp index 255e306c..435339c1 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T *A, const rocblas_int lda, - const rocblas_int *ipiv, T *B, const rocblas_int ldb) + const rocblas_int *ipiv, T *B, const rocblas_int ldb) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? 
- if (n < 0 || nrhs < 0 || lda < n || ldb < n) + if (n < 0 || nrhs < 0 || lda < n || ldb < n) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -45,7 +45,7 @@ rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operati extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, - const rocblas_int *ipiv, float *B, const rocblas_int ldb) + const rocblas_int *ipiv, float *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } @@ -53,21 +53,21 @@ rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const roc extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, - const rocblas_int *ipiv, double *B, const rocblas_int ldb) + const rocblas_int *ipiv, double *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_float_complex *A, const rocsolver_int lda, - const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) + const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp index 1209770f..e18816df 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp @@ -19,7 +19,7 @@ template rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int *ipiv, const rocblas_int strideP, U B, - const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { // quick return if (n == 0 || nrhs == 0 || batch_count == 0) { @@ -56,7 +56,7 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve L*X = B, overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, rocblas_diagonal_unit, n, nrhs, @@ -67,13 +67,13 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope trans, rocblas_diagonal_non_unit, n, nrhs, oneInt, Ap, lda, Bp, ldb); } - + } else { for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve U**T *X = B or U**H *X = B, overwriting B with X rocblas_trsm(handle, 
rocblas_side_left, rocblas_fill_upper, trans, rocblas_diagonal_non_unit, n, nrhs, diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp index dd2dbe6a..43d48ac5 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -44,7 +44,7 @@ rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } @@ -52,26 +52,26 @@ rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, c extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const 
rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp index 49ced525..e42302d3 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -40,7 +40,7 @@ rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } @@ -48,26 +48,26 @@ rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT 
rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp index 1ed3f0ee..0127cbe0 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp @@ -5,14 +5,14 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp index 4e1c3c91..518d202e 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp @@ -18,9 +18,9 @@ #include "common_device.hpp" #include "ideal_sizes.hpp" -template -__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, - const rocblas_int j, T *res, rocblas_int *info) +template +__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, + const rocblas_int j, T *res, rocblas_int *info) { int id = hipBlockIdx_x; @@ -45,10 +45,10 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; #ifdef batched @@ -70,7 +70,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //diagonal info in device (device memory workspace to avoid synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -95,7 +95,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(0, j, lda)), 1, (M + idx2D(0, j, lda)), 1, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -103,9 +103,9 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); 
rocblas_gemv(handle, rocblas_operation_transpose, j, n - j - 1, - d_minone, (M + idx2D(0, j + 1, lda)), lda, + d_minone, (M + idx2D(0, j + 1, lda)), lda, (M + idx2D(0, j, lda)), 1, d_one, (M + idx2D(j, j + 1, lda)), lda); - } + } for (int b=0;b(AA,shiftA,b,strideA); rocblas_scal(handle, n - j - 1, (pivotGPU + b), @@ -122,7 +122,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(j, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -130,7 +130,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_none, n - j - 1, j, - d_minone, (M + idx2D(j + 1, 0, lda)), lda, + d_minone, (M + idx2D(j + 1, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, d_one, (M + idx2D(j + 1, j, lda)), 1); } for (int b=0;b -rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp index 4988f364..4e88e448 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp index e0512eed..b8be605f 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp @@ -5,14 +5,14 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp index 1f1c6650..aef657d4 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp @@ -19,12 +19,12 @@ #include "ideal_sizes.hpp" #include "roclapack_potf2.hpp" -inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) +inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) { int id = hipBlockIdx_x; if (info[id] == 0 && iinfo[id] > 0) - info[id] = iinfo[id] + j; + info[id] = iinfo[id] + j; } template @@ -32,14 +32,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (n < POTRF_POTF2_SWITCHSIZE) + if (n < POTRF_POTF2_SWITCHSIZE) return rocsolver_potf2_template(handle, uplo, n, A, shiftA, lda, strideA, info, batch_count); #ifdef batched @@ -61,7 +61,7 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //info in device (device memory workspace to avoid synchronization with CPU) - rocblas_int *iinfo; + rocblas_int *iinfo; hipMalloc(&iinfo, sizeof(rocblas_int)*batch_count); hipStream_t stream; @@ -81,14 +81,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, if (uplo == rocblas_fill_upper) { // Compute the Cholesky factorization A = U'*U. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_transpose, rocblas_operation_none, @@ -112,14 +112,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, } else { // Compute the Cholesky factorization A = L'*L. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. 
hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_none, rocblas_operation_transpose, diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp index 7ac5061e..06dda30c 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp @@ -6,15 +6,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp index 2e49ab4b..6c081fc4 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in b/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in index 970adc43..8b6304e0 100644 --- a/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in +++ b/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in @@ -1,6 +1,6 @@ @PACKAGE_INIT@ - + set_and_check(rocsolver_INCLUDE_DIR @PACKAGE_INCLUDE_INSTALL_DIR@) set_and_check(rocsolver_INCLUDE_DIRS @PACKAGE_INCLUDE_INSTALL_DIR@) diff --git a/ROCm_Libraries/rocSPARSE/Doxyfile b/ROCm_Libraries/rocSPARSE/Doxyfile index e7a87a6b..0f6ee32e 100644 --- a/ROCm_Libraries/rocSPARSE/Doxyfile +++ b/ROCm_Libraries/rocSPARSE/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
-STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -244,7 +244,7 @@ ALIASES = # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -706,7 +706,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -781,7 +781,7 @@ INPUT = ROCm_Libraries/rocSPARSE/src/modules.dox \ ROCm_Libraries/rocSPARSE/src/rocsparse-functions_sed.h \ ROCm_Libraries/rocSPARSE/src/rocsparse-auxiliary_sed.h \ ROCm_Libraries/rocSPARSE/src/rocsparse-types.h \ - ROCm_Libraries/rocSPARSE/src/rocsparse.h + ROCm_Libraries/rocSPARSE/src/rocsparse.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -862,7 +862,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -878,7 +878,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -889,13 +889,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -915,7 +915,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -932,7 +932,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -941,7 +941,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -956,7 +956,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1068,7 +1068,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1094,7 +1094,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1138,7 +1138,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard @@ -1148,7 +1148,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1160,7 +1160,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1173,7 +1173,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1183,7 +1183,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1312,7 +1312,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1320,7 +1320,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1333,7 +1333,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1364,7 +1364,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1389,7 +1389,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1397,21 +1397,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. 
-QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1553,7 +1553,7 @@ MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1613,7 +1613,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1629,7 +1629,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1639,7 +1639,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1703,7 +1703,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1719,7 +1719,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1730,7 +1730,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1741,7 +1741,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1749,7 +1749,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). 
The PDF file will @@ -1851,14 +1851,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1903,7 +1903,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1922,7 +1922,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2016,7 +2016,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2057,7 +2057,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2065,7 +2065,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2075,7 +2075,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2084,7 +2084,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2113,13 +2113,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. 
-GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2168,14 +2168,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2224,7 +2224,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2368,26 +2368,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2395,12 +2395,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocr/Doxyfile b/ROCm_Libraries/rocr/Doxyfile index ecf24e6d..3d39ca34 100644 --- a/ROCm_Libraries/rocr/Doxyfile +++ b/ROCm_Libraries/rocr/Doxyfile @@ -164,7 +164,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -173,7 +173,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
-STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -240,13 +240,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -295,7 +295,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -649,7 +649,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -691,7 +691,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -704,7 +704,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -773,7 +773,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -868,7 +868,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -884,7 +884,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -895,13 +895,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). 
-EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -921,7 +921,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -938,7 +938,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -947,7 +947,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -962,7 +962,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1074,7 +1074,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1100,7 +1100,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1145,7 +1145,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1155,7 +1155,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1167,7 +1167,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1180,7 +1180,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. 
-HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1190,7 +1190,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1319,7 +1319,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1327,7 +1327,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1340,7 +1340,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1371,7 +1371,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1396,7 +1396,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1404,21 +1404,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1551,7 +1551,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. 
See the MathJax site @@ -1559,7 +1559,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1619,7 +1619,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1635,7 +1635,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1645,7 +1645,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1709,7 +1709,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1725,7 +1725,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1736,7 +1736,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1747,7 +1747,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1755,7 +1755,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1855,14 +1855,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. 
-RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1907,7 +1907,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2019,7 +2019,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2060,7 +2060,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2068,7 +2068,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2078,7 +2078,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2087,7 +2087,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2116,13 +2116,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2171,14 +2171,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. 
@@ -2227,7 +2227,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2371,26 +2371,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2398,12 +2398,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocr/src/README.md b/ROCm_Libraries/rocr/src/README.md index f06be976..c4a790c9 100644 --- a/ROCm_Libraries/rocr/src/README.md +++ b/ROCm_Libraries/rocr/src/README.md @@ -40,7 +40,7 @@ hsakmt.h header file must be available. The latest version of these files can be obtained from the ROCT-Thunk-Interface repository, available here: https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface - + Specify the directory containing libhsakmt.so.1 and hsakmt.h using the cmake variables, HSAKMT_LIB_PATH and HSAKMT_INC_PATH. These can be specified either on the command line or via standard cmake configuration tools such as ccmake or cmake-gui. @@ -52,7 +52,7 @@ For example, from the top level ROCR repository execute: -DHSAKMT_LIB_PATH:STRING= \ .. make - + alternately using ccmake: mkdir build diff --git a/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS b/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS index 4b417765..53b6b71e 100644 --- a/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS +++ b/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS @@ -7,7 +7,7 @@ are met: 2. Redistributions in binary form must reproduce the copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. The name of the author may not be used to endorse or promote products +3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR diff --git a/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake b/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake index 0530c87f..44a62e62 100644 --- a/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake +++ b/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake @@ -90,21 +90,21 @@ function ( get_version DEFAULT_VERSION_STRING ) parse_version ( ${DEFAULT_VERSION_STRING} ) ## find_program ( GIT NAMES git ) -## +## ## if ( GIT ) -## +## ## execute_process ( COMMAND git describe --tags --dirty --long ## WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ## OUTPUT_VARIABLE GIT_TAG_STRING ## OUTPUT_STRIP_TRAILING_WHITESPACE ## RESULT_VARIABLE RESULT ) -## +## ## if ( ${RESULT} EQUAL 0 ) -## +## ## parse_version ( ${GIT_TAG_STRING} ) -## +## ## endif () -## +## ## endif () set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) diff --git a/ROCm_Libraries/rocr/src/core/common/shared.h b/ROCm_Libraries/rocr/src/core/common/shared.h index dc33ac7d..5ca99d93 100644 --- a/ROCm_Libraries/rocr/src/core/common/shared.h +++ b/ROCm_Libraries/rocr/src/core/common/shared.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/agent.h b/ROCm_Libraries/rocr/src/core/inc/agent.h index 8a1b4050..0760df70 100644 --- a/ROCm_Libraries/rocr/src/core/inc/agent.h +++ b/ROCm_Libraries/rocr/src/core/inc/agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h b/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h index b7e63d03..07d5229d 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h b/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h index 181cd687..dc40421b 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h b/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h index af5de53d..dd994ef2 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp index c0cde933..bd181757 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h b/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h index db299842..fef245a3 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp index b3fcbc2f..06b577c2 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp index 4b90f0e2..0e743bb2 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp index 27830ff3..4254e4cc 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h b/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h index 08bb78d9..dad165f0 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_topology.h b/ROCm_Libraries/rocr/src/core/inc/amd_topology.h index f0c0eabc..8e62679d 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_topology.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_topology.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/blit.h b/ROCm_Libraries/rocr/src/core/inc/blit.h index 57189361..e7427e43 100644 --- a/ROCm_Libraries/rocr/src/core/inc/blit.h +++ b/ROCm_Libraries/rocr/src/core/inc/blit.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/checked.h b/ROCm_Libraries/rocr/src/core/inc/checked.h index 856d22ba..ea2b2122 100644 --- a/ROCm_Libraries/rocr/src/core/inc/checked.h +++ b/ROCm_Libraries/rocr/src/core/inc/checked.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h b/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h index b41a8161..f458deb1 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h b/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h index 236a165c..7b664003 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h b/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h index 8f1f7610..3b4151a5 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h b/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h index 8571c9a1..21081d0f 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/isa.h b/ROCm_Libraries/rocr/src/core/inc/isa.h index c5dba5f7..13fa38b8 100644 --- a/ROCm_Libraries/rocr/src/core/inc/isa.h +++ b/ROCm_Libraries/rocr/src/core/inc/isa.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/memory_region.h b/ROCm_Libraries/rocr/src/core/inc/memory_region.h index 6281413d..391a6607 100644 --- a/ROCm_Libraries/rocr/src/core/inc/memory_region.h +++ b/ROCm_Libraries/rocr/src/core/inc/memory_region.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/registers.h b/ROCm_Libraries/rocr/src/core/inc/registers.h index 39d86aec..d2bffb65 100644 --- a/ROCm_Libraries/rocr/src/core/inc/registers.h +++ b/ROCm_Libraries/rocr/src/core/inc/registers.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp index f1f235c2..846c0d71 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp index d97bebf7..adf1d207 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -187,12 +187,12 @@ hsa_status_t CpuAgent::IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, } hsa_status_t CpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { - + // agent, and vendor name size limit const size_t attribute_u = static_cast(attribute); - + switch (attribute_u) { - + // The code copies HsaNodeProperties.MarketingName a Unicode string // which is encoded in UTF-16 as a 7-bit ASCII string. 
The value of // HsaNodeProperties.MarketingName is obtained from the "model name" diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp index 9706ca07..3d1ddd9c 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp @@ -696,12 +696,12 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) { } hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { - + // agent, and vendor name size limit const size_t attribute_u = static_cast(attribute); - + switch (attribute_u) { - + // Build agent name by concatenating the Major, Minor and Stepping Ids // of devices compute capability with a prefix of "gfx" case HSA_AGENT_INFO_NAME: { @@ -873,7 +873,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY: *((uint32_t*)value) = memory_max_frequency_; break; - + // The code copies HsaNodeProperties.MarketingName a Unicode string // which is encoded in UTF-16 as a 7-bit ASCII string case HSA_AMD_AGENT_INFO_PRODUCT_NAME: { diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp index 14b2b4de..ce45d47f 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp index 97daa850..51bdbe19 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp b/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp index 1fc08ca8..b8db95ba 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -176,7 +176,7 @@ ExtensionEntryPoints::ExtensionEntryPoints() { // Initialize Finalizer function table to be NULLs void ExtensionEntryPoints::InitFinalizerExtTable() { - + // Initialize Version of Api Table finalizer_api.version.major_id = 0x00; finalizer_api.version.minor_id = 0x00; @@ -192,7 +192,7 @@ void ExtensionEntryPoints::InitFinalizerExtTable() { // Initialize Image function table to be NULLs void ExtensionEntryPoints::InitImageExtTable() { - + // Initialize Version of Api Table image_api.version.major_id = 0x00; image_api.version.minor_id = 0x00; @@ -224,16 +224,16 @@ void ExtensionEntryPoints::InitAmdExtTable() { // @note: Interface should be updated when Amd Ext table // begins hosting Api's from other extension libraries void ExtensionEntryPoints::UpdateAmdExtTable(void *func_ptr) { - + assert(hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn == - (decltype(hsa_amd_image_create)*)hsa_ext_null && + (decltype(hsa_amd_image_create)*)hsa_ext_null && "Duplicate load of extension import."); assert(hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn == - (decltype(hsa_amd_image_create)*)hsa_ext_null && + (decltype(hsa_amd_image_create)*)hsa_ext_null && "Duplicate load of extension import."); - hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = + hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = (decltype(hsa_amd_image_create)*)func_ptr; - hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = + hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = (decltype(hsa_amd_image_create)*)func_ptr; } @@ -265,7 +265,7 @@ bool ExtensionEntryPoints::LoadImage(std::string library_name) { return false; } libs_.push_back(lib); - + void* ptr; ptr = os::GetExportAddress(lib, "hsa_ext_image_get_capability_impl"); @@ -390,7 +390,7 @@ bool ExtensionEntryPoints::LoadImage(std::string library_name) { if (ptr != NULL) { UpdateAmdExtTable(ptr); } - + // Initialize Version of Api Table image_api.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; image_api.version.minor_id = sizeof(ImageExtTable); @@ -414,7 +414,7 @@ bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { return false; } libs_.push_back(lib); - + void* ptr; ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl"); @@ -469,12 +469,12 @@ bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { finalizer_api.hsa_ext_program_finalize_fn = (decltype(::hsa_ext_program_finalize)*)ptr; } - + // Initialize Version of Api Table finalizer_api.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; finalizer_api.version.minor_id = sizeof(::FinalizerExtTable); finalizer_api.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; - + // Update handle of table of HSA extensions hsa_internal_api_table_.CloneExts(&finalizer_api, core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); diff --git a/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp b/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp index 4342decc..b2fb6a3a 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp @@ -193,7 +193,7 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( value = atomic::Load(&signal_.value, std::memory_order_relaxed); return hsa_signal_value_t(value); } - + if (wait_hint == HSA_WAIT_STATE_ACTIVE) { continue; } diff --git a/ROCm_Libraries/rocr/src/core/runtime/isa.cpp b/ROCm_Libraries/rocr/src/core/runtime/isa.cpp index 7c9768c1..bc916ea8 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/isa.cpp 
+++ b/ROCm_Libraries/rocr/src/core/runtime/isa.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp b/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp index d4896424..f381305b 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp @@ -374,7 +374,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { /* GPU-GPU - functional support, not a performance path. - + This goes through system memory because we have to support copying between non-peer GPUs and we can't use P2P pointers even if the GPUs are peers. Because hsa_amd_agents_allow_access requires the caller to specify all allowed agents we can't assume that a peer mapped pointer diff --git a/ROCm_Libraries/rocr/src/core/runtime/signal.cpp b/ROCm_Libraries/rocr/src/core/runtime/signal.cpp index fa24c421..e0890a47 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/signal.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/signal.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. 
// - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h b/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h index 69a2a58a..c162629c 100644 --- a/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h +++ b/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp b/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp index 24974185..86be5524 100644 --- a/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp +++ b/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. 
// - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/locks.h b/ROCm_Libraries/rocr/src/core/util/locks.h index 4b13c1e9..0a593667 100644 --- a/ROCm_Libraries/rocr/src/core/util/locks.h +++ b/ROCm_Libraries/rocr/src/core/util/locks.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/os.h b/ROCm_Libraries/rocr/src/core/util/os.h index 51031786..57b3eb2e 100644 --- a/ROCm_Libraries/rocr/src/core/util/os.h +++ b/ROCm_Libraries/rocr/src/core/util/os.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. 
-// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/small_heap.cpp b/ROCm_Libraries/rocr/src/core/util/small_heap.cpp index 6cd8e117..8c3b8560 100644 --- a/ROCm_Libraries/rocr/src/core/util/small_heap.cpp +++ b/ROCm_Libraries/rocr/src/core/util/small_heap.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/small_heap.h b/ROCm_Libraries/rocr/src/core/util/small_heap.h index d9064bba..824f5681 100644 --- a/ROCm_Libraries/rocr/src/core/util/small_heap.h +++ b/ROCm_Libraries/rocr/src/core/util/small_heap.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -51,7 +51,7 @@ #include #include "utils.h" - + class SmallHeap { private: struct Node; diff --git a/ROCm_Libraries/rocr/src/core/util/timer.cpp b/ROCm_Libraries/rocr/src/core/util/timer.cpp index a2cf13fb..f4476c11 100644 --- a/ROCm_Libraries/rocr/src/core/util/timer.cpp +++ b/ROCm_Libraries/rocr/src/core/util/timer.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/timer.h b/ROCm_Libraries/rocr/src/core/util/timer.h index 914bda34..42179956 100644 --- a/ROCm_Libraries/rocr/src/core/util/timer.h +++ b/ROCm_Libraries/rocr/src/core/util/timer.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/utils.h b/ROCm_Libraries/rocr/src/core/util/utils.h index f7f09e9d..312bf044 100755 --- a/ROCm_Libraries/rocr/src/core/util/utils.h +++ b/ROCm_Libraries/rocr/src/core/util/utils.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h index bfb613ec..96b604ce 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h index 60f0c6d4..0f6003dd 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h index 6c2742a6..34e81b97 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h index 57aa1adc..deefc8f0 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/hsa.h b/ROCm_Libraries/rocr/src/inc/hsa.h index 3979219a..65db804a 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa.h +++ b/ROCm_Libraries/rocr/src/inc/hsa.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -632,7 +632,7 @@ hsa_status_t HSA_API hsa_system_major_extension_supported( uint16_t version_major, uint16_t *version_minor, bool* result); - + /** * @deprecated @@ -711,7 +711,7 @@ hsa_status_t HSA_API hsa_system_get_major_extension_table( uint16_t extension, uint16_t version_major, size_t table_length, - void *table); + void *table); /** * @brief Struct containing an opaque handle to an agent, a device that participates in @@ -1283,7 +1283,7 @@ hsa_status_t HSA_API hsa_agent_major_extension_supported( uint16_t version_major, uint16_t *version_minor, bool* result); - + /** @} */ diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h b/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h index add80e52..ca9a23d6 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h @@ -714,7 +714,7 @@ typedef enum { HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, /** * This memory_pool can be made directly accessible by all the agents in the - * system (::hsa_amd_agent_memory_pool_get_info does not return + * system (::hsa_amd_agent_memory_pool_get_info does not return * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this * attribute is bool. */ diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h b/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h index 014e49bf..1aeb92d0 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h b/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h index de358c3d..d64de9d2 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -54,7 +54,7 @@ #ifdef __cplusplus extern "C" { -#endif /*__cplusplus*/ +#endif /*__cplusplus*/ /** \defgroup ext-images Images and Samplers * @{ @@ -267,7 +267,7 @@ typedef enum { * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants. */ typedef uint32_t hsa_ext_image_channel_type32_t; - + /** * * @brief Channel order associated with the elements of an image. See @@ -303,7 +303,7 @@ typedef enum { * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants. */ typedef uint32_t hsa_ext_image_channel_order32_t; - + /** * @brief Image format. @@ -1170,7 +1170,7 @@ typedef enum { * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants. */ typedef uint32_t hsa_ext_sampler_coordinate_mode32_t; - + /** * @brief Sampler filter modes. See the Filter Mode section @@ -1446,9 +1446,9 @@ typedef struct hsa_ext_images_1_pfn_s { } hsa_ext_images_1_pfn_t; /** @} */ - + #ifdef __cplusplus } // end extern "C" block -#endif /*__cplusplus*/ +#endif /*__cplusplus*/ #endif diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h b/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h index f087709d..184fc654 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h @@ -75,7 +75,7 @@ uint32_t hsa_ven_amd_aqlprofile_version_minor(); // output data. // // Returned status: -// hsa_status_t – HSA status codes are used from hsa.h header +// hsa_status_t - HSA status codes are used from hsa.h header // // Supported profiling features: // @@ -91,7 +91,7 @@ typedef enum { // Supported performance counters (PMC) blocks // The block ID is the same for a block instances set, for example -// each block instance from the TCC block set, TCC0, TCC1, …, TCCN +// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN // will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCKS_TCC. typedef enum { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0, @@ -132,8 +132,8 @@ typedef enum { } hsa_ven_amd_aqlprofile_block_name_t; // PMC event object structure -// ‘counter_id’ value is specified in GFXIPs perfcounter user guides -// which is the counters select value, “Performance Counters Selection” +// 'counter_id' value is specified in GFXIPs perfcounter user guides +// which is the counters select value, "Performance Counters Selection" // chapter. 
typedef struct { hsa_ven_amd_aqlprofile_block_name_t block_name; @@ -242,7 +242,7 @@ hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4( // Get profile info: // Generic method for getting various profile info including profile buffers // attributes like the command buffer size and the profiling PMC results. -// It’s implied that all counters are 64bit values. +// It's implied that all counters are 64bit values. // // Profile generic output data: typedef struct { diff --git a/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp b/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp index d24e1984..b3f33949 100644 --- a/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp +++ b/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp @@ -1550,7 +1550,7 @@ namespace amd { } } - GElfStringTable* GElfImage::addStringTable(const std::string& name) + GElfStringTable* GElfImage::addStringTable(const std::string& name) { GElfStringTable* stab = new GElfStringTable(this); sections.push_back(std::unique_ptr(stab)); diff --git a/ROCm_Libraries/rocr/src/loader/loaders.hpp b/ROCm_Libraries/rocr/src/loader/loaders.hpp index 94b3ceca..40540f8f 100644 --- a/ROCm_Libraries/rocr/src/loader/loaders.hpp +++ b/ROCm_Libraries/rocr/src/loader/loaders.hpp @@ -70,7 +70,7 @@ namespace loader { void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override; bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override; - + void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override; void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; diff --git a/ROCm_Network_Based_Programing/ROCm_RDMA.rst b/ROCm_Network_Based_Programing/ROCm_RDMA.rst index 725a36f6..4ea47661 100644 --- a/ROCm_Network_Based_Programing/ROCm_RDMA.rst +++ b/ROCm_Network_Based_Programing/ROCm_RDMA.rst @@ -1,7 +1,7 @@ RDMA OpenMPI MPICH -GasNet +GasNet OpenSHEMM -Chapel +Chapel UPC++ diff --git a/ROCm_Solutions/ROCr_Error_Codes.rst b/ROCm_Solutions/ROCr_Error_Codes.rst index f63771f1..096ce2c4 100644 --- a/ROCm_Solutions/ROCr_Error_Codes.rst +++ b/ROCm_Solutions/ROCr_Error_Codes.rst @@ -12,7 +12,7 @@ HSA Runtime Queue Error Codes +-----------+-------------------------+ | 64 | Group is too large | +-----------+-------------------------+ -| 128 | Out of VGPR’s | +| 128 | Out of VGPR's | +-----------+-------------------------+ | 0x80000000| Debug Trap | +-----------+-------------------------+ diff --git a/ROCm_System_Managment/ROCm-System-Managment.rst b/ROCm_System_Managment/ROCm-System-Managment.rst index b7912458..c0ed32dd 100644 --- a/ROCm_System_Managment/ROCm-System-Managment.rst +++ b/ROCm_System_Managment/ROCm-System-Managment.rst @@ -58,8 +58,8 @@ usage: rocm-smi [-h] [-d DEVICE [DEVICE ...]] [--alldevices] [--showhw] [-a] [-i =================================== =================================================================================== -h, --help show this help message and exit --gpureset Reset specified GPU (One GPU must be specified) - --load FILE Load Clock, Fan, Performance and Profile settings - --save FILE Save Clock, Fan, Performance and Profile settings + --load FILE Load Clock, Fan, Performance and Profile settings + --save FILE Save Clock, Fan, Performance and Profile settings =================================== 
=================================================================================== @@ -186,8 +186,8 @@ If the level ends with a %, the fan speed is calculated as pct*maxlevel/100 (max .. NOTES:: This option can be used in conjunction with the --setsclk/--setmclk mask - - Operating the GPU outside of specifications can cause irreparable damage to your hardware + + Operating the GPU outside of specifications can cause irreparable damage to your hardware Please observe the warning displayed when using this option This flag automatically sets the clock to the highest level, as only the highest level is increased by the OverDrive value @@ -231,16 +231,16 @@ If the level ends with a %, the fan speed is calculated as pct*maxlevel/100 (max **Clock Type Descriptions** -DCEFCLK - DCE (Display) FCLK - Data fabric (VG20 and later) - Data flow from XGMI, Memory, PCIe SCLK - GFXCLK (Graphics core) +DCEFCLK - DCE (Display) FCLK - Data fabric (VG20 and later) - Data flow from XGMI, Memory, PCIe SCLK - GFXCLK (Graphics core) .. Note:: SOCCLK split from SCLK as of Vega10. Pre-Vega10 they were both controlled by SCLK -MCLK - GPU Memory (VRAM) PCLK - PCIe bus +MCLK - GPU Memory (VRAM) PCLK - PCIe bus -.. Note:: +.. Note:: This gives 2 speeds, PCIe Gen1 x1 and the highest available based on the hardware @@ -346,12 +346,12 @@ All entries (except name) are optional, and should only be created in a given dr ********************* - Global attributes + Global attributes ********************* ================ ============================================================================================ name | The chip name.This should be a short, lowercase string, not containing whitespace, - | dashes, or the wildcard character '*'.This attribute represents the chip name. + | dashes, or the wildcard character '*'.This attribute represents the chip name. | It is the only mandatory attribute.I2C devices get this attribute created automatically. | RO @@ -363,28 +363,28 @@ update_interval | The interval at which the chip will update readings. ================ ============================================================================================ ************ - Voltages + Voltages ************ ====================== =============================================================================================== in[0-*]_min | Voltage min value. | Unit: millivolt | RW - + in[0-*]_lcrit | Voltage critical min value. | Unit: millivolt | RW | If voltage drops to or below this limit, the system may take drastic action such as power | down or reset. At the very least, it should report a fault. - + in[0-*]_max | Voltage max value. | Unit: millivolt | RW - + in[0-*]_crit | Voltage critical max value. | Unit: millivolt | RW - | If voltage reaches or exceeds this limit, the system may take drastic action such as power + | If voltage reaches or exceeds this limit, the system may take drastic action such as power | down or reset. At the very least, it should report a fault. in[0-*]_input | Voltage input value. @@ -392,8 +392,8 @@ in[0-*]_input | Voltage input value. | RO | Voltage measured on the chip pin.Actual voltage depends on the scaling resistors on the | motherboard, as recommended in the chip datasheet.This varies by chip and by motherboard. 
- | Because of this variation, values are generally NOT scaled by the chip driver, and must be - | done by the application.However, some drivers (notably lm87 and via686a) do scale, because + | Because of this variation, values are generally NOT scaled by the chip driver, and must be + | done by the application.However, some drivers (notably lm87 and via686a) do scale, because | of internal resistors built into a chip.These drivers will output the actual voltage. Rule of | thumb: drivers should report the voltage values at the "pins" of the chip. @@ -432,10 +432,10 @@ cpu[0-*]_vid | CPU core reference voltage. | RO | Not always correct. -vrm | Voltage Regulator Module version number. +vrm | Voltage Regulator Module version number. | RW (but changing it should no more be necessary) | Originally the VRM standard version multiplied by 10, but now an arbitrary number, as not - | all standards have a version number.Affects the way the driver calculates the CPU core + | all standards have a version number.Affects the way the driver calculates the CPU core | reference voltage from the vid pins. ====================== =============================================================================================== @@ -443,7 +443,7 @@ Also see the Alarms section for status flags associated with voltages. ******** - Fans + Fans ******** =============== ============================================================================================= @@ -470,9 +470,9 @@ fan[1-*]_div | Fan divisor. fan[1-*]_pulses | Number of tachometer pulses per fan revolution. | Integer value, typically between 1 and 4. | RW - | This value is a characteristic of the fan connected to the device's input, - | so it has to be set in accordance with the fan model.Should only be created - | if the chip has a register to configure the number of pulses. In the absence + | This value is a characteristic of the fan connected to the device's input, + | so it has to be set in accordance with the fan model.Should only be created + | if the chip has a register to configure the number of pulses. In the absence | of such a register (and thus attribute) the value assumed by all devices is 2 pulses | per fan revolution. @@ -484,7 +484,7 @@ fan[1-*]_target | Desired fan speed fan[1-*]_label | Suggested fan channel label. | Text string - | Should only be created if the driver has hints about what this fan channel is being + | Should only be created if the driver has hints about what this fan channel is being | used for, and user-space doesn't.In all other cases, the label is provided by user-space. | RO @@ -499,13 +499,13 @@ Also see the Alarms section for status flags associated with fans. ******* - PWM + PWM ******* - + +--------------------------------------+-----------------------------------------------------------------------------------------+ | pwm[1-*] | | Pulse width modulation fan control. | | | | Integer value in the range 0 to 255 | -| | | RW | +| | | RW | | | | 255 is max or 100%. | +--------------------------------------+-----------------------------------------------------------------------------------------+ | pwm[1-*]_enable | | Fan speed control method: | @@ -542,7 +542,7 @@ value (fastest fan speed) wins. **************** - Temperatures + Temperatures **************** ========================= ========================================================================================== @@ -589,7 +589,7 @@ temp[1-*]_crit_hyst | Temperature hysteresis value for critical limit. 
| Must be reported as an absolute temperature, NOT a delta from the critical value. | RW -temp[1-*]_emergency | Temperature emergency max value, for chips supporting more than two upper +temp[1-*]_emergency | Temperature emergency max value, for chips supporting more than two upper | temperature limits. Must be equal or greater than corresponding temp_crit values. | Unit: millidegree Celsius | RW @@ -613,8 +613,8 @@ temp[1-*]_offset | Temperature offset which is added to the temperature | Read/Write value. temp[1-*]_label | Suggested temperature channel label. - | Text string Should only be created if the driver has hints about what this temperature - | channel is being used for, and user-space doesn't. In all other cases, the label is + | Text string Should only be created if the driver has hints about what this temperature + | channel is being used for, and user-space doesn't. In all other cases, the label is | provided by user-space. | RO @@ -645,7 +645,7 @@ Also see the Alarms section for status flags associated with temperatures. ************ - Currents + Currents ************ ======================= ======================================================== @@ -697,7 +697,7 @@ curr[1-*]_enable | Enable or disable the sensors Also see the Alarms section for status flags associated with currents. ********* - Power + Power ********* ================================ =============================================================================== @@ -705,7 +705,7 @@ power[1-*]_average | Average power use | Unit: microWatt | RO -power[1-*]_average_interval | Power use averaging interval. A poll notification is sent to this +power[1-*]_average_interval | Power use averaging interval. A poll notification is sent to this | file if the hardware changes the averaging interval. | Unit: milliseconds | RW @@ -756,8 +756,8 @@ power[1-*]_accuracy | Accuracy of the power meter. | Unit: Percent | RO -power[1-*]_cap | If power use rises above this limit, the system should take action to - | reduce power use.A poll notification is sent to this file if the cap is +power[1-*]_cap | If power use rises above this limit, the system should take action to + | reduce power use.A poll notification is sent to this file if the cap is | changed by the hardware.The *_cap files only appear if the cap is known | to be enforced by hardware. | Unit: microWatt @@ -796,7 +796,7 @@ power[1-*]_enable | Enable or disable the sensors. Also see the Alarms section for status flags associated with power readings. ********** - Energy + Energy ********** ==================== ======================== @@ -812,7 +812,7 @@ energy[1-*]_enable | Enable or disable the sensors ==================== ======================== ************ - Humidity + Humidity ************ ==================== =========================================== @@ -828,7 +828,7 @@ humidity[1-*]_enable | Enable or disable the sensors ==================== =========================================== ********** - Alarms + Alarms ********** Each channel or limit may have an associated alarm file, containing a @@ -839,13 +839,13 @@ limit-related alarms, not both. The driver should just reflect the hardware implementation. 
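As a quick aside before the alarm attribute tables below: all of these hwmon attributes are plain-text integers in sysfs, so an application can consume them with ordinary file reads. The following is a minimal sketch (not part of the original tooling described here) that reads one temperature channel and its critical-alarm flag; the hwmon index, the channel number, and the presence of the alarm file are assumptions that vary by driver and board.

.. code:: cpp

    // Minimal sketch: read a hwmon temperature channel and its alarm flag.
    // The hwmon index (hwmon0) and channel number (temp1) are assumptions;
    // real applications must discover them at runtime.
    #include <fstream>
    #include <iostream>
    #include <string>

    static long read_attr(const std::string &path)
    {
        std::ifstream f(path);
        long value = 0;
        f >> value;                       // hwmon attributes are plain integers
        return value;
    }

    int main()
    {
        const std::string hwmon = "/sys/class/hwmon/hwmon0/";

        long temp  = read_attr(hwmon + "temp1_input");       // millidegree Celsius
        long alarm = read_attr(hwmon + "temp1_crit_alarm");  // boolean, may be absent

        std::cout << "temp1: " << temp / 1000.0 << " C, critical alarm: "
                  << (alarm ? "yes" : "no") << std::endl;
        return 0;
    }

The alarm files themselves are documented next.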
+---------------------+------------------+ -| | in[0-*]_alarm | | Channel alarm | +| | in[0-*]_alarm | | Channel alarm | | | curr[1-*]_alarm | | 0: no alarm | | | power[1-*]_alarm | | 1: alarm | | | fan[1-*]_alarm | | RO | -| | temp[1-*]_alarm | | +| | temp[1-*]_alarm | | +---------------------+------------------+ - + OR +----------------------------+---------------+ @@ -868,7 +868,7 @@ OR | | temp[1-*]_crit_alarm | | | | temp[1-*]_emergency_alarm| | +----------------------------+---------------+ - + Each input channel may have an associated fault file. This can be used to notify open diodes, unconnected fans etc. where the hardware supports it. When this boolean has value 1, the measurement for that @@ -878,23 +878,23 @@ channel should not be trusted. | | fan[1-*]_fault | | Input fault condition | | | temp[1-*]_fault | | 0: no fault occurred | | | | 1: fault condition | -| | | RO | +| | | RO | +-------------------+-------------------------+ - + Some chips also offer the possibility to get beeped when an alarm occurs: +-----------------+----------------------+ | beep_enable | | Master beep enable | -| | | 0: no beeps | -| | | 1: beeps | -| | | RW | +| | | 0: no beeps | +| | | 1: beeps | +| | | RW | +-----------------+----------------------+ -| | in[0-*]_beep | | Channel beep | +| | in[0-*]_beep | | Channel beep | | | curr[1-*]_beep| | 0: disable | | | fan[1-*]_beep | | 1: enable | | | temp[1-*]_beep| | RW | -+-----------------+----------------------+ - ++-----------------+----------------------+ + In theory, a chip could provide per-limit beep masking, but no such chip was seen so far. @@ -926,7 +926,7 @@ beep_mask | Bitmask for beep. *********************** - Intrusion detection + Intrusion detection *********************** ======================= =========================================================== @@ -959,8 +959,8 @@ samples | Sets number of average samples for all types of measurements. | RW in_samples | Sets number of average samples for specific type of measurements. -power_samples | Note that on some devices it won't be possible to set all of -curr_samples | them to different values so changing one might also change +power_samples | Note that on some devices it won't be possible to set all of +curr_samples | them to different values so changing one might also change curr_samples | some others. | RW @@ -1021,10 +1021,10 @@ Example2, fan divider setting, valid values 2, 4 and 8: /* write v to register */ ********* -Performance +Performance ********* -The pcie_bw sysfs file will report the usage of the PCIe bus over the last second, as a string with 3 integers: "bytes-received bytes-sent mps" . As there is no efficient way to calculate the size of each packet transmitted to and from the GPU in real time, the maximum payload size (mps), or the largest size of a PCIe packet, is included. The estimated bandwidth can then be calculated using by "bytes-received*mps + bytes-sent*mps" sed and multiplied by the number of packets received and sent. +The pcie_bw sysfs file will report the usage of the PCIe bus over the last second, as a string with 3 integers: "bytes-received bytes-sent mps" . As there is no efficient way to calculate the size of each packet transmitted to and from the GPU in real time, the maximum payload size (mps), or the largest size of a PCIe packet, is included. The estimated bandwidth can then be calculated using by "bytes-received*mps + bytes-sent*mps" sed and multiplied by the number of packets received and sent. 
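To make the bandwidth calculation above concrete, the sketch below parses the three integers and applies the formula quoted in the text. This is only an illustration under stated assumptions: the sysfs path depends on the card index and driver version, and the result is an upper-bound estimate because every counted packet is assumed to carry the full maximum payload size (mps).

.. code:: cpp

    // Hedged sketch: estimate PCIe bandwidth from the pcie_bw sysfs file.
    // The path below is an assumption; it varies with the card index.
    #include <fstream>
    #include <iostream>

    int main()
    {
        std::ifstream f("/sys/class/drm/card0/device/pcie_bw");
        unsigned long long received = 0, sent = 0, mps = 0;
        if (!(f >> received >> sent >> mps)) {
            std::cerr << "pcie_bw not available\n";
            return 1;
        }
        // The counters cover the last second, so the estimate is in bytes
        // per second: bytes-received*mps + bytes-sent*mps, as described above.
        unsigned long long estimate = received * mps + sent * mps;
        std::cout << "Estimated PCIe bandwidth: " << estimate << " bytes/s\n";
        return 0;
    }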
KFD Topology ============== @@ -1032,7 +1032,7 @@ KFD Topology Application software needs to understand the properties of the underlying hardware to leverage the performance capabilities of the platform for feature utilization and task scheduling. The sysfs topology exposes this information in a loosely hierarchal order. The information is populated by the KFD driver is gathered from ACPI (CRAT) and AMDGPU base driver. -| The sysfs topology is arranged hierarchically as following. The root directory of the topology is +| The sysfs topology is arranged hierarchically as following. The root directory of the topology is | **/sys/devices/virtual/kfd/kfd/topology/nodes/** Based on the platform inside this directory there will be sub-directories corresponding to each HSA Agent. A system with N HSA Agents will have N directories as shown below. @@ -1053,12 +1053,12 @@ This is available in the root directory of the HSA agent. This provides informat Memory ******** -The memory bank information attached to this agent is populated in “mem_banks” subdirectory. +The memory bank information attached to this agent is populated in "mem_banks" subdirectory. /sys/devices/virtual/kfd/kfd/topology/nodes/N/mem_banks Cache ******** -The caches available for this agent is populated in “cache” subdirectory +The caches available for this agent is populated in "cache" subdirectory /sys/devices/virtual/kfd/kfd/topology/nodes/N/cache IO-LINKS @@ -1069,7 +1069,7 @@ How to use topology information ********************************* The information provided in sysfs should not be directly used by application software. Application software should always use Thunk library API (libhsakmt) to access topology information. Please refer to Thunk API for more information. -The data are associated with a node ID, forming a per-node element list which references the elements contained at relative offsets within that list. A node associates with a kernel agent or agent. Node ID’s should be 0-based, with the “0” ID representing the primary elements of the system (e.g., “boot cores”, memory) if applicable. The enumeration order and—if applicable—values of the ID should match other information reported through mechanisms outside of the scope of the requirements; +The data are associated with a node ID, forming a per-node element list which references the elements contained at relative offsets within that list. A node associates with a kernel agent or agent. Node ID's should be 0-based, with the "0" ID representing the primary elements of the system (e.g., "boot cores", memory) if applicable. The enumeration order and--if applicable--values of the ID should match other information reported through mechanisms outside of the scope of the requirements; For example, the data and enumeration order contained in the ACPI SRAT table on some systems should match the memory order and properties reported through HSA. Further detail is out of the scope of the System Architecture and outlined in the Runtime API specification. @@ -1079,7 +1079,7 @@ Each of these nodes is interconnected with other nodes in more advanced systems .. image:: More_advanced_topology.png -Where applicable, the node grouping of physical memory follows NUMA principles to leverage memory locality in software when multiple physical memory blocks are available in the system and agents have a different “access cost” (e.g., bandwidth/latency) to that memory. 
+Where applicable, the node grouping of physical memory follows NUMA principles to leverage memory locality in software when multiple physical memory blocks are available in the system and agents have a different "access cost" (e.g., bandwidth/latency) to that memory. **KFD Topology structure for AMDGPU :** @@ -1110,7 +1110,7 @@ This can used by cooperating applications to effectively allocate GPU/GCDs among Device cgroup *************** -At a system administration level, the GPU/GCD isolation is possible using the device control group (cgroup). For all the AMD GPUs in a compute node, the ROCk-Kernel-Driver exposes a single compute device file /dev/kfd and a separate (Direct Rendering Infrastructure) render device files /dev/dri/renderDN for each device. To participate in the Linux kernel’s cgroup infrastructure, the ROCk driver relies on the render device files. +At a system administration level, the GPU/GCD isolation is possible using the device control group (cgroup). For all the AMD GPUs in a compute node, the ROCk-Kernel-Driver exposes a single compute device file /dev/kfd and a separate (Direct Rendering Infrastructure) render device files /dev/dri/renderDN for each device. To participate in the Linux kernel's cgroup infrastructure, the ROCk driver relies on the render device files. For example, consider a compute node with the two AMD GPUs. The ROCk-Kernel-Driver exposes the following device files: @@ -1122,9 +1122,9 @@ crw-rw---- 1 root video 226, 129 Apr 22 10:31 /dev/dri/renderD129 A ROCm application running on this compute node can use both GPUs only if it has access to all the above-listed device files. The administrator can restrict the devices an application can access by using device cgroup. The device cgroup subsystem allows or denies access to devices by applications in a cgroup. If a cgroup has whitelisted only /dev/kfd and /dev/dri/renderD129, then applications in that cgroup will have access only to that single GPU. -Refer to the Linux kernel's cgroup documentation for information on how to create a cgroup and whitelist devices. +Refer to the Linux kernel's cgroup documentation for information on how to create a cgroup and whitelist devices. -For cgroup-v1, refer https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt +For cgroup-v1, refer https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt For cgroup-v2, refer https://www.kernel.org/doc/Documentation/cgroup-v2.txt diff --git a/ROCm_System_Managment/topo1.rst b/ROCm_System_Managment/topo1.rst index 2a0050e8..c3097968 100644 --- a/ROCm_System_Managment/topo1.rst +++ b/ROCm_System_Managment/topo1.rst @@ -9,7 +9,7 @@ sysfs-class-kfd-topology | Description: Gives the details of system platform -| What: /sys/class/kfd/topology/platform_oem +| What: /sys/class/kfd/topology/platform_oem | Date: may 2018 | KernelVersion: 4.13 | description: This field gives the OEM(original equipment manufacturer) ID. 
Identifies HSA platform, reflects the OEMID in the CRAT diff --git a/ROCm_System_Managment/topo2.rst b/ROCm_System_Managment/topo2.rst index 96da2168..1293cfc3 100644 --- a/ROCm_System_Managment/topo2.rst +++ b/ROCm_System_Managment/topo2.rst @@ -13,96 +13,96 @@ sysfs-class-kfd-topology-nodes-N | Date: May 2018 | KernelVersion: 4.13 | Description: Here the number of smid (Single Instruction Multiple Data architecture) processes count is registered - + | What: /sys/class/kfd/topology/nodes/N/mem_banks_count | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives the Number of discoverable memory bank affinity properties on this "H-NUMA" node - + | What: /sys/class/kfd/topology/nodes/N/caches_count | Date: May 2018 | KernelVersion: 4.13 | Description: Gives the Number of discoverable cache affinity properties on the "H-NUMA" node. - + | What: /sys/class/kfd/topology/nodes/N/io_links_count | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives the number of discoverable IO link affinity properties of this node connecting to other nodes. - + | What: /sys/class/kfd/topology/nodes/N/cpu_cores_id | Date: May 2018 | KernelVersion: 4.13 | Description: Gives the CPU core id details corresponding to core count - + | What: /sys/class/kfd/topology/nodes/N/simd_id_base | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives simd id value. - + | What: /sys/class/kfd/topology/nodes/N/max_waves_per_simd -| Date: May 2018 +| Date: May 2018 | KernelVersion: 4.13 | Description: This identifies the maximum number of launched waves per SIMD. If NUmSIMDCores is 0, this value is ignored - + | What: /sys/class/kfd/topology/nodes/N/gds_size_in_kb | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives the size of Global Data Store in Kilobytes shared across SIMD Wavefronts, typically 32 or 64 - + | What: /sys/class/kfd/topology/nodes/N/wave_front_size | Date: May 2018 | KernelVersion: 4.13 | Description: wavefront is group of threads (work-item) that execute together for executing kernels and this field gives the size of the wavefront used. Usually 64or 32 or a different value for some HSA based architectures - + | What: /sys/class/kfd/topology/nodes/N/array_count | Date: May 2018 | KernelVersion: 4.13 | Description: This field give Number of SIMD Arrays per Engine - + | What: /sys/class/kfd/topology/nodes/N/simd_arrays_per_engine | Date: May 2018 | KernelVersion: 4.13 | Description: It gives the simd array count for every compute unite (stream engine) -| +| | What: /sys/class/kfd/topology/nodes/N/cu_per_simd_array | Date: May 2018 | KernelVersion: 4.13 | Description: Gives the Number of Compute Units (CU) per SIMD Array -| +| | What: /sys/class/kfd/topology/nodes/N/simd_per_cu | Date: May 2018 -| KernelVersion: 4.13 +| KernelVersion: 4.13 | Description: Number of SIMD representing a Compute Unit (CU) -| +| | What: /sys/class/kfd/topology/nodes/N/max_slots_scratch_cu | Date: May 2018 | KernelVersion: 4.13 | Description: Bitmask of available CU slots, used for CU mask setup for the queues if assignment is desired by application necessary. 
-| +| | What: /sys/class/kfd/topology/nodes/N/vendor_id | Date: May 2018 | KernelVersion: 4.13 | Description: This field contains the GPU vendor id; 0 on CPU-only nodes -| +| | What: /sys/class/kfd/topology/nodes/N/device_id | Date: May 2018 | KernelVersion: 4.13 | Description: This field contains the GPU device id; 0 on CPU-only nodes -| +| | What: /sys/class/kfd/topology/nodes/N/location_id | Date: May 2018 | KernelVersion: 4.13 | Description: LocationId, 32bit value, equivalent to BDF_ID used by Linux tools especially (identifies device in the overall system) -| +| | What: /sys/class/kfd/topology/nodes/N/drm_render_minor | Date: May 2018 | KernelVersion: 4.13 -| Description: drm (Direct Rendering Manager) render data count is shown -| +| Description: drm (Direct Rendering Manager) render data count is shown +| | What: /sys/class/kfd/topology/nodes/N/max_engine_clk_ccompute | Date: May 2018 | KernelVersion: 4.13 | Description: Maximum engine clock speed of the CPU -| +| diff --git a/ROCm_Tools/HCC-Native-GCN-ISA.rst b/ROCm_Tools/HCC-Native-GCN-ISA.rst index bd8c14d0..fff16500 100644 --- a/ROCm_Tools/HCC-Native-GCN-ISA.rst +++ b/ROCm_Tools/HCC-Native-GCN-ISA.rst @@ -41,7 +41,7 @@ Then install all other dependencies in order to build HCC from source: :: sudo apt-get install cmake git libelf-dev libc++abi-dev libc++-dev libdwarf-dev re2c libncurses5-dev patch wget file xz-utils libc6- dev-i386 python build-essential - + **CMake** If you are using Ubuntu 14.04, you would also need to upgrade to a newer version (>=3.0) of CMake as the version distributed by the distro is old for building clang/llvm. @@ -75,7 +75,7 @@ Install other development tools: :: sudo dnf groupinstall "Development Tools" - + **libc++ & libc++abi** HCC has a dependency on libc++ and libc++abi; however, Fedora/RHEL/CentOS don't provide a working binary package so you will to build them from source by following the instructions `here `_ @@ -122,7 +122,7 @@ It is recommended to install the release_36 release of libc++ and libc++abi and sudo make install cd ../libcxxabi sudo make install - + Add the libc++ and libc++abi installation path to the library search paths (i.e. export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ) @@ -149,7 +149,7 @@ Fetch the source code :: repo sync - + **Build Instructions** :: @@ -182,7 +182,7 @@ You could also run the HCC's sanity test :: make test - + **Install the Compiler** :: diff --git a/ROCm_Tools/HCC_WIKI.rst b/ROCm_Tools/HCC_WIKI.rst index 01b6af18..535e5b2b 100644 --- a/ROCm_Tools/HCC_WIKI.rst +++ b/ROCm_Tools/HCC_WIKI.rst @@ -1,12 +1,12 @@ .. _HCCwiki: - + HCC WIKI ========= HCC is an Open Source, Optimizing C++ Compiler for Heterogeneous Compute ************************************************************************** -HCC supports heterogeneous offload to AMD APUs and discrete GPUs via HSA enabled runtimes and drivers. It is an ISO compliant C++ 11/14 compiler. It is based on Clang, the LLVM Compiler Infrastructure and the “libc++” C++ standard library. +HCC supports heterogeneous offload to AMD APUs and discrete GPUs via HSA enabled runtimes and drivers. It is an ISO compliant C++ 11/14 compiler. It is based on Clang, the LLVM Compiler Infrastructure and the "libc++" C++ standard library. Accelerator Modes Supported ***************************** @@ -14,7 +14,7 @@ Accelerator Modes Supported `HC (Heterogeneous Compute) C++ API `_ ++++++++++++++++++++++++++++++++++++++++++ -Inspired by C++ AMP and C++17, this is the default C++ compute API for the HCC compiler. 
HC has some important differences from C++ AMP including removing the “restrict” keyword, supporting additional data types in kernels, providing more control over synchronization and data movement, and providing pointer-based memory allocation. It is designed to expose cutting edge compute capabilities on Boltzmann and HSA devices to developers while offering the productivity and usability of C++. +Inspired by C++ AMP and C++17, this is the default C++ compute API for the HCC compiler. HC has some important differences from C++ AMP including removing the "restrict" keyword, supporting additional data types in kernels, providing more control over synchronization and data movement, and providing pointer-based memory allocation. It is designed to expose cutting edge compute capabilities on Boltzmann and HSA devices to developers while offering the productivity and usability of C++. `HIP `_ +++++++++++ @@ -73,9 +73,9 @@ Currently, HCC support for openSUSE is experimental and the compiler has to be b Building HCC from Source ######################## -First, install the build dependencies: +First, install the build dependencies: :: - + # Ubuntu 16.04 & 18.04 sudo apt-get install coreutils git cmake make g++ g++-multilib gcc-multilib python \ findutils libelf1 libpci3 file debianutils libunwind-dev pkg-config \ @@ -99,7 +99,7 @@ hsa-rocr-dev hsa-ext-rocr-dev hsakmt-roct-dev rocm-utils # openSUSE Leap 42.3 sudo zypper install coreutils git cmake make gcc-c++ python python-xml findutils elfutils pciutils-devel file rpm-build libunwind-devel pkg-config libpth-devel - + # install libc++ from OSB sudo zypper addrepo \ -f http://download.opensuse.org/repositories/devel:/tools:/compiler/openSUSE_Leap_42.3/ devel_tools_compiler @@ -107,17 +107,17 @@ hsa-rocr-dev hsa-ext-rocr-dev hsakmt-roct-dev rocm-utils sudo zypper install libc++-devel -Clone the HCC source tree: +Clone the HCC source tree: :: # automatically fetches all submodules git clone --recursive -b clang_tot_upgrade https://github.com/RadeonOpenCompute/hcc.git -Create a build directory and run cmake to configure the build: +Create a build directory and run cmake to configure the build: :: mkdir build; cd build cmake ../hcc -Compile HCC: +Compile HCC: :: make -j [number of threads] @@ -125,8 +125,8 @@ Install HCC: :: sudo make install -Run the unit tests: -:: +Run the unit tests: +:: make test Create an installer package (DEB or RPM file) @@ -147,7 +147,7 @@ To compile and link in a single step: To build with separate compile and link steps: :: # Assume HCC is installed and added to PATH - # Notice the the hcc-config command is between two backticks + # Notice the the hcc-config command is between two backticks hcc -hc saxpy.cpp -c -o saxpy.cpp.o hcc -hc saxpy.cpp.o -o saxpy @@ -158,7 +158,7 @@ By default, HCC would auto-detect all the GPUs available to run on and set the c ============ ================== ============================================================== GCN Version GPU/APU Family Examples of Radeon GPU - + ============ ================== ============================================================== gfx803 GFX8 R9 Fury, R9 Fury X, R9 Nano, FirePro S9300 x2, Radeon RX 480, @@ -166,7 +166,7 @@ gfx803 GFX8 R9 Fury, R9 Fury X, R9 Nano, FirePro S9300 x2, gfx900 GFX9 Vega10 -============ ================== ============================================================== +============ ================== ============================================================== Required AMDGPU Attributes diff --git a/ROCm_Tools/ROCm-Tools.rst 
b/ROCm_Tools/ROCm-Tools.rst index 22f06d3f..61cd831b 100644 --- a/ROCm_Tools/ROCm-Tools.rst +++ b/ROCm_Tools/ROCm-Tools.rst @@ -1,4 +1,4 @@ - + .. _ROCm-Tools: ===================== @@ -24,16 +24,16 @@ GCN Assembler and Disassembler The Art of AMDGCN Assembly: How to Bend the Machine to Your Will ***************************************************************** -The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. - * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. 
The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: .. code:: cpp @@ -45,7 +45,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: .. code:: cpp @@ -88,9 +88,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -144,7 +144,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru flat_store_dword v[3:4], v1 s_endpgm -Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. 
Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. +Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. .. image:: initial_state-768x387.png @@ -152,7 +152,7 @@ Currently, a programmer must manually set all non-default values to provide the The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: +The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. 
But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: :: @@ -388,7 +388,7 @@ rocprof 2. Profiling Modes ****************** -‘rocprof’ can be used for GPU profiling using HW counters and +'rocprof' can be used for GPU profiling using HW counters and application tracing 2.1. GPU profiling @@ -396,9 +396,9 @@ application tracing GPU profiling is controlled with input file which defines a list of metrics/counters and a profiling scope. An input file is provided using -option ‘-i ’. Output CSV file with a line per submitted kernel is +option '-i '. Output CSV file with a line per submitted kernel is generated. Each line has kernel name, kernel parameters and counter -values. By option ‘—stats’ the kernel execution stats can be generated +values. By option '--stats' the kernel execution stats can be generated in CSV format. Currently profiling has limitation of serializing submitted kernels. An example of input file: @@ -414,17 +414,17 @@ submitted kernels. An example of input file: gpu: 0 1 2 3 kernel: simple Pass1 simpleConvolutionPass2 -An example of profiling command line for ‘MatrixTranspose’ application +An example of profiling command line for 'MatrixTranspose' application :: $ rocprof -i input.txt MatrixTranspose - RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' + RPL: on '191018_011134' from '/..../rocprofiler_pkg' in '/..../MatrixTranspose' RPL: profiling '"./MatrixTranspose"' RPL: input file 'input.txt' RPL: output dir '/tmp/rpl_data_191018_011134_9695' RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' - ROCProfiler: rc-file '/…./rpl_rc.xml' + ROCProfiler: rc-file '/..../rpl_rc.xml' ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" gpu_index = kernel = @@ -436,7 +436,7 @@ An example of profiling command line for ‘MatrixTranspose’ application PASSED! ROCProfiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 - RPL: '/…./MatrixTranspose/input.csv' is generated + RPL: '/..../MatrixTranspose/input.csv' is generated **2.1.1. Counters and metrics** @@ -456,8 +456,8 @@ Metrics XML File Example: :: - - + + . . . @@ -469,14 +469,14 @@ Metrics XML File Example: **2.1.1.1. Metrics query** -Available counters and metrics can be queried by options ‘—list-basic’ -for counters and ‘—list-derived’ for derived metrics. The output for +Available counters and metrics can be queried by options '--list-basic' +for counters and '--list-derived' for derived metrics. The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. Examples: @@ -484,8 +484,8 @@ expressions. 
Examples: :: $ rocprof --list-basic - RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' - ROCProfiler: rc-file '/…./rpl_rc.xml' + RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/..../MatrixTranspose' + ROCProfiler: rc-file '/..../rpl_rc.xml' Basic HW counters: gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks block GRBM has 2 counters @@ -541,12 +541,12 @@ metric groups: :: $ rocprof -i input.txt ./MatrixTranspose - RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' + RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/..../MatrixTranspose' RPL: profiling './MatrixTranspose' RPL: input file 'input.txt' RPL: output dir '/tmp/rpl_data_191018_032645_12106' RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' - ROCProfiler: rc-file '/…./rpl_rc.xml' + ROCProfiler: rc-file '/..../rpl_rc.xml' ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" gpu_index = kernel = @@ -570,35 +570,35 @@ ________________________________ - Collecting with multiple runs To collect several metric groups a full application replay is used by -defining several ‘pmc:’ lines in the input file, see 2.1. +defining several 'pmc:' lines in the input file, see 2.1. 2.2. Application tracing ************************ Supported application tracing includes runtime API and GPU activity -tracing’ Supported runtimes are: ROCr (HSA API) and HIP Supported GPU +tracing' Supported runtimes are: ROCr (HSA API) and HIP Supported GPU activity: kernel execution, async memory copy, barrier packets. The trace is generated in JSON format compatible with Chrome tracing. The trace consists of several sections with timelines for API trace per thread and GPU activity. The timelines events show event name and -parameters. Supported options: ‘—hsa-trace’, ‘—hip-trace’, ‘—sys-trace’, -where ‘sys trace’ is for HIP and HSA combined trace. +parameters. Supported options: '--hsa-trace', '--hip-trace', '--sys-trace', +where 'sys trace' is for HIP and HSA combined trace. **2.2.1. HIP runtime trace** -The trace is generated by option ‘—hip-trace’ and includes HIP API +The trace is generated by option '--hip-trace' and includes HIP API timelines and GPU activity at the runtime level. **2.2.2. ROCr runtime trace** -The trace is generated by option ‘—hsa-trace’ and includes ROCr API +The trace is generated by option '--hsa-trace' and includes ROCr API timelines and GPU activity at AQL queue level. Also, can provide counters per kernel. **2.2.3. KFD driver trace** -The trace is generated by option ‘—kfd-trace’ and includes KFD Thunk API +The trace is generated by option '--kfd-trace' and includes KFD Thunk API timelines. It is planned to include memory allocations/migration activity tracing. @@ -606,7 +606,7 @@ It is planned to include memory allocations/migration activity tracing. **2.2.4. Code annotation** Support for application code annotation. Start/stop API is supported to -programmatically control the profiling. A ‘roctx’ library provides +programmatically control the profiling. A 'roctx' library provides annotation API. Annotation is visualized in JSON trace as a separate "Markers and Ranges" timeline section. @@ -638,7 +638,7 @@ annotation API. Annotation is visualized in JSON trace as a separate **2.3. Multiple GPUs profiling** -The profiler supports multiple GPU’s profiling and provide GPI id for +The profiler supports multiple GPU's profiling and provide GPI id for counters and kernels data in CSV output file. 
Also, GPU id is indicating for respective GPU activity timeline in JSON trace. @@ -707,7 +707,7 @@ Profiler errors are logged to global logs: 4. 3rd party visualization tools ******************************** -‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which +'rocprof' is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. 4.1. Chrome tracing @@ -719,7 +719,7 @@ Good review can be found by the link: 5. Command line options *********************** -The command line options can be printed with option ‘-h’: +The command line options can be printed with option '-h': :: @@ -845,34 +845,34 @@ Counters: :: - • GRBM_COUNT : Tie High - Count Number of Clocks - • GRBM_GUI_ACTIVE : The GUI is Active - • SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) - • SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) - • SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) - • SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) - • SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) - • SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) - • SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) - • SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) - • SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) - • SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) - • SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) - • SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) - • SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) - • SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) - • SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) - • TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. - • TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. - • TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. - • TCC_HIT[0-15] : Number of cache hits. - • TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. - • TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. - • TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. - • TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. - • TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) - • TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests - • TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. + o GRBM_COUNT : Tie High - Count Number of Clocks + o GRBM_GUI_ACTIVE : The GUI is Active + o SQ_WAVES : Count number of waves sent to SQs. 
(per-simd, emulated, global) + o SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) + o SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) + o SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) + o SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) + o SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) + o SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) + o SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) + o SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) + o SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) + o SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) + o SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) + o SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) + o SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) + o SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) + o TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. + o TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. + o TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. + o TCC_HIT[0-15] : Number of cache hits. + o TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + o TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. + o TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + o TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. + o TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) + o TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests + o TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. The following derived metrics have been defined and the profiler metrics XML specification can be found at: @@ -882,44 +882,44 @@ Metrics: :: - • TA_BUSY_avr : TA block is busy. Average over TA instances. - • TA_BUSY_max : TA block is busy. Max over TA instances. - • TA_BUSY_min : TA block is busy. Min over TA instances. - • TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. - • TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. - • TCC_HIT_sum : Number of cache hits. Sum over TCC instances. - • TCC_MISS_sum : Number of cache misses. Sum over TCC instances. - • TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. - • TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. - • TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. 
Sum over TCC instances. - • TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. - • TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. - • TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. - • FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • GPUBusy : The percentage of time GPU was busy. - • Wavefronts : Total wavefronts. - • VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). - • SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). - • VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. - • SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). - • VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. - • FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. - • LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. - • FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). - • GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). - • VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). - • VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). - • SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). - • Mem32Bwrites : - • FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). - • MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). - • MemUnitStalled : The percentage of GPUTime the memory unit is stalled. 
Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). - • WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). - • ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). - • LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + o TA_BUSY_avr : TA block is busy. Average over TA instances. + o TA_BUSY_max : TA block is busy. Max over TA instances. + o TA_BUSY_min : TA block is busy. Min over TA instances. + o TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. + o TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. + o TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + o TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + o TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. + o TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. + o TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. + o TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. + o TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. + o TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. + o FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o GPUBusy : The percentage of time GPU was busy. + o Wavefronts : Total wavefronts. + o VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). + o SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). + o VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. + o SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + o VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. + o FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. + o LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. + o FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). 
+ o GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + o VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). + o VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + o SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + o Mem32Bwrites : + o FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). + o MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + o MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). + o WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). + o ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + o LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). ROC Profiler @@ -1034,7 +1034,7 @@ GitHub: `https://github.com/ROCm-Developer-Tools/roctracer `_ in github. Here are some limitations. - Dwarf debugging is turned off for GPUs. -g will turn on host level debugging only. - - Some simd constructs fail to vectorize on both host and GPUs. + - Some simd constructs fail to vectorize on both host and GPUs. ROCmValidationSuite ===================== -The ROCm Validation Suite (RVS) is a system administrator’s and cluster manager's tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. +The ROCm Validation Suite (RVS) is a system administrator's and cluster manager's tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. -The RVS is a collection of tests, benchmarks and qualification tools each targeting a specific sub-system of the ROCm platform. All of the tools are implemented in software and share a common command line interface. Each set of tests are implemented in a “module” which is a library encapsulating the functionality specific to the tool. The CLI can specify the directory containing modules to use when searching for libraries to load. 
Each module may have a set of options that it defines and a configuration file that supports its execution. +The RVS is a collection of tests, benchmarks and qualification tools each targeting a specific sub-system of the ROCm platform. All of the tools are implemented in software and share a common command line interface. Each set of tests is implemented in a "module" which is a library encapsulating the functionality specific to the tool. The CLI can specify the directory containing modules to use when searching for libraries to load. Each module may have a set of options that it defines and a configuration file that supports its execution. ROCmValidationSuite Modules ****************************** -**GPU Properties – GPUP** +**GPU Properties - GPUP** -The GPU Properties module queries the configuration of a target device and returns the device’s static characteristics. These static values can be used to debug issues such as device support, performance and firmware problems. +The GPU Properties module queries the configuration of a target device and returns the device's static characteristics. These static values can be used to debug issues such as device support, performance and firmware problems. -**GPU Monitor – GM module** +**GPU Monitor - GM module** The GPU monitor tool is capable of running on one, some or all of the GPU(s) installed and will report various information at regular intervals. The module can be configured to halt another RVS module's execution if one of the quantities exceeds a specified boundary value. -**PCI Express State Monitor – PESM module?** +**PCI Express State Monitor - PESM module** -The PCIe State Monitor tool is used to actively monitor the PCIe interconnect between the host platform and the GPU. The module will register a “listener” on a target GPU’s PCIe interconnect, and log a message whenever it detects a state change. The PESM will be able to detect the following state changes: +The PCIe State Monitor tool is used to actively monitor the PCIe interconnect between the host platform and the GPU. The module will register a "listener" on a target GPU's PCIe interconnect, and log a message whenever it detects a state change. The PESM will be able to detect the following state changes: * PCIe link speed changes * GPU power state changes @@ -1754,12 +1754,12 @@ The PCIe State Monitor tool is used to actively monitor the PCIe interconnect be The ROCm Configuration Qualification Tool ensures the platform is capable of running ROCm applications and is configured correctly. It checks the installed versions of the ROCm components and the platform configuration of the system. This includes checking that dependencies, corresponding to the associated operating system and runtime environment, are installed correctly. Other qualification steps include checking: * The existence of the /dev/kfd device - * The /dev/kfd device's permissions + * The /dev/kfd device's permissions * The existence of all required users and groups that support ROCm * That the user mode components are compatible with the drivers, both the KFD and the amdgpu driver. * The configuration of the runtime linker/loader qualifying that all ROCm libraries are in the correct search path. -**PCI Express Qualification Tool – PEQT module** +**PCI Express Qualification Tool - PEQT module** The PCIe Qualification Tool is used to qualify the PCIe bus on which the GPU is connected.
The qualification test will be capable of determining the following characteristics of the PCIe bus interconnect to a GPU: @@ -1768,21 +1768,21 @@ The PCIe Qualification Tool consists is used to qualify the PCIe bus on which th * PCIe link speed * PCIe link width -**SBIOS Mapping Qualification Tool – SMQT module** +**SBIOS Mapping Qualification Tool - SMQT module** -The GPU SBIOS mapping qualification tool is designed to verify that a platform’s SBIOS has satisfied the BAR mapping requirements for VDI and Radeon Instinct products for ROCm support. +The GPU SBIOS mapping qualification tool is designed to verify that a platform's SBIOS has satisfied the BAR mapping requirements for VDI and Radeon Instinct products for ROCm support. -Refer to the “ROCm Use of Advanced PCIe Features and Overview of How BAR Memory is Used In ROCm Enabled System” web page for more information about how BAR memory is initialized by VDI and Radeon products. +Refer to the "ROCm Use of Advanced PCIe Features and Overview of How BAR Memory is Used In ROCm Enabled System" web page for more information about how BAR memory is initialized by VDI and Radeon products. -**P2P Benchmark and Qualification Tool – PBQT module** +**P2P Benchmark and Qualification Tool - PBQT module** The P2P Benchmark and Qualification Tool is designed to provide the list of all GPUs that support P2P and characterize the P2P links between peers. In addition to testing for P2P compatibility, this test will perform a peer-to-peer throughput test between all P2P pairs for performance evaluation. The P2P Benchmark and Qualification Tool will allow users to pick a collection of two or more GPUs on which to run. The user will also be able to select whether or not they want to run the throughput test on each of the pairs. -Please see the web page “ROCm, a New Era in Open GPU Computing” to find out more about the P2P solutions available in a ROCm environment. +Please see the web page "ROCm, a New Era in Open GPU Computing" to find out more about the P2P solutions available in a ROCm environment. -**PCI Express Bandwidth Benchmark – PEBB module** +**PCI Express Bandwidth Benchmark - PEBB module** -The PCIe Bandwidth Benchmark attempts to saturate the PCIe bus with DMA transfers between system memory and a target GPU card’s memory. The maximum bandwidth obtained is reported to help debug low bandwidth issues. The benchmark should be capable of targeting one, some or all of the GPUs installed in a platform, reporting individual benchmark statistics for each. +The PCIe Bandwidth Benchmark attempts to saturate the PCIe bus with DMA transfers between system memory and a target GPU card's memory. The maximum bandwidth obtained is reported to help debug low bandwidth issues. The benchmark should be capable of targeting one, some or all of the GPUs installed in a platform, reporting individual benchmark statistics for each. 
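For reference, the host-to-device transfer rate that PEBB saturates and reports can be approximated directly with a few HIP calls. The following is only a minimal sketch, not RVS code; the buffer size, repeat count and variable names are illustrative, and error checking is omitted for brevity.

::

    // Minimal sketch (not the RVS PEBB implementation): time pinned host-to-device
    // copies with HIP events and report the effective bandwidth in GiB/s.
    // Build with hipcc; error checking is omitted for brevity.
    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
        const size_t copyBytes = 256ull << 20;   // 256 MiB per transfer (illustrative)
        const int repeats = 20;

        void *hostBuf = nullptr, *devBuf = nullptr;
        hipHostMalloc(&hostBuf, copyBytes, hipHostMallocDefault);   // pinned host memory
        hipMalloc(&devBuf, copyBytes);

        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);

        hipMemcpy(devBuf, hostBuf, copyBytes, hipMemcpyHostToDevice);   // warm-up copy

        hipEventRecord(start, 0);
        for (int i = 0; i < repeats; ++i)
            hipMemcpy(devBuf, hostBuf, copyBytes, hipMemcpyHostToDevice);
        hipEventRecord(stop, 0);
        hipEventSynchronize(stop);

        float ms = 0.0f;
        hipEventElapsedTime(&ms, start, stop);
        const double gib = double(copyBytes) * repeats / double(1ull << 30);
        std::printf("Host->Device: %.2f GiB/s\n", gib / (ms / 1000.0));

        hipEventDestroy(start);
        hipEventDestroy(stop);
        hipFree(devBuf);
        hipHostFree(hostBuf);
        return 0;
    }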
**GPU Stress Test - GST module** @@ -1809,16 +1809,16 @@ CentOS : :: - sudo yum install -y cmake3 doxygen pciutils-devel rpm rpm-build git gcc-c++ + sudo yum install -y cmake3 doxygen pciutils-devel rpm rpm-build git gcc-c++ RHEL : :: - sudo yum install -y cmake3 doxygen rpm rpm-build git gcc-c++ - + sudo yum install -y cmake3 doxygen rpm rpm-build git gcc-c++ + wget http://mirror.centos.org/centos/7/os/x86_64/Packages/pciutils-devel-3.5.1-3.el7.x86_64.rpm - + sudo rpm -ivh pciutils-devel-3.5.1-3.el7.x86_64.rpm SLES : @@ -1826,10 +1826,10 @@ SLES : :: sudo SUSEConnect -p sle-module-desktop-applications/15.1/x86_64 - + sudo SUSEConnect --product sle-module-development-tools/15.1/x86_64 - - sudo zypper install -y cmake doxygen pciutils-devel libpci3 rpm git rpm-build gcc-c++ + + sudo zypper install -y cmake doxygen pciutils-devel libpci3 rpm git rpm-build gcc-c++ Install ROCm stack, rocblas and rocm_smi64 ********************************************* @@ -1866,7 +1866,7 @@ CentOS & RHEL : sudo rpm -e rocm_smi64 && sudo yum install rocm_smi64 SUSE : sudo rpm -e rocm_smi64 && sudo zypper install rocm_smi64 Building from Source -********************** +********************** This section explains how to get and compile current development stream of RVS. @@ -1889,7 +1889,7 @@ If OS is Ubuntu and SLES, use cmake :: cmake ./ -B./build - + make -C ./build @@ -1903,11 +1903,11 @@ If OS is CentOS and RHEL, use cmake3 Build package: - + :: - + cd ./build - + make package Note:_ based on your OS, only DEB or RPM package will be built. You may ignore an error for the unrelated configuration @@ -2223,7 +2223,7 @@ MIVisionX :alt: MIVisionX :target: https://gpuopen-professionalcompute-libraries.github.io/MIVisionX/ -MIVisionX toolkit is a set of comprehensive computer vision and machine intelligence libraries, utilities, and applications bundled into a single toolkit. AMD MIVisionX delivers highly optimized open source implementation of the `Khronos OpenVX™ `_ and OpenVX™ Extensions along with Convolution Neural Net Model Compiler & Optimizer supporting `ONNX `_, and `Khronos NNEF™ `_ exchange formats. The toolkit allows for rapid prototyping and deployment of optimized workloads on a wide range of computer hardware, including small embedded x86 CPUs, APUs, discrete GPUs, and heterogeneous servers. +MIVisionX toolkit is a set of comprehensive computer vision and machine intelligence libraries, utilities, and applications bundled into a single toolkit. AMD MIVisionX delivers highly optimized open source implementation of the `Khronos OpenVX(TM) `_ and OpenVX(TM) Extensions along with Convolution Neural Net Model Compiler & Optimizer supporting `ONNX `_, and `Khronos NNEF(TM) `_ exchange formats. The toolkit allows for rapid prototyping and deployment of optimized workloads on a wide range of computer hardware, including small embedded x86 CPUs, APUs, discrete GPUs, and heterogeneous servers. * `AMD OpenVX `_ * `AMD OpenVX Extensions `_ @@ -2366,7 +2366,7 @@ Using live camera usage: :: - + runvx -frames:live canny-LIVE.gdf **OpenCV_orb-LIVE.gdf** @@ -2432,10 +2432,10 @@ Prerequisites Pre-requisites setup script - MIVisionX-setup.py ************************************************ - + For the convenience of the developer, we here provide the setup script which will install all the dependencies required by this project. -**MIVisionX-setup.py**- This scipts builds all the prerequisites required by MIVisionX. 
The setup script creates a deps folder and installs all the prerequisites, this script only needs to be executed once. If -d option for directory is not given the script will install deps folder in ‘~/’ directory by default, else in the user specified folder. +**MIVisionX-setup.py**- This script builds all the prerequisites required by MIVisionX. The setup script creates a deps folder and installs all the prerequisites; this script only needs to be executed once. If the -d option for directory is not given, the script will install the deps folder in the '~/' directory by default, else in the user-specified folder. **Prerequisites for running the scripts** @@ -2530,7 +2530,7 @@ Build & Install MIVisionX --installer [Package management tool - optional (default:apt-get) [options: Ubuntu:apt-get;CentOS:yum]] --miopen [MIOpen Version - optional (default:2.1.0)] --miopengemm[MIOpenGEMM Version - optional (default:1.1.5)] - --ffmpeg [FFMPEG Installation - optional (default:no) [options:Install ffmpeg - yes]] + --ffmpeg [FFMPEG Installation - optional (default:no) [options:Install ffmpeg - yes]] --rpp [RPP Installation - optional (default:yes) [options:yes/no]] @@ -2558,7 +2558,7 @@ Build & Install MIVisionX * git clone, build and install other ROCm projects (using cmake and % make install) in the below order for vx_nn. * `rocm-cmake `_ * `MIOpenGEMM `_ - * `MIOpen `_ – make sure to use -DMIOPEN_BACKEND=OpenCL option with cmake + * `MIOpen `_ - make sure to use -DMIOPEN_BACKEND=OpenCL option with cmake * install `protobuf `__ * install `OpenCV `__ * install `FFMPEG n4.0.4 `_ - Optional @@ -2583,18 +2583,18 @@ Verify the Installation * Apps, Samples, Documents, Model Compiler and Toolkit are placed into /opt/rocm/mivisionx * Run samples to verify the installation - + * **Canny Edge Detection** - + .. image:: https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/blob/master/samples/images/canny_image.PNG?raw=true :align: center :width: 600 - + :: export PATH=$PATH:/opt/rocm/mivisionx/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/mivisionx/lib - runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf + runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf Note: More samples are available `here `_ @@ -2666,13 +2666,13 @@ MIVisionX provides developers with docker images for Ubuntu 16.04, Ubuntu 18.04, -* Optional: Map localhost directory on the docker image +* Optional: Map localhost directory on the docker image * option to map the localhost directory with trained caffe models to be accessed on the docker image.
* usage: -v {LOCAL_HOST_DIRECTORY_PATH}:{DOCKER_DIRECTORY_PATH} - - + + :: - + sudo docker run -it -v /home/:/root/hostDrive/ --device=/dev/kfd --device=/dev/dri --cap-add=SYS_RAWIO --device=/dev/mem --group-add video --network host mivisionx/ubuntu-16.04 @@ -2680,24 +2680,24 @@ MIVisionX provides developers with docker images for Ubuntu 16.04, Ubuntu 18.04, **Note: Display option with docker** * Using host display - + :: - + xhost +local:root - sudo docker run -it --device=/dev/kfd --device=/dev/dri --cap-add=SYS_RAWIO --device=/dev/mem --group-add video - --network host --env DISPLAY=unix$DISPLAY --privileged --volume $XAUTH:/root/.Xauthority + sudo docker run -it --device=/dev/kfd --device=/dev/dri --cap-add=SYS_RAWIO --device=/dev/mem --group-add video + --network host --env DISPLAY=unix$DISPLAY --privileged --volume $XAUTH:/root/.Xauthority --volume /tmp/.X11-unix/:/tmp/.X11-unix mivisionx/ubuntu-16.04:latest * Test display with MIVisionX sample - + :: export PATH=$PATH:/opt/rocm/mivisionx/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/mivisionx/lib - runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf + runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf Release Notes ************* @@ -2705,7 +2705,7 @@ Release Notes **Known issues** * Package (.deb & .rpm) install requires OpenCV v3.4.0 to execute AMD OpenCV extensions - + **Tested configurations** diff --git a/ROCm_Tools/clBLA.rst b/ROCm_Tools/clBLA.rst index 687c38ed..6e9c487e 100644 --- a/ROCm_Tools/clBLA.rst +++ b/ROCm_Tools/clBLA.rst @@ -7,7 +7,7 @@ clBLAS For Github repository `clBLAS `_ -This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. +This repository houses the code for the OpenCL(TM) BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. 
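Because clBLAS leaves OpenCL state management to the caller, a typical call site creates the platform, context, queue and buffers itself and then passes them to the library. The sketch below is a hedged illustration of a single-precision GEMM call built around the clblasSgemm entry point; the sizes and variable names are made up, most error checking is dropped for brevity, and the exact argument list should be confirmed against the installed clBLAS.h.

::

    // Hedged sketch of a clBLAS SGEMM call; assumes clBLAS.h and an OpenCL ICD
    // are installed. Error checking is mostly omitted for brevity.
    #include <clBLAS.h>
    #include <vector>
    #include <cstdio>

    int main() {
        const size_t M = 4, N = 3, K = 5;
        std::vector<float> A(M * K, 1.0f), B(K * N, 2.0f), C(M * N, 0.0f);

        cl_platform_id platform;  clGetPlatformIDs(1, &platform, nullptr);
        cl_device_id device;      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
        cl_int err;
        cl_context ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
        cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);

        clblasSetup();   // initialize the library before any BLAS call

        cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  A.size() * sizeof(float), nullptr, &err);
        cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  B.size() * sizeof(float), nullptr, &err);
        cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, C.size() * sizeof(float), nullptr, &err);
        clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, A.size() * sizeof(float), A.data(), 0, nullptr, nullptr);
        clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, B.size() * sizeof(float), B.data(), 0, nullptr, nullptr);
        clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, C.size() * sizeof(float), C.data(), 0, nullptr, nullptr);

        // C = 1.0 * A * B + 0.0 * C, row-major storage
        cl_event event;
        clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans,
                    M, N, K,
                    1.0f, bufA, 0, K,
                    bufB, 0, N,
                    0.0f, bufC, 0, N,
                    1, &queue, 0, nullptr, &event);
        clWaitForEvents(1, &event);

        clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, C.size() * sizeof(float), C.data(), 0, nullptr, nullptr);
        std::printf("C[0] = %f\n", C[0]);   // expect 10.0 for these inputs

        clblasTeardown();
        clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC);
        clReleaseCommandQueue(queue); clReleaseContext(ctx);
        return 0;
    }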
@@ -166,7 +166,7 @@ Build dependencies ******************** **Library for Windows** - * Windows® 7/8 + * Windows(R) 7/8 * Visual Studio 2010 SP1, 2012 * An OpenCL SDK, such as APP SDK 2.8 * Latest CMake diff --git a/ROCm_Tools/clFFT.rst b/ROCm_Tools/clFFT.rst index df127b81..8159855c 100644 --- a/ROCm_Tools/clFFT.rst +++ b/ROCm_Tools/clFFT.rst @@ -164,7 +164,7 @@ Build dependencies To develop the clFFT library code on a Windows operating system, ensure to install the following packages on your system: - * Windows® 7/8.1 + * Windows(R) 7/8.1 * Visual Studio 2012 or later @@ -193,7 +193,7 @@ To test the developed clFFT library code, ensure to install the following packag * Googletest v1.6 * Latest FFTW - + * Latest Boost Performance infrastructure diff --git a/ROCm_Tools/clRNG.rst b/ROCm_Tools/clRNG.rst index aa450aa5..aa2e5781 100644 --- a/ROCm_Tools/clRNG.rst +++ b/ROCm_Tools/clRNG.rst @@ -3,12 +3,12 @@ ========= clRNG ========= - + For Github repository `clRNG `_ A library for uniform random number generation in OpenCL. -Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4×32-10 generators. +Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4x32-10 generators. Documentation *************** @@ -31,7 +31,7 @@ Building ********** 1. Install the runtime dependency: * An OpenCL SDK, such as APP SDK. - + 2. Install the build dependencies: * The CMake cross-platform build system. Visual Studio users can use CMake Tools for Visual Studio. @@ -64,7 +64,7 @@ On a 64-bit Linux platform, steps 3 through 9 from above, executed in a Bash-com export CLRNG_ROOT=$PWD/package export LD_LIBRARY_PATH=$CLRNG_ROOT/lib64:$LD_LIBRARY_PATH $CLRNG_ROOT/bin/CTest - + Examples *********** Examples can be found in src/client. The compiled client program examples can be found under the bin subdirectory of the installation package ($CLRNG_ROOT/bin under Linux). Note that the examples expect an OpenCL GPU device to be available. diff --git a/ROCm_Tools/clSPARSE.rst b/ROCm_Tools/clSPARSE.rst index d5414078..0e6a80c7 100644 --- a/ROCm_Tools/clSPARSE.rst +++ b/ROCm_Tools/clSPARSE.rst @@ -5,10 +5,10 @@ =========== clSPARSE =========== - + For Github repository `clSPARSE `_ -an OpenCL™ library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. +an OpenCL(TM) library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. 
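Several of the clSPARSE routines listed in the next section operate on the CSR format. As a plain-C++ reference for what a sparse matrix-dense vector multiply (SpM-dV) computes, here is a small sketch of CSR SpMV (y = alpha*A*x + beta*y); it illustrates the math only and does not use the clSPARSE API.

::

    // Plain C++ illustration of CSR SpMV (y = alpha*A*x + beta*y); not clSPARSE code.
    #include <cstdio>
    #include <vector>

    int main() {
        // 3x3 sparse matrix in CSR form:
        // [ 1 0 2 ]
        // [ 0 3 0 ]
        // [ 4 0 5 ]
        std::vector<int>   rowPtr = {0, 2, 3, 5};      // row start offsets into colIdx/vals
        std::vector<int>   colIdx = {0, 2, 1, 0, 2};   // column index of each non-zero
        std::vector<float> vals   = {1, 2, 3, 4, 5};   // non-zero values
        std::vector<float> x = {1, 1, 1}, y = {0, 0, 0};
        const float alpha = 1.0f, beta = 0.0f;

        for (size_t row = 0; row + 1 < rowPtr.size(); ++row) {
            float acc = 0.0f;
            for (int j = rowPtr[row]; j < rowPtr[row + 1]; ++j)
                acc += vals[j] * x[colIdx[j]];
            y[row] = alpha * acc + beta * y[row];
        }

        for (float v : y) std::printf("%g ", v);       // prints: 3 3 9
        std::printf("\n");
        return 0;
    }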
What's new in clSPARSE v0.10.1 ****************************** @@ -30,7 +30,7 @@ clSPARSE features * Dense to CSR conversions (& converse) * COO to CSR conversions (& converse) * Functions to read matrix market files in COO or CSR format -True in spirit with the other clMath libraries, clSPARSE exports a “C” interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API’s to make them less ‘cluttered’ compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 +True in spirit with the other clMath libraries, clSPARSE exports a "C" interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API's to make them less 'cluttered' compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 Google Groups *************** @@ -67,7 +67,7 @@ clSPARSE is licensed under the `Apache License `_) * Solution (.sln) or diff --git a/ROCm_Tools/hcFFT.rst b/ROCm_Tools/hcFFT.rst index 2f58473e..627720fb 100644 --- a/ROCm_Tools/hcFFT.rst +++ b/ROCm_Tools/hcFFT.rst @@ -31,7 +31,7 @@ file: hcfft_1D_R2C.cpp :: #!c++ - + #include #include #include "hcfft.h" @@ -73,9 +73,9 @@ file: hcfft_1D_R2C.cpp free(input); free(output); hc::am_free(idata); - hc::am_free(odata); + hc::am_free(odata); } - + * Compiling the example code: Assuming the library and compiler installation is followed as in installation. @@ -94,7 +94,7 @@ The following are the steps to use the library ROCM 1.5 Installation *********************** -To Know more about ROCM refer +To Know more about ROCM refer https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md **a. 
Installing Debian ROCM repositories** @@ -130,8 +130,8 @@ and Reboot the system Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: - * cd /opt/rocm/hsa/sample - * make + * cd /opt/rocm/hsa/sample + * make * ./vector_copy Library Installation @@ -178,7 +178,7 @@ The following are the sub-routines that are implemented KeyFeature ############ - + * Support 1D, 2D and 3D Fast Fourier Transforms * Supports R2C, C2R, C2C, D2Z, Z2D and Z2Z Transforms * Support Out-Of-Place data storage @@ -195,7 +195,7 @@ This section lists the known set of hardware and software requirements to build Hardware ********* - * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU + * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU * System Memory >= 4GB (Better if >10GB for NN application over multiple GPUs) * Hard Drive > 200GB (Better if SSD or NVMe driver for NN application over multiple GPUs) * Minimum GPU Memory (Global) > 2GB @@ -250,9 +250,9 @@ Driver versions GPU Cards *********** - * Radeon R9 Nano + * Radeon R9 Nano * Radeon R9 FuryX - * Radeon R9 Fury + * Radeon R9 Fury * Kaveri and Carizo APU Server System diff --git a/ROCm_Tools/hcRNG.rst b/ROCm_Tools/hcRNG.rst index ed6bd664..92be28c3 100644 --- a/ROCm_Tools/hcRNG.rst +++ b/ROCm_Tools/hcRNG.rst @@ -21,7 +21,7 @@ file: Randomarray.cpp :: - + //This example is a simple random array generation and it compares host output with device output //Random number generator Mrg31k3p #include @@ -34,7 +34,7 @@ file: Randomarray.cpp #include #include using namespace hc; - + int main() { hcrngStatus status = HCRNG_SUCCESS; @@ -44,7 +44,7 @@ file: Randomarray.cpp size_t streamCount = 10; //Number of random numbers to be generated //numberCount must be a multiple of streamCount - size_t numberCount = 100; + size_t numberCount = 100; //Enumerate the list of accelerators std::vectoracc = hc::accelerator::get_all(); accelerator_view accl_view = (acc[1].create_view()); @@ -52,21 +52,21 @@ file: Randomarray.cpp float *Random1 = (float*) malloc(sizeof(float) * numberCount); float *Random2 = (float*) malloc(sizeof(float) * numberCount); float *outBufferDevice = hc::am_alloc(sizeof(float) * numberCount, acc[1], 0); - + //Create streams hcrngMrg31k3pStream *streams = hcrngMrg31k3pCreateStreams(NULL, streamCount, &streamBufferSize, NULL); hcrngMrg31k3pStream *streams_buffer = hc::am_alloc(sizeof(hcrngMrg31k3pStream) * streamCount, acc[1], 0); accl_view.copy(streams, streams_buffer, streamCount* sizeof(hcrngMrg31k3pStream)); - - //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) + + //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) status = hcrngMrg31k3pDeviceRandomU01Array_single(accl_view, streamCount, streams_buffer, numberCount, outBufferDevice); - + if(status) std::cout << "TEST FAILED" << std::endl; accl_view.copy(outBufferDevice, Random1, numberCount * sizeof(float)); - + //Invoke random number generators in host for (size_t i = 0; i < numberCount; i++) - Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); + Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); // Compare host and device outputs for(int i =0; i < numberCount; i++) { if (Random1[i] != Random2[i]) { @@ -78,7 +78,7 @@ file: Randomarray.cpp continue; } if(!ispassed) std::cout << "TEST FAILED" << std::endl; - + //Free host resources free(Random1); free(Random2); @@ 
-86,8 +86,8 @@ file: Randomarray.cpp hc::am_free(outBufferDevice); hc::am_free(streams_buffer); return 0; - } - + } + * Compiling the example code: @@ -132,8 +132,8 @@ and **Reboot the system** Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: :: - cd /opt/rocm/hsa/sample - make + cd /opt/rocm/hsa/sample + make ./vector_copy Library Installation @@ -141,14 +141,14 @@ Library Installation **a. Install using Prebuilt debian** :: - + wget https://github.com/ROCmSoftwarePlatform/hcRNG/blob/master/pre-builds/hcrng-master-184472e-Linux.deb sudo dpkg -i hcrng-master-184472e-Linux.deb **b. Build debian from source** :: - + git clone https://github.com/ROCmSoftwarePlatform/hcRNG.git && cd hcRNG chmod +x build.sh && ./build.sh diff --git a/ROCm_Tools/hipBLAS.rst b/ROCm_Tools/hipBLAS.rst index c99be8ca..69e5edc9 100644 --- a/ROCm_Tools/hipBLAS.rst +++ b/ROCm_Tools/hipBLAS.rst @@ -54,7 +54,7 @@ Batched and strided GEMM API ***************************** hipBLAS GEMM can process matrices in batches with regular strides. There are several permutations of these API's, the following is an example that takes everything -:: +:: hipblasStatus_t hipblasSgemmStridedBatched( hipblasHandle_t handle, diff --git a/ROCm_Tools/hipeigen.rst b/ROCm_Tools/hipeigen.rst index 50ea9c86..e56bc17c 100644 --- a/ROCm_Tools/hipeigen.rst +++ b/ROCm_Tools/hipeigen.rst @@ -26,7 +26,7 @@ AMD is hosting both debian and rpm repositories for the ROCm 2.4 packages. The p Complete installation steps of ROCm can be found `Here `_ -or +or For Debian based systems, like Ubuntu, configure the Debian ROCm repository as follows: @@ -52,7 +52,7 @@ Next, update the apt-get repository list and install/update the rocm package: Then, make the ROCm kernel your default kernel. If using grub2 as your bootloader, you can edit the GRUB_DEFAULT variable in the following file: -:: +:: sudo vi /etc/default/grub sudo update-grub diff --git a/ROCm_Tools/hipinstall.rst b/ROCm_Tools/hipinstall.rst index ea873d27..9f57e0ee 100644 --- a/ROCm_Tools/hipinstall.rst +++ b/ROCm_Tools/hipinstall.rst @@ -18,7 +18,7 @@ AMD-hcc * Default paths and environment variables: - * By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable) + * By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable) * By default HIP looks for HSA in /opt/rocm/hsa (can be overridden by setting HSA_PATH environment variable) * By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable). * Optionally, consider adding /opt/rocm/bin to your PATH to make it easier to use the tools. @@ -28,7 +28,7 @@ NVIDIA-nvcc * Configure the additional package server as described `here `_. * Install the "hip_nvcc" package. This will install CUDA SDK and the HIP porting layer. -:: +:: apt-get install hip_nvcc @@ -60,13 +60,13 @@ HIP source code is available and the project can be built from source on the HCC cd HIP mkdir build cd build - cmake .. + cmake .. 
make make install * Default paths: * By default cmake looks for hcc in /opt/rocm/hcc (can be overridden by setting -DHCC_HOME=/path/to/hcc in the cmake step).* - * By default cmake looks for HSA in /opt/rocm/hsa (can be overridden by setting -DHSA_PATH=/path/to/hsa in the cmake step).* + * By default cmake looks for HSA in /opt/rocm/hsa (can be overridden by setting -DHSA_PATH=/path/to/hsa in the cmake step).* * By default cmake installs HIP to /opt/rocm/hip (can be overridden by setting -DCMAKE_INSTALL_PREFIX=/where/to/install/hip in the cmake step).* Here's a richer command-line that overrides the default paths: diff --git a/ROCm_Tools/rocFFT.rst b/ROCm_Tools/rocFFT.rst index 7e79d871..ce4ff230 100644 --- a/ROCm_Tools/rocFFT.rst +++ b/ROCm_Tools/rocFFT.rst @@ -82,7 +82,7 @@ The following is a simple example code that shows how to use rocFFT to compute a // Copy result back to host std::vector y(N); hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost); - + // Print results for (size_t i = 0; i < N; i++) { diff --git a/ROCm_Tools/rocFFTAPI.rst b/ROCm_Tools/rocFFTAPI.rst index d0a1928e..22c8548c 100644 --- a/ROCm_Tools/rocFFTAPI.rst +++ b/ROCm_Tools/rocFFTAPI.rst @@ -119,7 +119,7 @@ Documentation is TBD. rocfft_transform_type_complex_forward, rocfft_transform_type_complex_inverse, rocfft_transform_type_real_forward, - rocfft_transform_type_real_inverse, + rocfft_transform_type_real_inverse, } rocfft_transform_type; // Precision @@ -136,14 +136,14 @@ Documentation is TBD. rocfft_element_type_complex_double, rocfft_element_type_single, rocfft_element_type_double, - rocfft_element_type_byte, + rocfft_element_type_byte, } rocfft_element_type; // Result placement typedef enum rocfft_result_placement_e { rocfft_placement_inplace, - rocfft_placement_notinplace, + rocfft_placement_notinplace, } rocfft_result_placement; // Array type @@ -153,7 +153,7 @@ Documentation is TBD. rocfft_array_type_complex_planar, rocfft_array_type_real, rocfft_array_type_hermitian_interleaved, - rocfft_array_type_hermitian_planar, + rocfft_array_type_hermitian_planar, } rocfft_array_type; // Execution mode @@ -178,7 +178,7 @@ To give an idea of how the library API is intended to be used, the following seq status = rocfft_plan_description_create(&description); status = rocfft_plan_description_set_data_layout(&description, ...); - // create plan + // create plan status = rocfft_plan_create(&plan, ..., &description); // create execution_info as needed diff --git a/ROCm_Tools/rocblaswiki.rst b/ROCm_Tools/rocblaswiki.rst index d49e3eb1..b13ea6e9 100644 --- a/ROCm_Tools/rocblaswiki.rst +++ b/ROCm_Tools/rocblaswiki.rst @@ -1,7 +1,7 @@ .. _rocblaswiki: ======================== -rocblas build wiki +rocblas build wiki ======================== Home @@ -12,7 +12,7 @@ Building rocBLAS 1. For instructions to build rocblas library and clients, see Build rocBLAS libraries and verification code. 2. For an example using rocBLAS see Example C code calling rocBLAS function. 3. For instructions on how to run/use the client code, see Build rocBLAS libraries, verification-code, tests and benchmarks. - + Functionality *************** rocBLAS exports the following BLAS-like functions at this time. @@ -36,12 +36,12 @@ Rules for obtaining the rocBLAS API from Legacy BLAS * Where Legacy BLAS functions have return values, the return value is instead added as the last function argument. It is returned by reference on either the host or the device. The rocBLAS functions will check to see it the value is on the device. 
If this is true, it is used, else the value is returned on the host. This applies to the following functions: xDOT, xDOTU, xNRM2, xASUM, IxAMAX, IxAMIN. 7. The return value of all functions is rocblas_status, defined in rocblas_types.h. It is used to check for errors. - + Additional notes ****************** * The rocBLAS library is LP64, so rocblas_int arguments are 32 bit and rocblas_long arguments are 64 bit. - * rocBLAS uses column-major storage for 2D arrays, and 1 based indexing for the functions xMAX and xMIN. This is the same as Legacy BLAS and cuBLAS. If you need row-major and 0 based indexing (used in C language arrays) download the `CBLAS `_ file cblas.tgz. Look at the CBLAS functions that provide a thin interface to Legacy BLAS. They convert from row-major, 0 based, to column-major, 1 based. + * rocBLAS uses column-major storage for 2D arrays, and 1 based indexing for the functions xMAX and xMIN. This is the same as Legacy BLAS and cuBLAS. If you need row-major and 0 based indexing (used in C language arrays) download the `CBLAS `_ file cblas.tgz. Look at the CBLAS functions that provide a thin interface to Legacy BLAS. They convert from row-major, 0 based, to column-major, 1 based. This is done by swapping the order of function arguments. It is not necessary to transpose matrices. * The auxiliary functions rocblas_set_pointer and rocblas_get_pointer are used to set and get the value of the state variable rocblas_pointer_mode. This variable is not used, it is added for compatibility with cuBLAS. rocBLAS will check if your scalar argument passed by reference is on the device. If this is true it will pass by reference on the device, else it passes by reference on the host. @@ -93,7 +93,7 @@ rocblas-test runs Google Tests to test the library rocblas-bench executable to benchmark or test individual functions example-sscal example C code calling rocblas_sscal function ================ =========== - + Common uses of install.sh to build (dependencies + library + client) are in the table below. =================== ============ @@ -159,7 +159,7 @@ Build (library dependencies + client dependencies + library + client) using Indi The unit tests and benchmarking applications in the client introduce the following dependencies: #. `boost `_ -#. `fortran `_ +#. `fortran `_ #. `lapack `_ * lapack itself brings a dependency on a fortran compiler #. `googletest `_ @@ -290,7 +290,7 @@ Example hx[i] = rand() % 10 + 1; //generate a integer number between [1, 10] } - // save a copy in hz + // save a copy in hz hz = hx; hipMemcpy(dx, hx.data(), sizeof(float) * N, hipMemcpyHostToDevice); @@ -370,7 +370,7 @@ Run the executable with the command $(CPP) -c -o $@ $< $(CFLAGS) $(EXE) : $(OBJ) - $(LD) $(OBJ) $(LDFLAGS) -o $@ + $(LD) $(OBJ) $(LDFLAGS) -o $@ clean: rm -f $(EXE) $(OBJ) @@ -646,7 +646,7 @@ Train Tensile for rocBLAS Below are 10 steps that can be used to build Tensile and rocBLAS for the sizes specified in rocblas_sgemm_asm_miopen.yaml :: - + git clone -b develop https://github.com/ROCmSoftwarePlatform/Tensile.git cd Tensile mkdir build diff --git a/ROCm_Tools/rocm-debug.rst b/ROCm_Tools/rocm-debug.rst index 8d734511..1f76a909 100644 --- a/ROCm_Tools/rocm-debug.rst +++ b/ROCm_Tools/rocm-debug.rst @@ -21,10 +21,10 @@ Build Steps ************ 1.Install ROCm using the instruction `here `_ - + 2.Clone the Debug SDK repository -:: +:: git clone https://github.com/RadeonOpenCompute/ROCm-GPUDebugSDK.git 3. 
Build the AMD HSA Debug Agent Library and the Matrix multiplication examples by calling make in the src/HSADebugAgent and the samples/MatrixMultiplication directories respectively @@ -32,16 +32,16 @@ Build Steps :: cd src/HSADebugAgent make - + * Note that matrixMul_kernel.hsail is included for reference only. This sample will load the pre-built hsa binary (matrixMul_kernel.brig) to run the kernel. - - + + :: - + cd samples/MatrixMultiplication - + :: - + make 4. Build the Debug Facilities library by calling make in the src/HwDbgFacilities directory diff --git a/ROCm_Tools/tensile.rst b/ROCm_Tools/tensile.rst index 04f4a75e..a697a77b 100644 --- a/ROCm_Tools/tensile.rst +++ b/ROCm_Tools/tensile.rst @@ -63,7 +63,7 @@ Benchmark Config Example Benchmark config.yaml -:: +:: GlobalParameters: PrintLevel: 1 @@ -259,18 +259,18 @@ Each step of the benchmark can override what problem sizes will be benchmarked. 1.[1968] * Benchmark only size 1968; n = 1. - + 2.[16, 1920] * Benchmark sizes 16 to 1968 using the default step size (=16); n = 123. - + 3.[16, 32, 1968] * Benchmark sizes 16 to 1968 using a step size of 32; n = 61. - + 4.[64, 32, 16, 1968] * Benchmark sizes from 64 to 1968 with a step size of 32. Also, increase the step size by 16 each iteration. * This causes fewer sizes to be benchmarked when the sizes are large, and more benchmarks where the sizes are small; this is typically desired behavior. * n = 16 (64, 96, 144, 208, 288, 384, 496, 624, 768, 928, 1104, 1296, 1504, 1728, 1968). The stride at the beginning is 32, but the stride at the end is 256. - + 5.[0] * The size of this index is just whatever size index 0 is. For a 3-dimensional ProblemType, this allows benchmarking only a 2- dimensional or 1-dimensional slice of problem sizes. @@ -372,11 +372,11 @@ Tensile can be installed via: :: - git clone https://github.com/RadeonOpenCompute/Tensile.git + git clone https://github.com/RadeonOpenCompute/Tensile.git python Tensile/Tensile/Tensile.py config.yaml benchmark_path -.. _KernelParameters: +.. _KernelParameters: Kernel Parameters ################### @@ -411,7 +411,7 @@ The kernel parameters affect many aspects of performance. Changing a parameter m .. image:: img1.png :align: center - + How N-Dimensional Tensor Contractions Are Mapped to Finite-Dimensional GPU Kernels ************************************************************************************ For a traditional GEMM, the 2-dimensional output, C[i,j], is mapped to launching a 2-dimensional grid of work groups, each of which has a 2-dimensional grid of work items; one dimension belongs to i and one dimension belongs to j. The 1-dimensional summation is represented by a single loop within the kernel body. @@ -448,7 +448,7 @@ The device languages Tensile supports for the gpu kernels is * OpenCL 1.2 * HIP * Assembly - * gfx803 + * gfx803 * gfx900 .. _LibraryLogic: diff --git a/ROCm_Tools/tutorial.rst b/ROCm_Tools/tutorial.rst index 15c2e053..cbb0e632 100644 --- a/ROCm_Tools/tutorial.rst +++ b/ROCm_Tools/tutorial.rst @@ -2,7 +2,7 @@ tutorial ========== - + How do I debug my GPU application? 
************************************ You can start your program in rocm-gdb just like you would any application under gdb @@ -190,7 +190,7 @@ Switching the focus to another work-item and printing $s0 allows us to view data :: (ROCm-gdb) rocm thread wg:0,0,0 wi:1,0,0 - [ROCm-gdb]: Switching to work-group (0,0,0) and work-item (1,0,0) + [ROCm-gdb]: Switching to work-group (0,0,0) and work-item (1,0,0) (ROCm-gdb) print rocm:$s0 $3 = 1 @@ -299,7 +299,7 @@ The info rocm work-groups command will show the active work-groups for the activ The info rocm wg 0 command will show the information of work-group 0 for the active dispatch -:: +:: Information for Work-group 0 Index Wave ID {SE,SH,CU,SIMD,Wave} Work-item ID Abs Work-item ID PC Source line @@ -337,11 +337,11 @@ ROCm-gdb helps developers to view information about kernels that have been launc (ROCm-gdb) set rocm trace mytrace.csv (ROCm-gdb) set rocm trace on -You can now execute and debug the application within ROCm-gdb. Anytime during the application’s execution you can view my_trace.csv to see the kernels have been dispatched. A sample trace for an application that dispatches a vector add kernel followed by a matrix multiplication kernel in a loop is shown below. - &__OpenCL_matrixMul_kernel -====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== -index queue_id packet_id kernel_name header setup workgroup_size reserved0 grid_size private_segment_size -====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== +You can now execute and debug the application within ROCm-gdb. Anytime during the application's execution you can view my_trace.csv to see the kernels have been dispatched. A sample trace for an application that dispatches a vector add kernel followed by a matrix multiplication kernel in a loop is shown below. + &__OpenCL_matrixMul_kernel +====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== +index queue_id packet_id kernel_name header setup workgroup_size reserved0 grid_size private_segment_size +====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== group_segment_size kernel_object kernarg_address reserved2 completion_signal 0 380095252 0 &__Gdt_vectoradd_kernel 5122 1 {64 1 1} 0 {64 1 1} 0 0 140737353981952 0x713000 0 7513216 1 380095252 1 &__OpenCL_matrixMul_kernel 5122 2 {16 16 1} 0 {128 80 1} 0 0 140737353983488 0x6ca000 0 7910848 diff --git a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst index 282c37ea..0013b593 100644 --- a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst +++ b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst @@ -9,21 +9,21 @@ PCIe Passthrough on KVM ================ The following KVM-based instructions assume a headless host with an input/output memory management unit (IOMMU) to pass peripheral devices such as a GPU to guest virtual machines. If you know your host supports IOMMU but the below command does not find "svm" or "vxm", you may need to enable IOMMU in your BIOS. 
-:: - - cat /proc/cpuinfo | grep -E “svm|vxm” +:: + + cat /proc/cpuinfo | grep -E "svm|vxm" Ubuntu 16.04 **************************** Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 installed - + **a. Install necessary packages and prepare for pass through device** -1. :: - +1. :: + sudo apt-get install qemu-kvm qemu-system bridge-utils virt-manager ubuntu-vm-builder libvirt-dev - + 2. add following modules into /etc/modules | vfio | vfio_iommu_type1 @@ -31,15 +31,15 @@ Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 instal | kvm | kvm_intel - add intel_iommu=on in /etc/default/grub + add intel_iommu=on in /etc/default/grub | GRUB_CMDLINE_LINUX_DEFAULT="quiet splash intel_iommu=on" - :: - + :: + sudo update-grub 3. Blacklist amdgpu by adding the following line to /etc/modprobe.d/blacklist.conf - :: - + :: + blacklist amdgpu **b. Bind pass through device to vfio-pci** @@ -60,8 +60,8 @@ Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 instal 2. Make it executable by enter the command -:: - +:: + chmod 755 vfio-bind 3. Bind the device to vfio by running the command for the three pass through devices @@ -76,17 +76,17 @@ Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 instal **c. Pass through device to guest VM** -1. Start VMM by running “virt-manager” as root. Follow the on screen instruction to create one virtual machine(VM), make sure CPU copy host CPU configuration, network use bridge mode. +1. Start VMM by running "virt-manager" as root. Follow the on screen instruction to create one virtual machine(VM), make sure CPU copy host CPU configuration, network use bridge mode. 2. Add Hardware --> Select PCI Host device, select the appropriate device to pass through. ex:0000:83:00.0 3. sudo setpci -s 83:00.0 CAP_EXP+28.l=40 4. sudo reboot -After reboot, start virt-manager and then start the VM, inside the VM , lspci -d 1002: should shows the pass throughed device. +After reboot, start virt-manager and then start the VM, inside the VM , lspci -d 1002: should shows the pass throughed device. Fedora 27 or CentOS 7 (1708) **************************** From a fresh install of Fedora 27 or CentOS 7 (1708) - + **a. Install necessary packages and prepare for pass through device** 1. Identity the vendor and device id(s) for the PCIe device(s) you wish to passthrough, e.g., 1002:6861 and 1002:aaf8 for an AMD Radeon Pro WX 9100 and its associated audio device, @@ -111,7 +111,7 @@ From a fresh install of Fedora 27 or CentOS 7 (1708) echo "options vfio-pci ids=1002:6861,1002:aaf8" | sudo tee -a /etc/modprobe.d/vfio.conf echo "options vfio-pci disable_vga=1" | sudo tee -a /etc/modprobe.d/vfio.conf sed 's/quiet/quiet rd.driver.pre=vfio-pci video=efifb:off/' /etc/sysconfig/grub - + 5. Update the kernel boot settings :: @@ -140,7 +140,7 @@ Note: To pass a device within a particular IOMMU group, all devices within that lspci -nns "${d##*/}" done; - + ROCm-Docker =========== @@ -152,7 +152,7 @@ This repository contains a framework for building the software layers defined in * Docker on `Ubuntu `_ systems or `Fedora systems `_ * Highly recommended: `Docker-Compose `_ to simplify container management - + Docker Hub ********** Looking for an easy start with ROCm + Docker? The rocm/rocm-terminal image is hosted on `Docker Hub `_ . After the `ROCm kernel is installed `_ , pull the image from Docker Hub and create a new instance of a container. @@ -161,8 +161,8 @@ Looking for an easy start with ROCm + Docker? 
The rocm/rocm-terminal image is ho sudo docker pull rocm/rocm-terminal sudo docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video rocm/rocm-terminal - - + + ROCm-docker set up guide ************************* `Installation instructions `_ and asciicasts demos are available to help users quickly get running with rocm-docker. Visit the set up guide to read more. @@ -265,7 +265,7 @@ The dockerfile that serves as a 'terminal' creates a non-root user called **rocm To increase container security: 1.Eliminate the sudo-nopasswd COPY statement in the dockerfile and replace with - + 2.Your own password with RUN echo 'account:password' | chpasswd The docker.ce release 18.02 has known defects working with rocm-user account insider docker image. Please upgrade docker package to the `18.04 build `_. diff --git a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ deleted file mode 100644 index b86d0cad..00000000 --- a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ +++ /dev/null @@ -1,262 +0,0 @@ - -.. _ROCm-Virtualization-&-Containers: - -================================= -ROCm Virtualization & Containers -================================= - -PCIe Passthrough on KVM -================ -The following KVM-based instructions assume a headless host with an input/output memory management unit (IOMMU) to pass peripheral devices such as a GPU to guest virtual machines. If you know your host supports IOMMU but the below command does not find "svm" or "vxm", you may need to enable IOMMU in your BIOS. - -:: - - cat /proc/cpuinfo | grep -E “svm|vxm” - -Ubuntu 16.04 -**************************** -Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 installed - -**a. Install necessary packages and prepare for pass through device** - -1. :: - - sudo apt-get install qemu-kvm qemu-system bridge-utils virt-manager ubuntu-vm-builder libvirt-dev - - -2. add following modules into /etc/modules - | vfio - | vfio_iommu_type1 - | vfio_pci - | kvm - | kvm_intel - - add intel_iommu=on in /etc/default/grub - | GRUB_CMDLINE_LINUX_DEFAULT="quiet splash intel_iommu=on" - :: - - sudo update-grub - -3. Blacklist amdgpu by adding the following line to /etc/modprobe.d/blacklist.conf - :: - - blacklist amdgpu -**b. Bind pass through device to vfio-pci** - -1. Create a script file (vfio-bind) under /usr/bin. The script file has the following content: - -:: - - #!/bin/bash - modprobe vfio-pci - for dev in "$@"; do - vendor=$(cat /sys/bus/pci/devices/$dev/vendor) - device=$(cat /sys/bus/pci/devices/$dev/device) - if [ -e /sys/bus/pci/devices/$dev/driver ]; then - echo $dev > /sys/bus/pci/devices/$dev/driver/unbind - fi - echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id - done - -2. Make it executable by enter the command - -:: - - chmod 755 vfio-bind - -3. Bind the device to vfio by running the command for the three pass through devices - -:: - - lspci -n -d 1002: - 83:00.0 0300: 1002:7300 (rev ca) - vfio.bind 0000:83:00.0 - -4. sudo reboot - -**c. Pass through device to guest VM** - -1. Start VMM by running “virt-manager” as root. Follow the on screen instruction to create one virtual machine(VM), make sure CPU copy host CPU configuration, network use bridge mode. -2. Add Hardware --> Select PCI Host device, select the appropriate device to pass through. ex:0000:83:00.0 -3. sudo setpci -s 83:00.0 CAP_EXP+28.l=40 -4. 
sudo reboot - -After reboot, start virt-manager and then start the VM, inside the VM , lspci -d 1002: should shows the pass throughed device. - -Fedora 27 or CentOS 7 (1708) -**************************** -From a fresh install of Fedora 27 or CentOS 7 (1708) - -**a. Install necessary packages and prepare for pass through device** - -1. Identity the vendor and device id(s) for the PCIe device(s) you wish to passthrough, e.g., 1002:6861 and 1002:aaf8 for an AMD Radeon Pro WX 9100 and its associated audio device, - lspci -nnk - -2. Install virtualization packages - sudo dnf install @virtualization - sudo usermod -G libvirt -a $(whoami) - sudo usermod -G kvm -a $(whoami) - -3. Enable IOMMU in the GRUB_CMDLINE_LINUX variable for your target kernel - a. For an AMD CPU - sudo sed 's/quiet/quiet amd_iommu=on iommu=pt/' /etc/sysconfig/grub - b. For an Intel CPU - sudo sed 's/quiet/quiet intel_iommu=on iommu=pt/' /etc/sysconfig/grub - -**b. Bind pass through device to vfio-pci** - -4. Preempt the host claiming the device by loading a stub driver - echo "options vfio-pci ids=1002:6861,1002:aaf8" | sudo tee -a /etc/modprobe.d/vfio.conf - echo "options vfio-pci disable_vga=1" | sudo tee -a /etc/modprobe.d/vfio.conf - sed 's/quiet/quiet rd.driver.pre=vfio-pci video=efifb:off/' /etc/sysconfig/grub - -5. Update the kernel boot settings - sudo grub2-mkconfig -o /etc/grub2-efi.cfg - echo 'add_drivers+="vfio vfio_iommu_type1 vfio_pci"' | sudo tee -a /etc/dracut.conf.d/vfio.conf - sudo dracut -f --kver `uname -r` - -6. Reboot and verify that vfio-pci driver has been loaded - lspci -nnk - -**c. Pass through device to guest VM** - -1. Within virt-manager the device should now appear in the list of available PCI devices - -Note: To pass a device within a particular IOMMU group, all devices within that IOMMU group must also be passed. You may wish to refer to https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF for more details, such as the following script that lists all IOMMU groups and the devices within them. - - #!/bin/bash - shopt -s nullglob - for d in /sys/kernel/iommu_groups/*/devices/*; do - n=${d#*/iommu_groups/*}; n=${n%%/*} - printf 'IOMMU Group %s ' "$n" - lspci -nns "${d##*/}" - done; - - -ROCm-Docker -=========== - - * `ROCm-Docker `_ - -This repository contains a framework for building the software layers defined in the Radeon Open Compute Platform into portable docker images. The following are docker dependencies, which should be installed on the target machine. - - * Docker on `Ubuntu `_ systems or `Fedora systems `_ - * Highly recommended: `Docker-Compose `_ to simplify container management - -Docker Hub -********** -Looking for an easy start with ROCm + Docker? The rocm/rocm-terminal image is hosted on `Docker Hub `_ . After the `ROCm kernel is installed `_ , pull the image from Docker Hub and create a new instance of a container. - -:: - - sudo docker pull rocm/rocm-terminal - sudo docker run -it --rm --device="/dev/kfd" rocm/rocm-terminal - -ROCm-docker set up guide -************************* -`Installation instructions `_ and asciicasts demos are available to help users quickly get running with rocm-docker. Visit the set up guide to read more. - -**F.A.Q** - -When working with the ROCm containers, the following are common and useful docker commands: - - * A new docker container typically does not house apt repository meta-data. 
Before trying to install new software using apt, make sure to run sudo apt update first - * A message like the following typically means your user does not have permissions to execute docker; use sudo or `add your user `_ to the docker group. - * Cannot connect to the Docker daemon. Is the docker daemon running on this host? - * Open another terminal into a running container - * sudo docker exec -it bash -l - * Copy files from host machine into running docker container - * sudo docker cp HOST_PATH :/PATH - * Copy files from running docker container onto host machine - * sudo docker cp :/PATH/TO/FILE HOST_PATH - * If receiving messages about no space left on device when pulling images, check the storage driver in use by the docker engine. If its 'device mapper', that means the image size limits imposed by the 'device mapper' storage driver are a problem - Follow the documentation in the :ref:`quickstart` for a solution to change to the storage driver - -**Saving work in a container** - -Docker containers are typically ephemeral, and are discarded after closing the container with the '--rm' flag to docker run. However, there are times when it is desirable to close a container that has arbitrary work in it, and serialize it back into a docker image. This may be to to create a checkpoint in a long and complicated series of instructions, or it may be desired to share the image with others through a docker registry, such as docker hub. - -:: - - sudo docker ps -a # Find container of interest - sudo docker commit - sudo docker images # Confirm existence of a new image - - -Details -******* -Docker does not virtualize or package the linux kernel inside of an image or container. This is a design decision of docker to provide lightweight and fast containerization. The implication for this on the ROCm compute stack is that in order for the docker framework to function, the ROCm kernel and corresponding modules must be installed on the host machine. Containers share the host kernel, so the ROCm KFD component ROCK-Kernel-Driver1 functions outside of docker. - -**Installing ROCK on the host machine.** - -An `apt-get repository `_ is available to automate the installation of the required kernel and kernel modules. - -Building images -**************** -There are two ways to install rocm components: - - 1.install from the rocm apt/rpm repository (packages.amd.com) - - 2.build the components from source and run install scripts - -The first method produces docker images with the smallest footprint and best building speed. The footprint is smaller because no developer tools need to be installed in the image, an the images build speed is fastest because typically downloading binaries is much faster than downloading source and then invoking a build process. Of course, building components allows much greater flexibility on install location and the ability to step through the source with debug builds. ROCm-docker supports making images either way, and depends on the flags passed to the setup script. - -The setup script included in this repository is provides some flexibility to how docker containers are constructed. Unfortunately, Dockerfiles do not have a preprocessor or template language, so typically build instructions are hardcoded. However, the setup script allows us to write a primitive 'template', and after running it instantiates baked dockerfiles with environment variables substituted in. 
For instance, if you wish to build release images and debug images, first run the setup script to generate release dockerfiles and build the images. Then, run the setup script again and specify debug dockerfiles and build new images. The docker images should generate unique image names and not conflict with each other. - -**setup.sh** - -Currently, the setup.sh scripts checks to make sure that it is running on an Ubuntu system, as it makes a few assumptions about the availability of tools and file locations. If running rocm on a Fedora machine, inspect the source of setup.sh and issue the appropriate commands manually. There are a few parameters to setup.sh of a generic nature that affects all images built after running. If no parameters are given, built images will be based off of Ubuntu 16.04 with rocm components installed from debians downloaded from packages.amd.com. Supported parameters can be queried with ./setup --help. - -============================ ======================== =============================================== -setup.sh parameters parameter [default] description -============================ ======================== =============================================== ---ubuntu xx.yy [16.04] Ubuntu version for to inherit base image ---install-docker-compose helper to install the docker-compose tool -============================ ======================== =============================================== - -The following parameters are specific to building containers that compile rocm components from source. - -============================ ======================== =============================================== -setup.sh parameters parameter [default] description -============================ ======================== =============================================== ---tag string ['master'] string representing a git branch name ---branch string ['master'] alias for tag ---debug build code with debug flags -============================ ======================== =============================================== - -./setup generates finalized Dockerfiles from textual template files ending with the .template suffix. Each sub-directory of this repository corresponds to a docker 'build context' responsible for a software layer in the ROCm stack. After running the script, each directory contains generated dockerfiles for building images from debians and from source. - -Docker compose -***************** - -./setup prepares an environment to be controlled with Docker Compose. While docker-compose is not necessary for proper operation, it is highly recommended. setup.sh does provide a flag to simplify the installation of this tool. Docker-compose coordinates the relationships between the various ROCm software layers, and it remembers flags that should be passed to docker to expose devices and import volumes. - -**Example of using docker-compose** - -docker-compose.yml provides services that build and run containers. YAML is structured data, so it's easy to modify and extend. The setup.sh script generates a .env file that docker-compose reads to satisfy the definitions of the variables in the .yml file. 
- - * docker-compose run --rm rocm -- Run container using rocm packages - * docker-compose run --rm rocm-from-src -- Run container with rocm built from source - -============================ ===================================================== -Docker-compose description -============================ ===================================================== -docker-compose docker compose executable -run sub-command to bring up interactive container ---rm when shutting the container down, delete it -rocm application service defined in docker-compose.yml -============================ ===================================================== - -**rocm-user has root privileges by default** - -The dockerfile that serves as a 'terminal' creates a non-root user called rocm-user. Since this container is meant to serve as a development environment (therefore apt-get is likely needed), the user has been added to the linux sudo group. Since it is somewhat difficult to set and change passwords in a container (often requiring a rebuild), the password prompt has been disabled for the sudo group. While this is convenient for development, since you can sudo apt-get install packages without a password, it does imply lower security in the container. - -To increase container security: - - 1. Eliminate the sudo-nopasswd COPY statement in the dockerfile, and - - 2. Set your own password with RUN echo 'account:password' | chpasswd - -**Footnotes:** - -[1] It can be installed into a container, but it does not do anything because containers do not go through the traditional boot process. We actually do provide a container for ROCK-Kernel-Driver, but it is not used by the rest of the docker images. It does provide isolation and a reproducible environment for kernel development. diff --git a/ROCm_Virtualization_Containers/quickstart.rst b/ROCm_Virtualization_Containers/quickstart.rst index 4fbe4e3e..fdd0f844 100644 --- a/ROCm_Virtualization_Containers/quickstart.rst +++ b/ROCm_Virtualization_Containers/quickstart.rst @@ -16,7 +16,7 @@ It is my recommendation to install the rocm kernel first. Depending on how distr Step 1: Install rocm-kernel **************************** -:: +:: wget -qO - http://packages.amd.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - sudo sh -c 'echo deb [arch=amd64] http://packages.amd.com/rocm/apt/debian/ trusty main \ diff --git a/Remote_Device_Programming/Memoryhooks.rst b/Remote_Device_Programming/Memoryhooks.rst index 9986eb39..23c53a93 100644 --- a/Remote_Device_Programming/Memoryhooks.rst +++ b/Remote_Device_Programming/Memoryhooks.rst @@ -45,7 +45,7 @@ We use the following algorithm to install the memory hooks: 7. Sometimes it's enough to have hooks for mmap/... to get those events when they are called from malloc/... as well. So first we do some memory allocations and check if we are able to get all events this way. - 8. If we can't, install legacy malloc hooks (__malloc_hook). + 8. If we can't, install legacy malloc hooks (__malloc_hook). We have our own implementation of a heap manager in libucm - ptmalloc3. After we replace the original heap manager, we keep track of which pointers were allocated by our library, so we know to ignore all others (since they were allocated by the previous heap manager). Also, we can't restore the previous state, so libucm.so is marked as 'nodelete'. 9. If the former didn't work, modify the relocation tables to point to our implementation of malloc (and friends).
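The relocation-table technique in step 9 is a form of symbol interposition. As a rough, self-contained illustration of the general idea (this is not the actual libucm implementation; the file and library names are hypothetical), a preloaded shared object can intercept malloc and forward to the real allocator resolved with dlsym(RTLD_NEXT, ...):

::

    /* hook.c - illustrative malloc interposer, not libucm code.
     * Build and run (hypothetical names):
     *   gcc -shared -fPIC -o libhook.so hook.c -ldl
     *   LD_PRELOAD=./libhook.so ./app
     */
    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <stddef.h>
    #include <stdio.h>

    static void *(*real_malloc)(size_t) = NULL;

    void *malloc(size_t size)
    {
        if (real_malloc == NULL) {
            /* Resolve the next malloc in link order, i.e. the real allocator */
            real_malloc = (void *(*)(size_t))dlsym(RTLD_NEXT, "malloc");
        }

        void *ptr = real_malloc(size);

        /* Report the allocation event. A production hook must guard against
         * re-entrancy, since even writing to stderr may itself allocate. */
        fprintf(stderr, "malloc(%zu) = %p\n", size, ptr);
        return ptr;
    }

A real memory-event tracker, like the one described above, additionally records which pointers it allocated so that later free/munmap events can be attributed correctly.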
diff --git a/Remote_Device_Programming/Performancemeasurement.rst b/Remote_Device_Programming/Performancemeasurement.rst index 2bec57d7..b75d1dd7 100644 --- a/Remote_Device_Programming/Performancemeasurement.rst +++ b/Remote_Device_Programming/Performancemeasurement.rst @@ -13,7 +13,7 @@ Features of the library: * uct_perf_test_run() is the function which runs the test. (currently only UCT API is supported) * No need to do any resource allocation - just pass the testing parameters to the API - * Requires running the function on 2 threads/processes/nodes - by passing RTE callbacks which are used to bootstrap the connections. + * Requires running the function on 2 threads/processes/nodes - by passing RTE callbacks which are used to bootstrap the connections. * Two testing modes - ping-pong and unidirectional stream (TBD bi-directional stream) * Configurabe message size, and data layout (short/bcopy/zcopy) * Supports: warmup cycles, unlimited iterations. @@ -73,7 +73,7 @@ Features of ucx_perftest: Every line of the file is a test to run. The first word is the test name, and the rest are command-line arguments for the test. -h Show this help message. - + Server options: -l Accept clients in an infinite loop @@ -109,4 +109,4 @@ When using mpi as the launcher to run ucx_perftest, please make sure that your u | # iterations | typical | average | overall | average | overall | average | overall | +--------------+---------+---------+---------+----------+----------+-----------+-----------+ 586527 0.845 0.852 0.852 4.47 4.47 586527 586527 - 1000000 0.844 0.848 0.851 4.50 4.48 589339 + 1000000 0.844 0.848 0.851 4.50 4.48 589339 diff --git a/Remote_Device_Programming/PrintUCXinfo.rst b/Remote_Device_Programming/PrintUCXinfo.rst index ea336fbe..2fdd8bb4 100644 --- a/Remote_Device_Programming/PrintUCXinfo.rst +++ b/Remote_Device_Programming/PrintUCXinfo.rst @@ -31,7 +31,7 @@ Sample output: :: - # Transport: rc + # Transport: rc # # mlx5_0:1 # speed: 6502.32 MB/sec @@ -48,8 +48,8 @@ Sample output: # atomic_add: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_swap: 32, 64 bit - # atomic_cswap: 32, 64 bit - # error handling: none + # atomic_cswap: 32, 64 bit + # error handling: none # # mlx4_0:1 # speed: 6502.32 MB/sec @@ -83,7 +83,7 @@ Sample output: # am header: <= 127 # atomic_add: 64 bit # atomic_fadd: 64 bit - # atomic_swap: 64 bit + # atomic_swap: 64 bit # atomic_cswap: 64 bit - # error handling: none + # error handling: none # diff --git a/Remote_Device_Programming/Remote-Device-Programming.rst b/Remote_Device_Programming/Remote-Device-Programming.rst index 655cbd77..e8ad6ab1 100644 --- a/Remote_Device_Programming/Remote-Device-Programming.rst +++ b/Remote_Device_Programming/Remote-Device-Programming.rst @@ -10,12 +10,12 @@ ROCmRDMA **Peer-to-Peer bridge driver for PeerDirect - Deprecated Repo** This is now included as part of the ROCK `Kernel Driver `_ -ROCmRDMA is the solution designed to allow third-party kernel drivers to utilize DMA access to the GPU memory. It allows direct path for data exchange (peer-to-peer) using the standard features of PCI Express. +ROCmRDMA is the solution designed to allow third-party kernel drivers to utilize DMA access to the GPU memory. It allows direct path for data exchange (peer-to-peer) using the standard features of PCI Express. 
Currently ROCmRDMA provides the following benefits: * Direct access to ROCm memory for 3rd party PCIe devices - * Support for PeerDirect(c) interface to offloads the CPU when dealing + * Support for PeerDirect(c) interface to offloads the CPU when dealing with ROCm memory for RDMA network stacks; Restrictions and limitations @@ -31,11 +31,11 @@ ROCmRDMA interface specification The implementation of ROCmRDMA interface could be found in `[amd_rdma.h] `_ file. Data structures -*************** +*************** + +:: + -:: - - /** * Structure describing information needed to P2P access from another device * to specific location of GPU memory @@ -44,17 +44,17 @@ Data structures uint64_t va; /**< Specify user virt. address * which this page table described */ - + uint64_t size; /**< Specify total size of * allocation */ - + struct pid *pid; /**< Specify process pid to which * virtual address belongs */ - + struct sg_table *pages; /**< Specify DMA/Bus addresses */ - + void *priv; /**< Pointer set by AMD kernel * driver */ @@ -66,7 +66,7 @@ Data structures * Structure providing function pointers to support rdma/p2p requirements. * to specific location of GPU memory */ - + struct amd_rdma_interface { int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid, struct amd_p2p_info **amd_p2p_data, @@ -77,13 +77,13 @@ Data structures int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid, unsigned long *page_size); }; - + The function to query ROCmRDMA interface **************************************** :: - + /** * amdkfd_query_rdma_interface - Return interface (function pointers table) for * rdma interface @@ -93,28 +93,28 @@ The function to query ROCmRDMA interface * \return 0 if operation was successful. */ int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); - + The function to query ROCmRDMA interface **************************************** :: - + /** * amdkfd_query_rdma_interface - Return interface (function pointers table) for rdma interface * \param interace - OUT: Pointer to interface * \return 0 if operation was successful. */ int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); - + ROCmRDMA interface functions description ***************************************** -:: +:: + - /** * This function makes the pages underlying a range of GPU virtual memory * accessible for DMA operations from another PCIe device @@ -153,7 +153,7 @@ ROCmRDMA interface functions description int put_pages(struct amd_p2p_info **p_p2p_data) :: - + /** * Check if given address belongs to GPU address space. * \param address - Address to check @@ -174,8 +174,8 @@ ROCmRDMA interface functions description :param pid - Process id structure. Could be NULL if current one. :param page_size - On return: Page size :rtype:return 0 if operation was successful - - + + UCX ==== @@ -217,7 +217,7 @@ MPI Example of the command line (for InfiniBand RC + shared memory): :: - + $ mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 -x UCX_TLS=rc,sm ./app @@ -238,7 +238,7 @@ Example of the command line (for InfiniBand RC + shared memory): 2. 
The table of MPI and OpenSHMEM distributions that are tested with the HEAD of UCX master ================ =========== -MPI/OpenSHMEM project +MPI/OpenSHMEM project OpenMPI/OSHMEM 2.1.0 MPICH Latest ================ =========== @@ -257,22 +257,22 @@ IPC API **New datatypes** :: - + hsa_amd_ipc_memory_handle_t - + /** IPC memory handle to by passed from one process to another */ typedef struct hsa_amd_ipc_memory_handle_s { uint64_t handle; } hsa_amd_ipc_memory_handle_t; - + hsa_amd_ipc_signal_handle_t - + /** IPC signal handle to by passed from one process to another */ typedef struct hsa_amd_ipc_signal_handle_s { uint64_t handle; } hsa_amd_ipc_signal_handle_t; - + **Memory sharing API** Allows sharing of HSA allocated memory between different processes. @@ -285,9 +285,9 @@ Allows sharing of HSA allocated memory between different processes. | hsa_amd_ipc_get_memory_handle(void *ptr, hsa_amd_ipc_memory_handle_t *ipc_handle); | where: | IN: ptr - Pointer to memory previously allocated via hsa_amd_memory_pool_allocate() call -| OUT: ipc_handle - Unique IPC handle to be used in IPC. -| Application must pass this handle to another process. -| +| OUT: ipc_handle - Unique IPC handle to be used in IPC. +| Application must pass this handle to another process. +| | hsa_amd_ipc_close_memory_handle | Close IPC memory handle previously received via "hsa_amd_ipc_get_memory_handle()" call . @@ -297,7 +297,7 @@ Allows sharing of HSA allocated memory between different processes. | where: | IN: ipc_handle - IPC Handle to close | -| +| | hsa_amd_ipc_open_memory_handle | Open / import an IPC memory handle exported from another process and return address to be used in the current process. @@ -322,9 +322,9 @@ Allows sharing of HSA allocated memory between different processes. | hsa_amd_ipc_get_signal_handle(hsa_signal_t signal, hsa_amd_ipc_signal_handle_t *ipc_handle); | where: | IN: signal - Signal handle created as the result of hsa_signal_create() call. -| OUT: ipc_handle - Unique IPC handle to be used in IPC. -| Application must pass this handle to another process. -| +| OUT: ipc_handle - Unique IPC handle to be used in IPC. +| Application must pass this handle to another process. +| | hsa_amd_ipc_close_signal_handle | Close IPC signal handle previously received via "hsa_amd_ipc_get_signal_handle()" call . @@ -353,26 +353,26 @@ Client should call hsa_signal_destroy() when access to this resource is not need Allows query information about memory resource based on address. It is partially overlapped with the following requirement Memory info interface so it may be possible to merge those two interfaces. 
:: typedef enum hsa_amd_address_info_s { - + /* Return uint32_t / boolean if address was allocated via HSA stack */ HSA_AMD_ADDRESS_HSA_ALLOCATED = 0x1, - + /** Return agent where such memory was allocated */ HSA_AMD_ADDRESS_AGENT = 0x2, - + /** Return pool from which this address was allocated */ HSA_AMD_ADDRESS_POOL = 0x3, - + /** Return size of allocation */ HSA_AMD_ADDRESS_ALLOC_SIZE = 0x4 - + } hsa_amd_address_info_t; **hsa_status_t HSA_API** | hsa_amd_get_address_info(void *ptr, hsa_amd_address_info_t attribute, void* value); -| where: +| where: | ptr - Address information about which to query | attribute - Attribute to query diff --git a/Remote_Device_Programming/UCP-Design.rst b/Remote_Device_Programming/UCP-Design.rst index 487c8572..5bb7e00f 100644 --- a/Remote_Device_Programming/UCP-Design.rst +++ b/Remote_Device_Programming/UCP-Design.rst @@ -26,7 +26,7 @@ MPI Tag Matching strategies Data specification ******************** - + * Contiguous data (no lkey required) * Non-contiguous data with strides and hierarchy, but without memory key * Pack/unpack callbacks diff --git a/Remote_Device_Programming/UCT-Design.rst b/Remote_Device_Programming/UCT-Design.rst index e156700b..95b332d3 100644 --- a/Remote_Device_Programming/UCT-Design.rst +++ b/Remote_Device_Programming/UCT-Design.rst @@ -12,7 +12,7 @@ The library will contain an abstraction layer called "transport" or "tl". It ena Communication primitives ************************* - * Remote memory access: + * Remote memory access: * put * get * Remote memory atomics: @@ -147,7 +147,7 @@ Data specifications * single-dimension scatter/gather - iovec (can be either local or remote) * iovec element has: pointer, length, stride, count, key / iovec+len * the key should have been obtained from mmap functions. - * transport exposes its max number of entries in the iovec + * transport exposes its max number of entries in the iovec * IB implementation note: tl will post umr-s in correct order as needed, with temporary memory keys. * atomics - pass the arguments directly without local key, since cost of copying the result is negligible. diff --git a/Remote_Device_Programming/logging.rst b/Remote_Device_Programming/logging.rst index 04cf14b7..c21f3d63 100644 --- a/Remote_Device_Programming/logging.rst +++ b/Remote_Device_Programming/logging.rst @@ -9,10 +9,10 @@ UCS has logging infrastructure. logging is controlled by a single level: * fatal - stops the program * error - an error which does not stop the program and can be reported back to user. * warn - a warning which does not return error to the user. - + info * debug - debugging messages, low volume, about initialization/cleanup. - * trace - debugging messages, high volume, during runtime, for “special” events. + * trace - debugging messages, high volume, during runtime, for "special" events. * req - details of every send/receive request and tag matching. * data - headers of every packet being sent/received. * async - async notifications and progress thread. 
diff --git a/Remote_Device_Programming/profiling.rst b/Remote_Device_Programming/profiling.rst index 81e21f2e..0cf36f4c 100644 --- a/Remote_Device_Programming/profiling.rst +++ b/Remote_Device_Programming/profiling.rst @@ -50,8 +50,8 @@ Run an application and collect profile: Read profile output file: :: - - $ ucx_read_profile ucx.prof + + $ ucx_read_profile ucx.prof command : ./app host : my_host diff --git a/Remote_Device_Programming/reference b/Remote_Device_Programming/reference index 4502f7f4..727407a3 100644 --- a/Remote_Device_Programming/reference +++ b/Remote_Device_Programming/reference @@ -3,4 +3,4 @@ This section consists of UCX documentation from the following sites: https://www.openucx.org/introduction https://github.com/openucx/ucx/wiki/High-Level-design https://github.com/openucx/ucx/wiki/Infrastructure-and-Tools -https://github.com/openucx/ucx/wiki/FAQ +https://github.com/openucx/ucx/wiki/FAQ diff --git a/Remote_Device_Programming/sideprogresscompletion.rst b/Remote_Device_Programming/sideprogresscompletion.rst index 98c0a2fc..a94ff9e2 100644 --- a/Remote_Device_Programming/sideprogresscompletion.rst +++ b/Remote_Device_Programming/sideprogresscompletion.rst @@ -12,7 +12,7 @@ On the low level, we can consider 2 types of operations: bcopy (including short) :: - ucs_status_t uct_XXX_bcopy(uct_ep_h ep, ..., uint32_t flags); + ucs_status_t uct_XXX_bcopy(uct_ep_h ep, ..., uint32_t flags); ucs_status_t ucx_XXX_zcopy(uct_ep_h ep, ..., uint32_t flags, uct_req_t *req); typedef struct uct_req { @@ -29,8 +29,8 @@ These functions will behave as follows: Implementation notes: - * The transport might limit the amount of sends to single endpoint without considering other endpoints, to enforce fairness. In that case, if the limit is reached, the send will return UCS_ERR_WOULD_BLOCK. - + * The transport might limit the amount of sends to single endpoint without considering other endpoints, to enforce fairness. In that case, if the limit is reached, the send will return UCS_ERR_WOULD_BLOCK. + Protocol layer - Nonblocking MPI ********************************** diff --git a/Tutorial/GCN-asm-tutorial.rst b/Tutorial/GCN-asm-tutorial.rst index 75ff5d2b..a3307b1c 100644 --- a/Tutorial/GCN-asm-tutorial.rst +++ b/Tutorial/GCN-asm-tutorial.rst @@ -7,16 +7,16 @@ GCN asm Tutorial The Art of AMDGCN Assembly: How to Bend the Machine to Your Will ****************************************************************** -The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. 
Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. - * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: :: @@ -28,7 +28,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. 
The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: :: @@ -50,7 +50,7 @@ Formal HSA arguments are passed to a kernel using a special read-only memory seg aql->kernarg_address = args; /* * Write the args directly to the kernargs buffer; - * the code assumes that memory is already allocated for the + * the code assumes that memory is already allocated for the * buffers that in_ptr, index_ptr and out_ptr point to */ args->in = in_ptr; @@ -71,9 +71,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -91,7 +91,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru .text .p2align 8 .amdgpu_hsa_kernel hello_world - + hello_world: .amd_kernel_code_t @@ -131,7 +131,7 @@ Currently, a programmer must manually set all non-default values to provide the The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. 
As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: +The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: :: diff --git a/Tutorial/Optimizing-Dispatches.rst b/Tutorial/Optimizing-Dispatches.rst index 38ba7e97..2ffda4ca 100644 --- a/Tutorial/Optimizing-Dispatches.rst +++ b/Tutorial/Optimizing-Dispatches.rst @@ -7,18 +7,18 @@ Optimizing-Dispatches ROCm with Rapid Harmony : Optimizing HSA Dispatch ###################################################### -We `previously `_ looked at how to launch an OpenCL™ kernel using the HSA runtime. That example showed the basics of using the HSA Runtime. `Here `_ we'll turn up the tempo a bit by optimizing the launch code - moving some expensive operations into the setup code (rather than on each dispatch), removing host-side synchronization, and optimizing the memory fences to the bare minimum required. We'll measure the contributions of the different optimizations and discuss the results.The code is available at the `same GitHub repository `_ as before and the optimizations can be enabled with a series of command-line switches. +We `previously `_ looked at how to launch an OpenCL(TM) kernel using the HSA runtime. That example showed the basics of using the HSA Runtime. `Here `_ we'll turn up the tempo a bit by optimizing the launch code - moving some expensive operations into the setup code (rather than on each dispatch), removing host-side synchronization, and optimizing the memory fences to the bare minimum required. We'll measure the contributions of the different optimizations and discuss the results.The code is available at the `same GitHub repository `_ as before and the optimizations can be enabled with a series of command-line switches. Optimizing ############# Bitonic sort involves running the same kernel several times. For the default array length of 32768, the algorithm launches 120 kernels. The original OpenCL code and the associated port used in the example synchronize with the host after each of the kernel code. To improve performance, we can submit all 120 kernels at one time, and only synchronize with the host after the last one completes. 
To make this change, we will need to restructure the BitonicSort::run call as follows: - * Each kernel still needs to wait for the previous kernel to finish executing. The AQL packet in the HSA system architecture defines a “barrier” bit which provides exactly this synchronization – packets with the barrier bit set will wait for all preceding kernels in the same queue to complete before beginning their own execution. Barrier-bit synchronization only works for commands in the same queue, but will be more efficient than using signals in the cases where it applies. So we’ll set the barrier bit for all the kernels to provide the required synchronization between kernels, and therefore will only need to use a completion_signal for the last kernel in the sequence. (all other kernels set the completion_signal to 0, which saves an atomic decrement operation when the command finishes. ) This optimization is marked with p_optPreallocSignal. + * Each kernel still needs to wait for the previous kernel to finish executing. The AQL packet in the HSA system architecture defines a "barrier" bit which provides exactly this synchronization - packets with the barrier bit set will wait for all preceding kernels in the same queue to complete before beginning their own execution. Barrier-bit synchronization only works for commands in the same queue, but will be more efficient than using signals in the cases where it applies. So we'll set the barrier bit for all the kernels to provide the required synchronization between kernels, and therefore will only need to use a completion_signal for the last kernel in the sequence. (all other kernels set the completion_signal to 0, which saves an atomic decrement operation when the command finishes. ) This optimization is marked with p_optPreallocSignal. - * In HSA, each kernel submission requires a block of “kernarg” memory to hold the kernel arguments. The baseline implementation allocates a single kernarg block and re-uses it for each kernel submission. In the optimized version, we submit all the kernels at the same time, but with different kernel arguments, so we must ensure that each kernel has its own kernarg block. The code actually performs a single kernarg allocation with enough space to cover all of the inflight kernels. Additionally, the code aligns each kernarg block on a 64-byte cache line boundary. This avoids false-sharing cases where the GPU is reading kernargs for one command while the host is writing arguments for another kernel, causing the cache line to ping-pong between CPU and GPU caches. The kernarg optimizations are marked with p_optPreallocKernarg. + * In HSA, each kernel submission requires a block of "kernarg" memory to hold the kernel arguments. The baseline implementation allocates a single kernarg block and re-uses it for each kernel submission. In the optimized version, we submit all the kernels at the same time, but with different kernel arguments, so we must ensure that each kernel has its own kernarg block. The code actually performs a single kernarg allocation with enough space to cover all of the inflight kernels. Additionally, the code aligns each kernarg block on a 64-byte cache line boundary. This avoids false-sharing cases where the GPU is reading kernargs for one command while the host is writing arguments for another kernel, causing the cache line to ping-pong between CPU and GPU caches. The kernarg optimizations are marked with p_optPreallocKernarg. 
* The function bitonicSortGPU_opt contains the optimized loop which submits the batch of 120 kernels to the GPU. This code is marked with o_optAvoidHostSync). - + * Each AQL kernel dispatch packet contains a field that controls the memory fences applied before and after the kernel executes. In the baseline implementation, the fences conservatively specify system visibility for both acquire and release fences. (The subject of fences and what they control is well beyond the scope of this document but it covered extensively in the HSA System Architecture Specification Memory Model. It turns out we can make a more surgical use of these fences in the optimized version: (code marked with p_optFence) * The first kernel needs a system acquire fence to make sure it gets the data from the host->device copy. @@ -75,7 +75,7 @@ The timing numbers shown here includes the time to transfer the array to the GPU +----------------------+----------+--------------------+-----------------------+-----------------+------------+-------------------+ |RunTime/Iteration (us)| 1943 | 1906 | 1869 | 1665 | 1221 | 1137 | +----------------------+----------+--------------------+-----------------------+-----------------+------------+-------------------+ -|Delta/Iteration(us) | | -37 | -37 | -204 | -444 | -84 | +|Delta/Iteration(us) | | -37 | -37 | -204 | -444 | -84 | +----------------------+----------+--------------------+-----------------------+-----------------+------------+-------------------+ @@ -88,7 +88,7 @@ The system-scope fences are fairly expensive - Fiji has a 2MB L2 cache, and it t Finally, using pinned host memory improves the transfer speeds from around 6GB/s to 14GB/s. In this workload, we see a modest performance improvement (84us) since most of the benchmark is spent running the kernels and synchronizing between them. Overall the performance improvement from these optimizations is 1.7X faster than the baseline version. - + Reference ########### -`Wikipedia `_ has a nice description of the Bitonic sort algorithm, including pictures. Eric Bainville wrote a nice explanation `here `_ describing how to optimize Bitonic Sort for the GPU. +`Wikipedia `_ has a nice description of the Bitonic sort algorithm, including pictures. Eric Bainville wrote a nice explanation `here `_ describing how to optimize Bitonic Sort for the GPU. diff --git a/Tutorial/ROCm-MultiGPU.rst b/Tutorial/ROCm-MultiGPU.rst index 3104582f..f891dd4b 100644 --- a/Tutorial/ROCm-MultiGPU.rst +++ b/Tutorial/ROCm-MultiGPU.rst @@ -9,7 +9,7 @@ In-node * ROCr Base driver has P2P API support * `ROCr (HSA) AGENT API with Peer to Peer support `_. * `HCC Language Runtime support of P2P ROCr Agent API `_. -* `HIP Language Runtime support of P2P P2P API’s model after CUDA P2P API’s `_. +* `HIP Language Runtime support of P2P P2P API's model after CUDA P2P API's `_. * OpenCL Language Runtime P2P API Peer-to-Peer API with Autocopy support over Intel QPI bus * API name - clEnqueueBufferCopyP2PAMD * Releasing in OpenCL with ROCm 1.6.2 @@ -26,7 +26,7 @@ Out of Node Standard Frameworks for Out of Node Communication --------------------------------------------------- -* `OpenUCX UCX is a communication library implementing high-performance messaging for MPI/PGAS frameworks - In Development `_ `Source for ROCm `_. +* `OpenUCX UCX is a communication library implementing high-performance messaging for MPI/PGAS frameworks - In Development `_ `Source for ROCm `_. 
* `OpenMPI Open MPI Project is an open source Message Passing Interface https://www.open-mpi.org In Development `_. * `MPICH MPICH is a high-performance and widely portable implementation of the Message Passing Interface (MPI) standard (MPI-1, MPI-2 and MPI-3) `_ `In Development `_. * `OpenSHMEM Partitioned Global Address Space & Communication Library - In Development `_. diff --git a/Tutorial/Tutorial.rst b/Tutorial/Tutorial.rst index 9c1f7510..65c37de6 100644 --- a/Tutorial/Tutorial.rst +++ b/Tutorial/Tutorial.rst @@ -17,8 +17,8 @@ Tutorial * :ref:`rocncloc` ROCm With Harmony: Combining OpenCL Kernels, HCC and HSA in a Single Program. This tutorial demonstrates how to compile OpenCL kernels using the CL offline compiler (CLOC) and integrate them with HCC C++ compiled ROCm applications. - * `The AMD GCN Architecture - A Crash Course, by Layla Mah `_ + * `The AMD GCN Architecture - A Crash Course, by Layla Mah `_ - * `AMD GCN Architecture White paper `_ + * `AMD GCN Architecture White paper `_ - * :ref:`ROCm-MultiGPU` + * :ref:`ROCm-MultiGPU` diff --git a/Tutorial/caffe.rst b/Tutorial/caffe.rst index 211cfe56..9fa281e0 100644 --- a/Tutorial/caffe.rst +++ b/Tutorial/caffe.rst @@ -30,38 +30,38 @@ Installing ROCm Debian packages: :: PKG_REPO="http://repo.radeon.com/rocm/apt/debian/" - + wget -qO - $PKG_REPO/rocm.gpg.key | sudo apt-key add - - + sudo sh -c "echo deb [arch=amd64] $PKG_REPO xenial main > /etc/apt/sources.list.d/rocm.list" - + sudo apt-get update - + sudo apt-get install rocm rocm-utils rocm-opencl rocm-opencl-dev rocm-profiler cxlactivitylogger echo 'export PATH=/opt/rocm/bin:$PATH' >> $HOME/.bashrc - + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc source $HOME/.bashrc - + sudo reboot - + Then, verify the installation. Double-check your kernel (at a minimum, you should see "kfd" in the name):: - + uname -r - + In addition, check that you can run the simple HSA vector_copy sample application:: - + cd /opt/rocm/hsa/sample make ./vector_copy - + Pre-requisites Installation ++++++++++++++++++++++++++++ Install Caffe dependencies:: - + sudo apt-get install \ pkg-config \ protobuf-compiler \ @@ -78,24 +78,24 @@ Install Caffe dependencies:: libopencv-dev \ libfftw3-dev \ libelf-dev - + Install the necessary ROCm compute libraries:: - + sudo apt-get install rocm-libs miopen-hip miopengemm hipCaffe Build Steps +++++++++++++++++++++ Clone hipCaffe:: - - git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git + cd hipCaffe - + You may need to modify the Makefile.config file for your own installation. Then, build it:: - + cp ./Makefile.config.example ./Makefile.config - make + make To improve build time, consider invoking parallel make with the "-j$(nproc)" flag. @@ -103,7 +103,7 @@ Unit Testing ------------- Run the following commands to perform unit testing of different components of Caffe. 
-:: +:: make test ./build/test/test_all.testbin @@ -114,7 +114,7 @@ MNIST training ++++++++++++++++ Steps:: - + ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh @@ -123,7 +123,7 @@ CIFAR-10 training ++++++++++++++++++ Steps:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt @@ -142,7 +142,7 @@ Soumith's Convnet benchmarks Steps: :: - + git clone https://github.com/soumith/convnet-benchmarks.git cd convnet-benchmarks/caffe @@ -183,7 +183,7 @@ Sometimes when training with multiple GPUs, we hit this type of error signature: @ 0x8015c3 caffe::Solver<>::Solve() @ 0x71a277 caffe::P2PSync<>::Run() @ 0x42dcbc train() - + See this `comment `_. diff --git a/Tutorial/hipCaffe .rst b/Tutorial/hipCaffe .rst index 025181a0..28e5efde 100644 --- a/Tutorial/hipCaffe .rst +++ b/Tutorial/hipCaffe .rst @@ -4,18 +4,18 @@ hipCaffe Quickstart Guide ########################### -In this quickstart guide, we’ll walk through the steps for ROCm installation. Then, we’ll run a few training and inference experiments and check their accuracy. +In this quickstart guide, we'll walk through the steps for ROCm installation. Then, we'll run a few training and inference experiments and check their accuracy. Install ROCm ------------- -Here are the main ROCm components we’ll be using:: +Here are the main ROCm components we'll be using:: sudo apt install rocm-dkms sudo apt-get install rocm-libs sudo apt-get install miopen-hip miopengemm - + And some misc packages:: - + sudo apt-get install -y \ g++-multilib \ libunwind-dev \ @@ -28,65 +28,65 @@ And some misc packages:: rpm \ unzip \ bc - + Verify ROCm ------------ Test a simple HIP sample:: - + cp -r /opt/rocm/hip/samples ~/hip-samples && cd ~/hip-samples/0_Intro/square/ - + make - + ./square.out - + Install hipCaffe ---------------- Handle the Caffe dependencies first:: - + sudo apt-get install -y \ pkg-config \ protobuf-compiler \ libprotobuf-dev \ libleveldb-dev \ libsnappy-dev \ - libhdf5-serial-dev \ + libhdf5-serial-dev \ libatlas-base-dev \ libboost-all-dev \ libgflags-dev \ libgoogle-glog-dev \ - liblmdb-dev \ + liblmdb-dev \ python-numpy python-scipy python3-dev python-yaml python-pip \ python-skimage python-opencv python-protobuf \ libopencv-dev \ libfftw3-dev \ libelf-dev - + Note that you might need minor changes to Makefile.config (system dependent):: - + cd ~ - + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + cd hipCaffe - + cp ./Makefile.config.example ./Makefile.config - + make -j$(nproc) - + Workloads ----------- MNIST training +++++++++++++++ -Details on MNIST training can be found at this `link `_. - +Details on MNIST training can be found at this `link `_. + Here are the basic instructions:: ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh - + Expected result: >99% accuracy after 10000 iterations :: @@ -104,7 +104,7 @@ Expected result: >99% accuracy after 10000 iterations I0717 21:06:58.701591 9965 solver.cpp:404] Test net output #0: accuracy = 0.9917 I0717 21:06:58.701642 9965 solver.cpp:404] Test net output #1: loss = 0.0269806 (* 1 = 0.0269806 loss) I0717 21:06:58.701668 9965 solver.cpp:322] Optimization Done. - + CIFAR-10 training ++++++++++++++++++ @@ -112,14 +112,14 @@ CIFAR-10 training Details on CIFAR-10 training can be found at this `link `_. 
Here are the basic instructions:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt - + Expected result: >70% accuracy after 4000 iterations :: - + I0727 18:29:35.248363 33 solver.cpp:279] Solving CIFAR10_quick I0727 18:29:35.248366 33 solver.cpp:280] Learning Rate Policy: fixed I0727 18:29:35.248883 33 solver.cpp:337] Iteration 0, Testing net (#0) @@ -134,7 +134,7 @@ Expected result: >70% accuracy after 4000 iterations I0727 18:30:13.722070 33 solver.cpp:404] Test net output #0: accuracy = 0.7124 I0727 18:30:13.722090 33 solver.cpp:404] Test net output #1: loss = 0.848089 (* 1 = 0.848089 loss) I0727 18:30:13.722095 33 solver.cpp:322] Optimization Done. - + CaffeNet inference +++++++++++++++++++ @@ -142,20 +142,20 @@ CaffeNet inference Details on CaffeNet inference can be found at this `link `_. Here are the basic instructions:: - + ./data/ilsvrc12/get_ilsvrc_aux.sh ./scripts/download_model_binary.py models/bvlc_reference_caffenet ./build/examples/cpp_classification/classification.bin models/bvlc_reference_caffenet/deploy.prototxt models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel data/ilsvrc12/imagenet_mean.binaryproto data/ilsvrc12/synset_words.txt examples/images/cat.jpg - + Expected result: (note the ordering and associated percentages) :: - + ---------- Prediction for examples/images/cat.jpg ---------- 0.3134 - "n02123045 tabby, tabby cat" 0.2380 - "n02123159 tiger cat" 0.1235 - "n02124075 Egyptian cat" 0.1003 - "n02119022 red fox, Vulpes vulpes" 0.0715 - "n02127052 lynx, catamount" - + diff --git a/Tutorial/rocncloc.rst b/Tutorial/rocncloc.rst index 81de8dc4..4ee137ec 100644 --- a/Tutorial/rocncloc.rst +++ b/Tutorial/rocncloc.rst @@ -11,21 +11,21 @@ ROCm With Harmony: Combining OpenCL, HCC, and HSA in a Single Program Introduction ************* -In a previous blog we discussed the different languages available on the ROCm platform. Here we’ll show you how to combine several of these languages in a single program: +In a previous blog we discussed the different languages available on the ROCm platform. Here we'll show you how to combine several of these languages in a single program: - * We’ll use an offline OpenCL™ compiler to compile the “BitonicSort” OpenCL kernel (from the AMD APP SDK) into a standard HSA code object (“hsaco”) format. - * The host code will employ HCC’s hc dialect for device discovery (ie hc::accelerator and hc::accelerator_view) and memory management (hc::array) - * The actual dispatch will use the low-level HSA Runtime calls. Recall that ROCR is an implementation of the HSA Runtime with extensions for multi-GPU configurations. We’ll show you how to extract HSA queue and agent structures from the HCC C++ ones, and then use them to perform the kernel launch. + * We'll use an offline OpenCL(TM) compiler to compile the "BitonicSort" OpenCL kernel (from the AMD APP SDK) into a standard HSA code object ("hsaco") format. + * The host code will employ HCC's hc dialect for device discovery (ie hc::accelerator and hc::accelerator_view) and memory management (hc::array) + * The actual dispatch will use the low-level HSA Runtime calls. Recall that ROCR is an implementation of the HSA Runtime with extensions for multi-GPU configurations. We'll show you how to extract HSA queue and agent structures from the HCC C++ ones, and then use them to perform the kernel launch. There are several reasons you might want to do something along these lines. 
First, many kernels exist in OpenCL and re-using this existing investment can save time. The OpenCL kernel language is widely-used, and it enables programmers to use advanced GPU features including local memory, rich math functions, and vector operations. But the OpenCL runtime can be verbose and the memory interface can be difficult to control and optimize. HCC provides the advantage of a full C++ runtime but also full control over the memory allocation and copies. Using the techniques we'll show you here, you can employ OpenCL kernels without having to port the host runtime code to OpenCL. This approach offers a significant advantage for larger C++ programs that can use a few optimized OpenCL kernels while sticking with C++ kernels and features for the rest of the program. hsaco : The Common Currency **************************** -Hsaco is informally pronounced “sock-o” (with a slight emphasis on the first letter to reflect the otherwise silent “h”). It's a standard ELF file ;`ELF `_ (“Executable and Linkable Format”) is a container format widely used in Linux to store object code, and the hsaco ELF container organization matches the one generated by the popular LLVM tool chain. Hsaco stores the compiled GCN code in the .text section, it optionally contains debug information, and it defines symbols that allow the host code to find the kernel entrypoints and functions. Like other ELF files, code objects can contain multiple kernels, functions, and data – so when using hsaco you will need to specify both the code object and the desired symbol. Refer to the `detailed description `_ of the hsaco format for more information. Many tools in AMD’s compiler chain generate and use the hsaco format including OpenCL, HCC, HIP, the GCN assembler and the HSAIL Finalizer. Kernel code contained in hsaco can be extracted and then launched onto the GPU.Additionally, the `dissembler tool `_ can disassemble hsaco files so you can see what is going on inside the kernel. In a future blog, we’ll talk about using the same techniques described here to assemble and then launch kernels written in GCN assembly. Essentially, hsaco is the interchange format used to pass code between these different tools, and allows code written in different languages to be used together. +Hsaco is informally pronounced "sock-o" (with a slight emphasis on the first letter to reflect the otherwise silent "h"). It's a standard ELF file ;`ELF `_ ("Executable and Linkable Format") is a container format widely used in Linux to store object code, and the hsaco ELF container organization matches the one generated by the popular LLVM tool chain. Hsaco stores the compiled GCN code in the .text section, it optionally contains debug information, and it defines symbols that allow the host code to find the kernel entrypoints and functions. Like other ELF files, code objects can contain multiple kernels, functions, and data - so when using hsaco you will need to specify both the code object and the desired symbol. Refer to the `detailed description `_ of the hsaco format for more information. Many tools in AMD's compiler chain generate and use the hsaco format including OpenCL, HCC, HIP, the GCN assembler and the HSAIL Finalizer. Kernel code contained in hsaco can be extracted and then launched onto the GPU.Additionally, the `dissembler tool `_ can disassemble hsaco files so you can see what is going on inside the kernel. 
In a future blog, we'll talk about using the same techniques described here to assemble and then launch kernels written in GCN assembly. Essentially, hsaco is the interchange format used to pass code between these different tools, and allows code written in different languages to be used together. Compiling an OpenCL Kernel into hsaco ************************************** -The Makefile shows the usage of the `CLOC `_ (CL Offline Compiler) tool to compile the CL kernel into the hsaco file. Here’s the relevant call to CLOC: /opt/rocm/cloc/bin/cloc.sh BitonicSort_Kernels.cl -o BitonicSort_Kernels.hsaco +The Makefile shows the usage of the `CLOC `_ (CL Offline Compiler) tool to compile the CL kernel into the hsaco file. Here's the relevant call to CLOC: /opt/rocm/cloc/bin/cloc.sh BitonicSort_Kernels.cl -o BitonicSort_Kernels.hsaco Using hsaco: ************ @@ -34,7 +34,7 @@ This example shows two methods for accessing the hsaco data from the host applic * Use a separate file and load it using C++ file I/O code. See the load_hsa_from_file() command. This path is enabled when p_loadKernelFromFile=true. * Serialize the code into a global string and thus directly link the hsaco into the executable. This approach avoids the need to find the hsaco file at runtime. This path is enabled when p_loadKernelFromFile=false. -The “load_hsa_code_object” shows the use of the standard HSA Runtime API calls to load the code object into memory and extract the pointer to the BitonicSort kernel. If we were working with an HSAIL or BRIG kernel we would first call the finalizer which would produce hsaco data, and the use these exact same finalizer APIs to load the hsaco into memory and find the desired symbols. This is a powerful and extremely useful concept that allows applications using the HSA Runtime to support either: +The "load_hsa_code_object" shows the use of the standard HSA Runtime API calls to load the code object into memory and extract the pointer to the BitonicSort kernel. If we were working with an HSAIL or BRIG kernel we would first call the finalizer which would produce hsaco data, and the use these exact same finalizer APIs to load the hsaco into memory and find the desired symbols. This is a powerful and extremely useful concept that allows applications using the HSA Runtime to support either: * An industry standard portable intermediate language (HSAIL/BRIG) that can be finalized to a vendor-specific binary, or * A standard ELF container that stores vendor-specific binary code (hsaco). This flavor supports vendor-specific ISA inside a standard container format, and still benefits from the standard HSA runtime API. Effectively this enables use cases where apps and tools can use the HSA Runtime APIs without using HSAIL, and still retain source code portability. @@ -46,7 +46,7 @@ The picture below shows the different steps in the code loading process, and in Making HCC Sing ****************** -The example uses the hc `C++ dialect `_ to select the default accelerator and queue. To launch the hsaco file we’ve created, we need to make HCC reveal the details of the HSA data structure that live under the covers. Here’s the critical piece of code that shows how to get from the HCC world to the HSA world using “hc::accelerator_view::get_hsa_queue”: +The example uses the hc `C++ dialect `_ to select the default accelerator and queue. To launch the hsaco file we've created, we need to make HCC reveal the details of the HSA data structure that live under the covers. 
Here's the critical piece of code that shows how to get from the HCC world to the HSA world using "hc::accelerator_view::get_hsa_queue": :: @@ -63,7 +63,7 @@ Now that we have an HSA queue we can use the low-level HSA runtime API to enqueu Extracting Data Pointers ************************* -The example under discussion uses hc::array<>to store the array of integers that are sorted. The original OpenCL kernel of course knows nothing of the hc::array<> data-type. Here’s the OpenCL kernel signature: +The example under discussion uses hc::array<>to store the array of integers that are sorted. The original OpenCL kernel of course knows nothing of the hc::array<> data-type. Here's the OpenCL kernel signature: :: @@ -76,11 +76,11 @@ When calling this kernel, the first parameter (theArray) is an 8-byte pointer. _inputAccPtr = _inputArray->;accelerator_pointer(); - + Our application is still responsible for ensuring that the data at this pointer is valid on the accelerator, before calling the kernel. In this case, the application copies from host data (allocated with malloc) to the inputArray. -The code also shows the use of hc’s accelerator memory interface to allocate and copy the data. This is an alternative to using hc::array<>, and can be select by setting p_useHcArray=false in the top of the source code. Here’s the relevant code snippet: +The code also shows the use of hc's accelerator memory interface to allocate and copy the data. This is an alternative to using hc::array<>, and can be select by setting p_useHcArray=false in the top of the source code. Here's the relevant code snippet: :: @@ -92,7 +92,7 @@ The code also shows the use of hc’s accelerator memory interface to allocate a We do not recommended usinge hc::array_view<> with the direct hsaco code launching techniques we are discussing here. hc::array_view<> is designed to automatically synchronize the data before and after parallel_for_each blocks are launched. Direct launching with HSA runtime APIs will not automatically synchronize hc::array_view<>. -Finally, HCC provides accessors that allow easy retrieval of the the HSA “regions” associated with an accelerator. The HSA runtime API uses regions to specify where memory on an agent is located - for example coarse-grain device memory or fine-grain system memory. When enumerating accelerators, HCC scans the supported regions for each underlying HSA agent and provides the following accessors: +Finally, HCC provides accessors that allow easy retrieval of the the HSA "regions" associated with an accelerator. The HSA runtime API uses regions to specify where memory on an agent is located - for example coarse-grain device memory or fine-grain system memory. When enumerating accelerators, HCC scans the supported regions for each underlying HSA agent and provides the following accessors: :: @@ -126,7 +126,7 @@ This example uses get_hsa_kernarg_region() to allocate memory for the kernel arg } ; - + /* @@ -144,7 +144,7 @@ This example uses get_hsa_kernarg_region() to allocate memory for the kernel arg assert(HSA_STATUS_SUCCESS == hsa_status); - + /* @@ -165,4 +165,4 @@ Summary We learned how to use offline compilation to convert an OpenCL kernel into a standard hsaco file and then employed the HSA Runtime API to launch that kernel from an HCC program. Harmony! In the future we'll look at how to optimize the HSA Runtime calls, and also how to use other tools to create hsaco files (such as the AMDGCN assembler). Stay tuned. 
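As a concrete recap of the kernarg handling described earlier in this article, the following minimal sketch allocates an argument block from a kernarg region and fills it before dispatch. It assumes the hsa_region_t has already been obtained from the accelerator's kernarg accessor; the argument struct shown here is a hypothetical layout matching an OpenCL kernel that takes one buffer and three uint parameters:

::

    #include <hsa.h>
    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    /* Hypothetical kernarg layout: it must mirror the kernel's formal
     * arguments, in order, starting at offset 0. */
    typedef struct {
        void     *theArray;     /* device pointer, e.g. from accelerator_pointer() */
        uint32_t  stage;
        uint32_t  passOfStage;
        uint32_t  direction;
    } sort_args_t;

    void *alloc_and_fill_kernargs(hsa_region_t kernarg_region, void *array_dev_ptr,
                                  uint32_t stage, uint32_t pass, uint32_t dir)
    {
        void *kernargs = NULL;

        /* Allocate the argument block from the kernarg region so the GPU can read it */
        hsa_status_t status = hsa_memory_allocate(kernarg_region,
                                                  sizeof(sort_args_t), &kernargs);
        assert(status == HSA_STATUS_SUCCESS);

        sort_args_t args;
        args.theArray    = array_dev_ptr;
        args.stage       = stage;
        args.passOfStage = pass;
        args.direction   = dir;
        memcpy(kernargs, &args, sizeof(args));

        /* The returned pointer is what goes into the AQL dispatch packet's
         * kernarg_address field. */
        return kernargs;
    }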
Reference: `GitHub Code for this example `_ -https://en.wikipedia.org/wiki/Bitonic_sorter +https://en.wikipedia.org/wiki/Bitonic_sorter diff --git a/_templates/breadcrumbs.html b/_templates/breadcrumbs.html index f7561ce9..7cd66af3 100644 --- a/_templates/breadcrumbs.html +++ b/_templates/breadcrumbs.html @@ -40,7 +40,7 @@ {% if display_github %} {% if check_meta and 'github_url' in meta %} - + `Documentation Feedback `_ {{ _('Edit on GitHub') }} {% else %} diff --git a/amdstyles.css b/amdstyles.css index 816ea836..02334498 100644 --- a/amdstyles.css +++ b/amdstyles.css @@ -5696,7 +5696,7 @@ fieldset[disabled] .navbar-inverse .btn-link:focus { } .breadcrumb > li + li:before { - content: "/ "; + content: "/ "; padding: 0 5px; color: #ccc; } @@ -7638,7 +7638,7 @@ table.spec-table tr.detail-view td { } /** - * @author: Dennis Hernández + * @author: Dennis Hernandez * @webSite: http://djhvscf.github.io/Blog * @version: v2.1.1 */ @@ -13081,7 +13081,7 @@ readers do not read off random characters that represent icons */ @font-face { font-family: "ProJP"; - src: local("ヒラギノ角ゴ Pro"); + src: local("?????? Pro"); font-stretch: condensed; font-size: 10%; } @@ -32096,7 +32096,7 @@ body.toolbar-fixed.toolbar-vertical.toolbar-tray-open.toolbar-fixed .toolbar-tra } .region-navigation #block-header-search-block .search-toggle-container .search-toggle .fa-search-toggle:before, .region-navigation #block-exposedformacquia-searchpage-2 .search-toggle-container .search-toggle .fa-search-toggle:before { - content: ""; + content: "?"; } .region-navigation #block-header-search-block .search-toggle-container .search-toggle .fa-search-toggle.active, .region-navigation #block-exposedformacquia-searchpage-2 .search-toggle-container .search-toggle .fa-search-toggle.active { @@ -38783,7 +38783,7 @@ fieldset[disabled] .navbar-inverse .btn-link:focus { } .breadcrumb > li + li:before { - content: "/ "; + content: "/ "; padding: 0 5px; color: #ccc; } @@ -40725,7 +40725,7 @@ table.spec-table tr.detail-view td { } /** - * @author: Dennis Hernández + * @author: Dennis Hernandez * @webSite: http://djhvscf.github.io/Blog * @version: v2.1.1 */ @@ -42039,7 +42039,7 @@ ul.field--name-field-game-information > li.field__item { } ul.field--name-field-game-information > li.field__item:before { - content: '✓'; + content: '?'; padding-right: 5px; } @@ -43355,7 +43355,7 @@ article.embedded-entity span.image-title { width: 100%; margin-top: 40px; } - + .page-node-type-product-page .product fieldset .fieldset-wrapper ul, .block-amd-support-product-spec .product fieldset .fieldset-wrapper ul, .page-node-type-product-page .product fieldset .fieldset-wrapper .field, .block-amd-support-product-spec .product fieldset .fieldset-wrapper .field { float: none; diff --git a/cleanup_text.sh b/cleanup_text.sh new file mode 100755 index 00000000..3649153d --- /dev/null +++ b/cleanup_text.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Script to clean up text files +# Lee Killough +# lee.killough@amd.com + +set -ex + +export PATH=/usr/bin:/bin + +# Go through the entire repository, excluding files normally excluded by Git +git ls-files -z --exclude-standard | while read -rd '' file; do + + # Operate only on regular files of MIME type text/* + if [[ -f "$file" && "$(file -b --mime-type "$file")" == text/* ]]; then + + # Remove editor backup files ending in ~ + if [[ "$file" = *~ ]]; then + git rm "$file" + continue + fi + + # Remove trailing whitespace at end of lines (also converts CR-LF to LF) + sed -i -e 's/[[:space:]]*$//' "$file" + + # Add missing newline to 
end of file + sed -i -e '$a\' "$file" + + # Convert UTF8 non-ASCII to ASCII + temp=$(mktemp) + iconv -s -f utf-8 -t ascii//TRANSLIT "$file" > "$temp" + chmod --reference="$file" "$temp" + mv -f "$temp" "$file" + + # Add the file to the index if it has changed + git add -u "$file" + fi +done + +cat<